In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the per-hour vehicle counts from the processed-data CSV into a DataFrame.
JFKridership = pd.read_csv('../../processedData/JFKVehicleByHour.csv')

In [3]:
# Preview the first rows of the table as loaded (long format).
JFKridership.head()

Unnamed: 0,DOLocationID,Date,Hour,vehicle_count
0,1,2018-01-01,0,1.0
1,2,2018-01-01,0,0.0
2,3,2018-01-01,0,0.0
3,4,2018-01-01,0,1.0
4,5,2018-01-01,0,0.0


In [4]:
# Pivot to a wide table: one row per (Date, Hour), one column per DOLocationID,
# each cell holding the summed vehicle_count (0 where the pivot has no value).
# aggfunc is the string 'sum' rather than np.sum: pandas resolves both to the same
# grouped sum, but passing the NumPy callable is deprecated in recent versions.
JFKridership = pd.pivot_table(
    JFKridership,
    values='vehicle_count',
    index=['Date', 'Hour'],
    columns=['DOLocationID'],
    aggfunc='sum',
    fill_value=0,
)

In [5]:
# Preview the pivoted (wide) table.
JFKridership.head()

Unnamed: 0_level_0,DOLocationID,1,2,3,4,5,6,7,8,9,10,...,254,255,256,257,258,259,260,261,262,263
Date,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-01-01,0,1,0,0,1,0,0,5,0,0,7,...,0,2,4,0,0,2,3,0,5,6
2018-01-01,1,0,0,1,1,0,0,1,0,1,4,...,0,2,2,1,1,0,2,0,0,1
2018-01-01,2,0,0,0,1,0,0,1,0,0,1,...,0,0,1,0,1,0,1,0,2,0
2018-01-01,3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,2,0,0,0,0,1
2018-01-01,4,0,0,0,0,0,0,0,0,0,3,...,0,0,1,0,1,0,0,0,0,0


In [6]:
# standardization
def standardize(matrix):
    """Fit a StandardScaler on a copy of ``matrix`` and scale that copy.

    Returns a ``(scaler, scaled)`` pair: the fitted scaler, so the scaling can
    be undone later, and the transformed data.
    """
    data = matrix.copy()
    fitted = StandardScaler().fit(data)
    return fitted, fitted.transform(data)
def inverse_standardize(matrix, scaler):
    """Map ``matrix`` back through ``scaler.inverse_transform``.

    The scaler is handed a copy of ``matrix`` rather than the original object.
    """
    return scaler.inverse_transform(matrix.copy())

In [7]:
# dimension reduction
def getLinearPCAFeatures(matrix, n):
    """Fit a PCA with ``n`` components on ``matrix`` and project it.

    Returns ``(pca, reducedDf)``: the fitted PCA object and a DataFrame with one
    column per component, named '1', '2', ... The frame is built from the bare
    arrays, so it carries a default index rather than the index of ``matrix``.
    """
    pca = PCA(n_components=n).fit(matrix)
    components = pca.transform(matrix)

    # One named column per principal component (1-based names).
    reducedDf = pd.DataFrame(
        {str(pos): column for pos, column in enumerate(components.T, start=1)}
    )
    return pca, reducedDf
def inversePCA(matrix,pca):
    """Map component scores back through ``pca.inverse_transform``.

    ``pca`` is handed a copy of ``matrix`` rather than the original object.
    """
    return pca.inverse_transform(matrix.copy())

In [8]:
# Standardize the pivoted table; the fitted scaler is kept so predictions can be
# mapped back to the original scale later.
standardscaler, matrix = standardize(JFKridership)

In [9]:
# Number of components kept; reused below for the kernel PCA and the column names.
pca_comps = 5
# Fit the linear PCA on the standardized data and get the component scores as a DataFrame.
linearPCA,linearPCAData = getLinearPCAFeatures(matrix,n=pca_comps)

In [10]:
# minimize the MSE between reversed PCA matrix and raw matrix
def reverse_MSE(estimator, X, y=None):
    """Score a fitted transformer by its reconstruction error (higher is better).

    ``X`` is transformed, mapped back with ``inverse_transform``, and the mean
    squared error against the original ``X`` is returned negated, so a search
    that maximizes the score minimizes the reconstruction error.

    ``y`` is accepted only so the function matches the ``(estimator, X, y)``
    scorer call signature; it is ignored.
    """
    reconstructed = estimator.inverse_transform(estimator.transform(X))
    return -1 * mean_squared_error(X, reconstructed)

In [11]:
# Grid search over kernel type and gamma, scored by (negated) reconstruction MSE.
param_grid = [
    {
        "gamma": np.linspace(0.01, 20, num=10),
        "kernel": ["rbf", "sigmoid", "poly"],
    }
]
# fit_inverse_transform=True so that inverse_transform is available on the fitted model.
kpca = KernelPCA(n_components=pca_comps, fit_inverse_transform=True, n_jobs=-1)
grid_search = GridSearchCV(estimator=kpca, param_grid=param_grid, scoring=reverse_MSE, cv=3)
grid_search.fit(matrix)

GridSearchCV(cv=3, error_score=nan,
             estimator=KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3,
                                 eigen_solver='auto',
                                 fit_inverse_transform=True, gamma=None,
                                 kernel='linear', kernel_params=None,
                                 max_iter=None, n_components=5, n_jobs=-1,
                                 random_state=None, remove_zero_eig=False,
                                 tol=0),
             iid='deprecated', n_jobs=None,
             param_grid=[{'gamma': array([1.00000000e-02, 2.23111111e+00, 4.45222222e+00, 6.67333333e+00,
       8.89444444e+00, 1.11155556e+01, 1.33366667e+01, 1.55577778e+01,
       1.77788889e+01, 2.00000000e+01]),
                          'kernel': ['rbf', 'sigmoid', 'poly']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=<function reverse_MSE at 0x7fbfdc315a70>, verbose=0)

In [12]:
# Show the estimator (kernel and gamma) selected by the grid search.
grid_search.best_estimator_

KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
          fit_inverse_transform=True, gamma=0.01, kernel='sigmoid',
          kernel_params=None, max_iter=None, n_components=5, n_jobs=-1,
          random_state=None, remove_zero_eig=False, tol=0)

In [13]:
def getNonLinearPCAFeatures(transformer, matrix):
    """Project ``matrix`` with ``transformer.transform`` and tabulate the result.

    No fitting happens here; only ``transform`` is called. Returns
    ``(transformer, reducedDf)``, where ``reducedDf`` has one column per output
    dimension, named '1', '2', ...
    """
    projected = transformer.transform(matrix)
    columns = {}
    for idx in range(projected.shape[1]):
        columns[str(idx + 1)] = projected[:, idx]
    return transformer, pd.DataFrame(columns)

In [14]:
# Project the standardized data with the best kernel PCA found by the grid search.
nonLinearPCA, nonLinearPCAData = getNonLinearPCAFeatures(grid_search.best_estimator_, matrix)

In [15]:
# add lagged copies (lags 1..maxlag) of the selected columns
def addLag(dataset, maxlag, lagColumns):
    """Append lagged copies of ``lagColumns`` to ``dataset``.

    For every lag k in 1..maxlag the frame is shifted down by k rows and the
    selected columns are added as '<column>_lag_<k>'. Rows containing any
    missing value after the concatenation are dropped. The input frame is not
    modified; a new frame is returned.
    """
    def _lagged(step):
        # Shift the whole frame, keep only the requested columns, then rename them.
        shifted = dataset.shift(step)[lagColumns]
        shifted.columns = [name + '_lag_' + str(step) for name in shifted.columns]
        return shifted

    frames = [dataset] + [_lagged(step) for step in range(1, maxlag + 1)]
    return pd.concat(frames, axis=1).dropna()

In [16]:
# Component columns are named '1' .. str(pca_comps); they serve both as the
# columns to lag and as the prediction targets.
lagColumns = list(map(str, range(1, pca_comps + 1)))
DateColumns = ['Date', 'Hour']
targetColumns = list(map(str, range(1, pca_comps + 1)))

In [17]:
# Add lags 1..12 of every component column to both PCA feature tables.
linearPCAData_lag = addLag(linearPCAData, 12, lagColumns)
nonlinearPCAData_lag = addLag(nonLinearPCAData, 12, lagColumns)

In [18]:
# linear PCA model training
# Features are the lagged columns; targets are the current component values.
# errors='ignore': the PCA feature frame is built from the component columns only,
# so 'Date'/'Hour' may be absent and a strict drop would raise KeyError.
X = linearPCAData_lag.drop(columns=lagColumns + DateColumns, errors='ignore')
y = linearPCAData_lag[lagColumns]

In [19]:
# train on the first 11 months, test on the last month
# (positional split: rows before 8004 train, rows from 8004 on test)
X_train = X.iloc[:8004]
X_test = X.iloc[8004:]
y_train = y.iloc[:8004]
y_test = y.iloc[8004:]

In [20]:
# Random forest predicting the current component values from their lagged values.
rf = RandomForestRegressor(random_state = 2019, n_estimators=150,
                           min_samples_split=3,
                           min_samples_leaf= 2,
                           max_features= 'sqrt',
                           max_depth= None,
                           bootstrap= False)
rf.fit(X_train,y_train)
print("Train R2: ",rf.score(X_train,y_train))
test_r2 = rf.score(X_test,y_test)
print("Test R2: ",test_r2)
pca_prediction = rf.predict(X_test)

# Map the predictions back to taxi-zone space: undo the PCA, then the standardization.
taxizone_prediction = inversePCA(pca_prediction,linearPCA)
taxizone_prediction = inverse_standardize(taxizone_prediction, standardscaler)

# The test rows are the final rows of the data, so the ground truth is the equally
# long tail of the raw pivot table. Its length comes from the prediction instead of
# a hard-coded 744, so it cannot drift out of sync with the train/test split.
taxizone_actual = JFKridership.iloc[len(JFKridership) - len(taxizone_prediction):]
print("Taxi zone leve test R2: ", r2_score(taxizone_actual, taxizone_prediction, multioutput='variance_weighted'))

Train R2:  0.9875490589417484
Test R2:  0.7950724466386395
Taxi zone leve test R2:  0.5505232677097203


In [21]:
# non linear PCA model training

In [22]:
# Features are the lagged columns; targets are the current kernel-PCA component values.
# errors='ignore': the PCA feature frame is built from the component columns only,
# so 'Date'/'Hour' may be absent and a strict drop would raise KeyError.
X = nonlinearPCAData_lag.drop(columns=lagColumns + DateColumns, errors='ignore')
y = nonlinearPCAData_lag[lagColumns]
# Positional split: rows before 8004 train, rows from 8004 on test.
X_train,X_test = X.iloc[:8004], X.iloc[8004:]
y_train,y_test = y.iloc[:8004], y.iloc[8004:]

In [23]:
# Random forest predicting the current kernel-PCA components from their lagged values.
rf = RandomForestRegressor(random_state = 2019, n_estimators=150,
                           min_samples_split=3,
                           min_samples_leaf= 2,
                           max_features= 'sqrt',
                           max_depth= None,
                           bootstrap= False)
rf.fit(X_train,y_train)
print("Train R2: ",rf.score(X_train,y_train))
test_r2 = rf.score(X_test,y_test)
print("Test R2: ",test_r2)
pca_prediction = rf.predict(X_test)

# Map the predictions back to taxi-zone space: undo the kernel PCA, then the standardization.
taxizone_prediction = inversePCA(pca_prediction,grid_search.best_estimator_)
taxizone_prediction = inverse_standardize(taxizone_prediction, standardscaler)

# The test rows are the final rows of the data, so the ground truth is the equally
# long tail of the raw pivot table. Its length comes from the prediction instead of
# a hard-coded 744, so it cannot drift out of sync with the train/test split.
taxizone_actual = JFKridership.iloc[len(JFKridership) - len(taxizone_prediction):]
print("Taxi zone leve test R2: ", r2_score(taxizone_actual, taxizone_prediction, multioutput='variance_weighted'))

Train R2:  0.9895053705297316
Test R2:  0.7983319267852139
Taxi zone leve test R2:  0.474134435314079
