In [45]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import TruncatedSVD
import numpy as np
import matplotlib.pyplot as plt

from scipy import sparse

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit

from sklearn.multioutput import MultiOutputRegressor
import warnings
warnings.filterwarnings('ignore')

### Helper Functions

In [2]:
def loadData(file):
    data = pd.read_csv(file)
    print('Raw shape: ',data.shape)
    data['Date'] = pd.to_datetime(data.Date)
    print('Days: ',len(set(data.Date)))
    return data

In [3]:
def getTimeSeries(df):
    table = pd.pivot_table(df, values='vehicle_count', index=['Date','Hour','Min'],
                    columns=['DOLocationID'], aggfunc=np.sum, fill_value=0)
    return table

In [4]:
def zscoreNormalizeSpatial(matrix):
    m = matrix.copy()
    for i in range(m.shape[0]):
        m[i, :] = (m[i, :] - m[i, :].mean()) / (m[i, :].std()+1e-10)
        
    return m

In [5]:
def standardize(matrix):
    m = matrix.copy()
    scaler = StandardScaler()
    scaler.fit(m)
    t = scaler.transform(m)
    return scaler, t
def inverse_standardize(matrix, scaler):
    t = matrix.copy()
    return scaler.inverse_transform(t)


In [33]:
# dimension reduction

def getSVDFeatures(transformer, matrix):
    reducedMatrix = transformer.transform(matrix)
    reducedDict = {str(i+1):reducedMatrix[:,i] for i in range(reducedMatrix.shape[1])}
    reducedDf = pd.DataFrame(reducedDict)
    return reducedDf
# reconverse the PCA component back to raw dataset's dimentsion
def inverse_transformer(matrix,transformer):
    m = matrix.copy()
    return transformer.inverse_transform(m)

In [7]:
def addLag(dataset, maxlag, lagColumns):
    dataset_list = [dataset]

    for l in range(1, maxlag+1):
        df = dataset.shift(l)
        df = df[lagColumns]
        df.columns = [c+'_lag_'+str(l) for c in df.columns]
        dataset_list.append(df)

    dataset = pd.concat(dataset_list, axis=1).dropna()
    return dataset

In [8]:
# minimize the MSE between reversed PCA matrix and raw matrix
def reverse_MSE(estimator, X):
    X_reduced = estimator.transform(X)
    X_reverse = estimator.inverse_transform(X_reduced)
    return -1 * mean_squared_error(X, X_reverse)

#### Preparing Data

In [9]:
hub = 'JFK'
tune_hyp_params = False
pca_comps = 6
granularity = 30
granularity = str(granularity)+'Min'

In [10]:
dataDir = '/home/mingyi/Dropbox/UrbanTemporalNetworks/processedData/'
file = dataDir + hub + 'VehicleBy'+granularity+'.csv'

In [11]:
# file = '/home/urwa/Documents/Projects/NYU Remote/project/data/JfkVehiceByHour.csv'

In [12]:
data = loadData(file)

Raw shape:  (1931102, 5)
Days:  365


In [13]:
data = getTimeSeries(data)

In [14]:
data.shape

(17518, 259)

In [15]:
data

Unnamed: 0_level_0,Unnamed: 1_level_0,DOLocationID,1,2,3,4,5,6,7,8,9,10,...,254,255,256,257,258,259,260,261,262,263
Date,Hour,Min,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2018-01-01,0,0,0,0,0,1,0,0,3,0,0,4,...,0,2,3,0,0,1,2,0,4,6
2018-01-01,0,30,1,0,0,0,0,0,2,0,0,3,...,0,0,1,0,0,1,1,0,1,0
2018-01-01,1,0,0,0,1,1,0,0,1,0,1,2,...,0,1,1,0,1,0,1,0,0,1
2018-01-01,1,30,0,0,0,0,0,0,0,0,0,2,...,0,1,1,1,0,0,1,0,0,0
2018-01-01,2,0,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-31,21,30,0,0,0,1,0,0,9,0,0,3,...,0,7,2,5,2,0,1,1,5,9
2018-12-31,22,0,0,0,2,5,0,0,0,0,1,7,...,2,3,3,2,1,1,0,0,9,4
2018-12-31,22,30,0,0,1,0,0,0,2,0,0,5,...,0,4,5,2,1,1,0,3,4,5
2018-12-31,23,0,0,0,2,1,0,0,0,0,1,9,...,0,6,0,3,3,0,1,0,2,4


In [16]:
matrix = data.values.astype(np.float64)

In [17]:
scaler, s_matrix = standardize(matrix)

In [18]:
int(s_matrix.shape[0]*0.7)

12262

## SVD

In [35]:
# comparing in-sample SVD reconstruction performance
pca_comps = 6
svd = TruncatedSVD(n_components=pca_comps, n_iter=7, random_state=42)
svd.fit(s_matrix)
train_matrix = svd.transform(s_matrix)
train_matrix_re = inverse_transformer(train_matrix,svd)
r2_score(s_matrix,train_matrix_re,multioutput='variance_weighted')

0.23868181273799857

In [36]:
# comparing out-of-sample SVD reconstruction performance
svd = TruncatedSVD(n_components=pca_comps, n_iter=7, random_state=42)
train_size = int(s_matrix.shape[0]*0.7)
svd.fit(s_matrix[:train_size])
test_matrix = svd.transform(s_matrix[train_size:])
test_matrix_re = inverse_transformer(test_matrix,svd)
r2_score(s_matrix[train_size:],test_matrix_re,multioutput='variance_weighted')

0.24853967229786578

In [37]:
# comparing out-of-sample SVD inverse standardize reconstruction performance
svd = TruncatedSVD(n_components=pca_comps, n_iter=7, random_state=42)
train_size = int(matrix.shape[0]*0.7)
scaler_train, s_matrix_train = standardize(matrix[:train_size])
s_matrix_test = scaler_train.transform(matrix[train_size:])
svd.fit(s_matrix_train)
test_matrix = svd.transform(s_matrix_test)
test_matrix_re = inverse_transformer(test_matrix,svd)
test_matrix_re_re = inverse_standardize(test_matrix_re,scaler_train)
r2_score(matrix[train_size:],test_matrix_re_re,multioutput='variance_weighted')

0.5522144484059988

## PCA

In [39]:
# comparing in-sample PCA reconstruction performance
pca = PCA(n_components=pca_comps)
pca.fit(s_matrix)
train_matrix = pca.transform(s_matrix)
train_matrix_re = inverse_transformer(train_matrix,pca)
r2_score(s_matrix,train_matrix_re,multioutput='variance_weighted')

0.2386639388145758

In [40]:
# comparing out-of-sample PCA reconstruction performance
pca = PCA(n_components=pca_comps)
train_size = int(s_matrix.shape[0]*0.7)
pca.fit(s_matrix[:train_size])
test_matrix = pca.transform(s_matrix[train_size:])
test_matrix_re = inverse_transformer(test_matrix,svd)
r2_score(s_matrix[train_size:],test_matrix_re,multioutput='variance_weighted')

0.23273404350361954

In [42]:
# comparing out-of-sample SVD inverse standardize reconstruction performance
pca = PCA(n_components=pca_comps)
train_size = int(matrix.shape[0]*0.7)
scaler_train, s_matrix_train = standardize(matrix[:train_size])
s_matrix_test = scaler_train.transform(matrix[train_size:])
pca.fit(s_matrix_train)
test_matrix = pca.transform(s_matrix_test)
test_matrix_re = inverse_transformer(test_matrix,pca)
test_matrix_re_re = inverse_standardize(test_matrix_re,scaler_train)
r2_score(matrix[train_size:],test_matrix_re_re,multioutput='variance_weighted')

0.552239970122122

## nonlinear PCA

In [49]:
train_size = int(matrix.shape[0]*0.8)
val_size = int(train_size*0.2)

val_fold = list(-1*np.ones(train_size-val_size)) + list(np.zeros(val_size))

ps = PredefinedSplit(val_fold)

param_grid = [{'n_components':[pca_comps]
    "gamma": np.linspace(0.01, 10, 3),
        "kernel": ["linear", "sigmoid"]
}]
# fit_inverse_transform=True to make sure inverse transform available
kpca=KernelPCA(fit_inverse_transform=True,n_jobs=-1) 
pca_grid_search = GridSearchCV(kpca, param_grid, cv=ps, scoring=reverse_MSE)
pca_grid_search.fit(s_matrix)

kpca_best = pca_grid_search.best_estimator_

train_matrix = kpca_best.transform(s_matrix)
train_matrix_re = inverse_transformer(train_matrix,kpca_best)
r2_score(s_matrix,train_matrix_re,multioutput='variance_weighted')

0.9999999953550651

In [51]:
# out-of-sample
pca = kpca_best
train_size = int(s_matrix.shape[0]*0.7)
pca.fit(s_matrix[:train_size])
test_matrix = pca.transform(s_matrix[train_size:])
test_matrix_re = inverse_transformer(test_matrix,pca)
r2_score(s_matrix[train_size:],test_matrix_re,multioutput='variance_weighted')

0.9997148955117166

In [52]:
# comparing out-of-sample SVD inverse standardize reconstruction performance
pca = kpca_best
train_size = int(matrix.shape[0]*0.7)
scaler_train, s_matrix_train = standardize(matrix[:train_size])
s_matrix_test = scaler_train.transform(matrix[train_size:])
pca.fit(s_matrix_train)
test_matrix = pca.transform(s_matrix_test)
test_matrix_re = inverse_transformer(test_matrix,pca)
test_matrix_re_re = inverse_standardize(test_matrix_re,scaler_train)
r2_score(matrix[train_size:],test_matrix_re_re,multioutput='variance_weighted')

0.9999999846325396