In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
import numpy as np
import matplotlib.pyplot as plt

from scipy import sparse

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
import warnings
warnings.filterwarnings('ignore')

### Helper Functions

In [2]:
def loadData(file):
    """Read a CSV into a DataFrame and parse its ``Date`` column.

    Prints the raw shape and the number of distinct dates as a quick sanity check.

    Parameters
    ----------
    file : str or file-like
        Anything accepted by ``pandas.read_csv``; the data must have a ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with ``Date`` converted by ``pandas.to_datetime``.
    """
    frame = pd.read_csv(file)
    print('Raw shape: ',frame.shape)
    frame['Date'] = pd.to_datetime(frame.Date)
    distinct_days = set(frame.Date)
    print('Days: ',len(distinct_days))
    return frame

In [3]:
def getTimeSeries(df):
    """Pivot long-format counts into a wide time-series table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Date``, ``Hour``, ``Min``, ``DOLocationID``
        and ``vehicle_count``.

    Returns
    -------
    pandas.DataFrame
        Indexed by (Date, Hour, Min) with one column per ``DOLocationID``; each cell is
        the summed ``vehicle_count``, and combinations absent from ``df`` are 0.
    """
    table = pd.pivot_table(
        df,
        values='vehicle_count',
        index=['Date', 'Hour', 'Min'],
        columns=['DOLocationID'],
        # The string alias is the supported spelling; passing np.sum is deprecated in
        # recent pandas and is mapped to the same grouped sum.
        aggfunc='sum',
        fill_value=0,
    )
    return table

In [4]:
def zscoreNormalizeSpatial(matrix):
    """Z-score every row of a 2-D array independently.

    Each row has its own mean subtracted and is divided by its own (population)
    standard deviation; ``1e-10`` is added to the divisor so constant rows yield
    zeros instead of a division by zero.

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D array; it is not modified.

    Returns
    -------
    numpy.ndarray
        Array of the same shape. Non-floating input is promoted to ``float64``
        so the normalised values are not truncated back to integers.
    """
    m = matrix.copy()
    # Writing z-scores into an integer array would silently truncate them.
    if not np.issubdtype(m.dtype, np.floating):
        m = m.astype(np.float64)
    row_mean = m.mean(axis=1, keepdims=True)
    row_std = m.std(axis=1, keepdims=True)
    return (m - row_mean) / (row_std + 1e-10)

In [5]:
def standardize(matrix):
    """Fit a ``StandardScaler`` on a copy of ``matrix`` and return it with the scaled data.

    Parameters
    ----------
    matrix : array-like with a ``copy()`` method
        Data to scale; the caller's object is not passed to the scaler directly.

    Returns
    -------
    tuple
        ``(scaler, transformed)`` where ``scaler`` is the fitted ``StandardScaler`` and
        ``transformed`` is the result of ``scaler.transform`` on the copied data.
    """
    working = matrix.copy()
    fitted = StandardScaler().fit(working)
    return fitted, fitted.transform(working)
def inverse_standardize(matrix, scaler):
    """Undo a scaling by calling ``scaler.inverse_transform`` on a copy of ``matrix``.

    Parameters
    ----------
    matrix : array-like with a ``copy()`` method
        Scaled values.
    scaler : object
        Fitted object exposing ``inverse_transform``.

    Returns
    -------
    Whatever ``scaler.inverse_transform`` returns for the copied input.
    """
    scaled_copy = matrix.copy()
    restored = scaler.inverse_transform(scaled_copy)
    return restored


In [54]:
# dimension reduction
def getPCAFeatures(transformer, matrix):
    """Project ``matrix`` with a fitted transformer and return the components as a DataFrame.

    Parameters
    ----------
    transformer : object
        Fitted object whose ``transform(matrix)`` returns a 2-D array.
    matrix : array-like
        Input handed straight to ``transformer.transform``.

    Returns
    -------
    pandas.DataFrame
        One column per output component, labelled with the strings '1', '2', ...
    """
    components = transformer.transform(matrix)
    # Iterating the transpose yields one component column at a time.
    by_label = {str(position): column
                for position, column in enumerate(components.T, start=1)}
    return pd.DataFrame(by_label)
# map PCA components back to the raw dataset's dimension
def inverse_pca(matrix,pca):
    """Return ``pca.inverse_transform`` applied to a copy of ``matrix``.

    Parameters
    ----------
    matrix : array-like with a ``copy()`` method
        Values in the reduced (component) space.
    pca : object
        Fitted transformer exposing ``inverse_transform``.
    """
    reduced_copy = matrix.copy()
    reconstructed = pca.inverse_transform(reduced_copy)
    return reconstructed

In [7]:
def addLag(dataset, maxlag, lagColumns):
    """Append lagged copies of selected columns and drop incomplete rows.

    For every lag ``k`` in ``1..maxlag`` and every column ``c`` in ``lagColumns`` a
    new column ``"<c>_lag_<k>"`` holds the value of ``c`` from ``k`` rows earlier.
    Lags are positional (row shifts); the function does not look at timestamps.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source frame, expected to already be in chronological order.
    maxlag : int
        Largest lag to generate; ``0`` adds no columns.
    lagColumns : list
        Labels of the columns to lag.

    Returns
    -------
    pandas.DataFrame
        ``dataset`` plus the lag columns, with every row containing a NaN in any
        column removed (this includes the first ``maxlag`` rows).
    """
    # Select once: shifting only the lag columns avoids re-shifting the whole frame per lag.
    lag_source = dataset[lagColumns]
    frames = [dataset]
    for lag in range(1, maxlag + 1):
        # add_suffix formats each label, so non-string column labels work as well.
        frames.append(lag_source.shift(lag).add_suffix('_lag_' + str(lag)))

    return pd.concat(frames, axis=1).dropna()

In [8]:
# minimize the MSE between the reconstructed (inverse-transformed) matrix and the raw matrix
def reverse_MSE(estimator, X, y=None):
    """Score a transformer by its reconstruction error (higher is better).

    Parameters
    ----------
    estimator : object
        Fitted transformer exposing ``transform`` and ``inverse_transform``.
    X : array-like
        Data to project and reconstruct.
    y : ignored
        Accepted so the function matches the ``scorer(estimator, X, y)`` call shape
        used by scikit-learn's search utilities; it is never read.

    Returns
    -------
    float
        The negated mean squared error between ``X`` and its reconstruction.
    """
    X_reduced = estimator.transform(X)
    X_reverse = estimator.inverse_transform(X_reduced)
    # Negated because model-selection tools maximise the score.
    return -1 * mean_squared_error(X, X_reverse)

#### Preparing Data

In [59]:
# Run configuration: hub to model, hyper-parameter tuning switch, number of PCA components.
hub = 'JFK'
tune_hyp_params = False
pca_comps = 6
# Minutes per time slot, turned into the label used in file names (e.g. '30Min').
granularity = 30
granularity = '%sMin' % granularity

In [60]:
# Machine-specific absolute location of the processed per-hub CSVs.
dataDir = '/home/mingyi/Dropbox/UrbanTemporalNetworks/processedData/'
# Input file name pattern: <hub>VehicleBy<granularity>.csv
file = ''.join([dataDir, hub, 'VehicleBy', granularity, '.csv'])

In [61]:
# file = '/home/urwa/Documents/Projects/NYU Remote/project/data/JfkVehiceByHour.csv'

In [62]:
# Load the raw CSV; loadData prints its shape and the number of distinct dates.
data = loadData(file)

Raw shape:  (1931102, 5)
Days:  365


In [63]:
# Pivot to one row per (Date, Hour, Min) and one column per DOLocationID.
# NOTE: this rebinds `data`; re-running this cell on its own would pass the
# already-pivoted table back into getTimeSeries.
data = getTimeSeries(data)

In [64]:
# (number of time slots, number of drop-off location columns) after pivoting
data.shape

(17518, 259)

In [65]:
# Display the pivoted count table.
data

Unnamed: 0_level_0,Unnamed: 1_level_0,DOLocationID,1,2,3,4,5,6,7,8,9,10,...,254,255,256,257,258,259,260,261,262,263
Date,Hour,Min,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2018-01-01,0,0,0,0,0,1,0,0,3,0,0,4,...,0,2,3,0,0,1,2,0,4,6
2018-01-01,0,30,1,0,0,0,0,0,2,0,0,3,...,0,0,1,0,0,1,1,0,1,0
2018-01-01,1,0,0,0,1,1,0,0,1,0,1,2,...,0,1,1,0,1,0,1,0,0,1
2018-01-01,1,30,0,0,0,0,0,0,0,0,0,2,...,0,1,1,1,0,0,1,0,0,0
2018-01-01,2,0,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-31,21,30,0,0,0,1,0,0,9,0,0,3,...,0,7,2,5,2,0,1,1,5,9
2018-12-31,22,0,0,0,2,5,0,0,0,0,1,7,...,2,3,3,2,1,1,0,0,9,4
2018-12-31,22,30,0,0,1,0,0,0,2,0,0,5,...,0,4,5,2,1,1,0,3,4,5
2018-12-31,23,0,0,0,2,1,0,0,0,0,1,9,...,0,6,0,3,3,0,1,0,2,4


In [66]:
# Plain float64 array of the pivoted counts (rows: time slots, columns: locations).
matrix = data.values.astype(np.float64)

In [67]:
# Standardise the columns; keep the fitted scaler so predictions can be mapped back later.
scaler, s_matrix = standardize(matrix)

In [68]:
# Sigmoid-kernel PCA down to `pca_comps` components. fit_inverse_transform=True is
# required because the predictions are later mapped back with inverse_transform.
# NOTE(review): the transformer is fitted on every row of s_matrix, i.e. before any
# train/test split is made.
pca = KernelPCA(
    n_components=pca_comps,
    kernel='sigmoid',
    gamma=0.01,
    degree=3,
    coef0=1,
    kernel_params=None,
    alpha=1.0,
    fit_inverse_transform=True,
    eigen_solver='auto',
    tol=0,
    max_iter=None,
    remove_zero_eig=False,
    random_state=None,
    copy_X=True,
    n_jobs=-1,
)
pca = pca.fit(s_matrix)

In [69]:
# Project the standardised matrix; columns are the component labels '1'..str(pca_comps).
pcaData = getPCAFeatures(pca,s_matrix)

In [70]:
# Give the component rows the (Date, Hour, Min) keys of the pivoted table, then turn
# those keys into ordinary columns.
pcaData = pcaData.set_index(data.index).reset_index()

# Persist the keyed components for reuse outside this notebook.
pcaData.to_csv(
    '../../processedData/%spca%s.csv'%(hub.upper(),pca_comps),
    index=False,
)

In [71]:
# Machine-specific absolute location of the external feature files, one folder per granularity.
externalDataDir = ''.join(["/home/mingyi/Dropbox/UrbanTemporalNetworks/HongData/", granularity, '/'])
# External file name pattern: <HUB>.csv
extFile = ''.join([externalDataDir, hub.upper(), ".csv"])

In [72]:
# Load the external features for this hub and preview the first two rows.
extDf = pd.read_csv(extFile)
print(extDf.shape)
extDf.head(2)

(17520, 48)


Unnamed: 0,Date,Hour,Min,arrival,fhv,yellow,vehicle,ifmon,iftue,ifwed,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01,0,0,3.0,263,174,437,1,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
1,2018-01-01,0,30,3.0,263,174,437,1,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0


In [73]:
# Parse the Date strings (year-first) into datetimes so they can be used as a join key.
extDf['Date'] = pd.to_datetime(extDf['Date'], yearfirst=True)
extDf.head(2)

Unnamed: 0,Date,Hour,Min,arrival,fhv,yellow,vehicle,ifmon,iftue,ifwed,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01,0,0,3.0,263,174,437,1,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
1,2018-01-01,0,30,3.0,263,174,437,1,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0


In [74]:
# List the available external columns before choosing a subset.
extDf.columns

Index(['Date', 'Hour', 'Min', 'arrival', 'fhv', 'yellow', 'vehicle', 'ifmon',
       'iftue', 'ifwed', 'ifthu', 'iffri', 'ifsat', 'ifsun', 'if0', 'if1',
       'if2', 'if3', 'if4', 'if5', 'if6', 'if7', 'if8', 'if9', 'if10', 'if11',
       'if12', 'if13', 'if14', 'if15', 'if16', 'if17', 'if18', 'if19', 'if20',
       'if21', 'if22', 'if23', 'maxtemp', 'mintemp', 'avgtemp', 'departure',
       'hdd', 'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow'],
      dtype='object')

In [75]:
# External columns to keep: the join keys, the arrival count and the weather fields.
selected_columns = [
    'Date', 'Hour', 'Min',
    'arrival',
    'maxtemp', 'mintemp', 'avgtemp', 'departure',
    'hdd', 'cdd',
    'participation', 'newsnow', 'snowdepth', 'ifSnow',
]

In [76]:
# Keep only the selected columns. Take an explicit copy so the later column
# assignments on extDf act on an independent frame rather than on a slice
# (which pandas reports as SettingWithCopy).
extDf = extDf[selected_columns].copy()

In [77]:
# Compare row counts of the two frames before joining them.
print(pcaData.shape)
print(extDf.shape)

(17518, 9)
(17520, 14)


In [78]:
# Make the join key datetime64 on both frames.
pcaData['Date'] = pd.to_datetime(pcaData['Date'])
# normalize() zeroes the time of day directly, without a round trip through
# Python date objects.
extDf['Date'] = pd.to_datetime(extDf['Date']).dt.normalize()

In [79]:
# Preview of the left join on (Date, Hour, Min); the result is only displayed, not stored.
pcaData.merge(extDf, on=['Date', 'Hour','Min'], how='left').sort_values(by=['Date','Hour','Min'])

Unnamed: 0,Date,Hour,Min,1,2,3,4,5,6,arrival,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01,0,0,-0.120959,0.347501,0.213932,0.044442,-0.002795,0.003428,3.0,18,7,12.5,-21.2,52,0,0.00,0.0,0,0
1,2018-01-01,0,30,-0.286251,0.250147,0.076283,0.030015,-0.076420,-0.028383,3.0,18,7,12.5,-21.2,52,0,0.00,0.0,0,0
2,2018-01-01,1,0,-0.345333,0.359496,-0.011247,0.155441,-0.004146,-0.163189,4.0,18,7,12.5,-21.2,52,0,0.00,0.0,0,0
3,2018-01-01,1,30,-0.578451,0.116671,0.040158,0.061510,-0.056087,-0.103786,2.0,18,7,12.5,-21.2,52,0,0.00,0.0,0,0
4,2018-01-01,2,0,-0.650860,0.093662,0.083709,0.053547,-0.050376,-0.071312,1.0,18,7,12.5,-21.2,52,0,0.00,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17513,2018-12-31,21,30,0.526168,0.352591,0.166252,-0.048757,-0.064496,-0.041423,7.0,49,30,39.5,5.6,25,0,1.23,0.0,0,0
17514,2018-12-31,22,0,0.317982,0.349276,0.070934,0.004123,-0.000938,0.058431,6.0,49,30,39.5,5.6,25,0,1.23,0.0,0,0
17515,2018-12-31,22,30,0.398356,0.398542,0.111605,0.010626,-0.016762,-0.081417,8.0,49,30,39.5,5.6,25,0,1.23,0.0,0,0
17516,2018-12-31,23,0,0.274428,0.464677,0.080985,0.144379,0.103649,-0.023449,2.0,49,30,39.5,5.6,25,0,1.23,0.0,0,0


In [80]:
# Attach the external features to every PCA row (a left join keeps all pcaData rows),
# then order the result chronologically.
pcaData = (
    pcaData
    .merge(extDf, how='left', on=['Date', 'Hour','Min'])
    .sort_values(by=['Date','Hour','Min'])
)
print(pcaData.shape)

(17518, 20)


In [81]:
# Prediction targets: the PCA component columns '1'..str(pca_comps).
targetColumns = list(map(str, range(1, pca_comps + 1)))

# Key columns identifying a time slot; dropped from the model features.
DateColumns = ['Date','Hour','Min']

# Columns that receive lagged copies: every component plus the arrival count.
lagColumns = targetColumns + ['arrival']

In [82]:
# Display the merged frame (components plus external features).
pcaData

Unnamed: 0,Date,Hour,Min,1,2,3,4,5,6,arrival,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01,0,0,-0.120959,0.347501,0.213932,0.044442,-0.002795,0.003428,3.0,18,7,12.5,-21.2,52,0,0.00,0.0,0,0
1,2018-01-01,0,30,-0.286251,0.250147,0.076283,0.030015,-0.076420,-0.028383,3.0,18,7,12.5,-21.2,52,0,0.00,0.0,0,0
2,2018-01-01,1,0,-0.345333,0.359496,-0.011247,0.155441,-0.004146,-0.163189,4.0,18,7,12.5,-21.2,52,0,0.00,0.0,0,0
3,2018-01-01,1,30,-0.578451,0.116671,0.040158,0.061510,-0.056087,-0.103786,2.0,18,7,12.5,-21.2,52,0,0.00,0.0,0,0
4,2018-01-01,2,0,-0.650860,0.093662,0.083709,0.053547,-0.050376,-0.071312,1.0,18,7,12.5,-21.2,52,0,0.00,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17513,2018-12-31,21,30,0.526168,0.352591,0.166252,-0.048757,-0.064496,-0.041423,7.0,49,30,39.5,5.6,25,0,1.23,0.0,0,0
17514,2018-12-31,22,0,0.317982,0.349276,0.070934,0.004123,-0.000938,0.058431,6.0,49,30,39.5,5.6,25,0,1.23,0.0,0,0
17515,2018-12-31,22,30,0.398356,0.398542,0.111605,0.010626,-0.016762,-0.081417,8.0,49,30,39.5,5.6,25,0,1.23,0.0,0,0
17516,2018-12-31,23,0,0.274428,0.464677,0.080985,0.144379,0.103649,-0.023449,2.0,49,30,39.5,5.6,25,0,1.23,0.0,0,0


In [83]:
# Number of rows that span 12 hours at the configured granularity
# (12 * 60 minutes divided by the minutes per slot parsed from e.g. '30Min').
maxlag = int(12 * 60 / int(granularity.replace('Min','')))

# Add lags 1..maxlag of the component and arrival columns; rows with any NaN are dropped.
pcaData_lag = addLag(pcaData, maxlag, lagColumns)

pcaData_lag.shape

(17494, 188)

In [84]:
# Leave-one-month-out evaluation: for each calendar month, fit a random forest on the
# rows of all other months, predict the held-out month's PCA components, and map the
# predictions back to per-location counts.
CommR2List = []        # held-out R2 in PCA-component space, one entry per month
EdgeR2List = []        # held-out R2 in the original per-location space, one entry per month
residualDf_list = []   # per-month residual frames (Date, Hour + component residuals)
rawList = []
# Per-month prediction frames; concatenated once after the loop instead of growing a
# DataFrame inside it.
network_prediction_frames = []

# Month of every lagged row; identical for all folds, so compute it once.
row_month = pd.to_datetime(pcaData_lag.Date).dt.month

for m in range(1,13):
    print()

    print("month: ",m)
    month_index = row_month == m

    dataset_train = pcaData_lag[~month_index]
    dataset_test = pcaData_lag[month_index]
    print("Train Size: ",dataset_train.shape)
    print("Test Size: ",dataset_test.shape)

    # Features are everything except the current components and the key columns.
    X_train = dataset_train.drop(targetColumns+DateColumns , axis = 1)
    X_test = dataset_test.drop(targetColumns+DateColumns , axis = 1)
    y_train = dataset_train[targetColumns]
    y_test = dataset_test[targetColumns]

    rf2 = RandomForestRegressor(random_state = 2019, n_estimators=150,
                               min_samples_split=3,
                               min_samples_leaf= 2,
                               max_features= 'sqrt',
                               max_depth= None,
                               bootstrap= False)

    rf2.fit(X_train,y_train)

    # Score on the same dense frames used for fitting; the features are dense, so
    # wrapping them in a CSR matrix only adds a conversion.
    print("Train R2: ",rf2.score(X_train,y_train))
    test_r2 = rf2.score(X_test,y_test)
    print("Test R2: ",test_r2)

    pca_prediction = rf2.predict(X_test)

    # NOTE(review): only Date and Hour are kept as keys; 'Min' is not included, so
    # sub-hourly rows share the same key values — confirm this is intended.
    residual = y_test - pca_prediction
    residual_df = pd.concat([dataset_test[['Date','Hour']], pd.DataFrame(residual)], axis =1)

    # Back to the original space: undo the PCA projection, then the standardisation.
    network_prediction = inverse_pca(pca_prediction,pca)
    network_prediction = inverse_standardize(network_prediction, scaler)

    # The reconstructed values are used as-is; no clipping or ReLU is applied, so any
    # negative predicted counts are kept.
    network_prediction_df = pd.DataFrame(network_prediction, columns=data.columns)
    network_prediction_frames.append(network_prediction_df)

    # `data` still has the first `maxlag` rows that the lag step removed from
    # pcaData_lag, so pad the month mask with that many leading False entries. This
    # assumes those leading rows are the only ones the lag step dropped.
    edgeMonthIndex = [False] * maxlag + list(month_index)
    edge_r2 = r2_score(data[edgeMonthIndex], network_prediction )
    print("Edge R2: ",edge_r2)

    CommR2List.append(test_r2)
    EdgeR2List.append(edge_r2)
    residualDf_list.append(residual_df)

# Stack the monthly predictions in month order; each month keeps its own 0-based index.
networkPrediction = pd.concat(network_prediction_frames)


month:  1
Train Size:  (16030, 188)
Test Size:  (1464, 188)
Train R2:  0.9870966518499985
Test R2:  0.7397776462580026
Edge R2:  0.15114709953044506

month:  2
Train Size:  (16150, 188)
Test Size:  (1344, 188)
Train R2:  0.987017985516557
Test R2:  0.747752422137109
Edge R2:  0.15168553626020698

month:  3
Train Size:  (16008, 188)
Test Size:  (1486, 188)
Train R2:  0.9869637286817732
Test R2:  0.7941678347624082
Edge R2:  0.1663212629767465

month:  4
Train Size:  (16054, 188)
Test Size:  (1440, 188)
Train R2:  0.987067744258504
Test R2:  0.7802747176522953
Edge R2:  0.1649149460073631

month:  5
Train Size:  (16006, 188)
Test Size:  (1488, 188)
Train R2:  0.9868901778798285
Test R2:  0.7960005820075914
Edge R2:  0.17520539748544584

month:  6
Train Size:  (16054, 188)
Test Size:  (1440, 188)
Train R2:  0.9869371722913928
Test R2:  0.7977122814361552
Edge R2:  0.16750485141221455

month:  7
Train Size:  (16006, 188)
Test Size:  (1488, 188)
Train R2:  0.9870103123559326
Test R2:  0.77

In [88]:
# Display the stacked per-location predictions for all held-out months.
networkPrediction

DOLocationID,1,2,3,4,5,6,7,8,9,10,...,254,255,256,257,258,259,260,261,262,263
0,0.467007,0.001387,0.135654,0.788588,0.016853,0.042501,1.992472,0.010084,0.251975,2.746580,...,0.177290,3.254401,1.925666,0.528062,0.747936,0.194188,0.568124,1.656112,1.777147,2.342203
1,0.496619,0.001359,0.148969,0.941622,0.018339,0.048458,2.314803,0.012005,0.286393,3.063131,...,0.192543,3.820523,2.284400,0.616357,0.807786,0.215879,0.627682,1.868572,2.104597,2.794056
2,0.552435,0.001365,0.168119,1.106370,0.020873,0.056450,2.680822,0.014253,0.332124,3.458290,...,0.216465,4.495035,2.686951,0.717449,0.889863,0.247838,0.703425,2.183719,2.474121,3.292610
3,0.583054,0.001396,0.176926,1.227481,0.022309,0.061040,2.928313,0.015726,0.357525,3.688727,...,0.226099,4.987013,2.978170,0.789492,0.922696,0.264689,0.749167,2.406220,2.746778,3.663108
4,0.667407,0.001484,0.200204,1.382868,0.025601,0.070361,3.294380,0.017819,0.412923,4.183858,...,0.257913,5.682538,3.373946,0.891975,1.030179,0.305701,0.844035,2.791285,3.098455,4.132913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1483,0.542951,0.001002,0.241590,1.550780,0.027268,0.070032,3.737074,0.018453,0.471442,5.001691,...,0.311163,5.680105,3.669063,0.984868,1.355446,0.345185,0.971981,2.326754,3.241363,4.427534
1484,0.515332,0.000976,0.235014,1.474460,0.026137,0.066289,3.572406,0.017557,0.454290,4.868810,...,0.304213,5.360389,3.484147,0.938279,1.334898,0.332569,0.935088,2.170208,3.068115,4.191412
1485,0.462179,0.000920,0.218348,1.308690,0.023530,0.059250,3.217525,0.015454,0.411715,4.478467,...,0.283633,4.663871,3.076921,0.835365,1.264043,0.302472,0.862425,1.830685,2.691797,3.684833
1486,0.463645,0.000928,0.217009,1.277369,0.023531,0.058730,3.159303,0.015097,0.406356,4.406742,...,0.282088,4.565506,3.007185,0.818763,1.253318,0.300392,0.855242,1.811979,2.630191,3.597464


In [89]:
# Observed counts with the first `maxlag` rows skipped, shown for comparison with the predictions.
data.reset_index().iloc[maxlag:]

DOLocationID,Date,Hour,Min,1,2,3,4,5,6,7,...,254,255,256,257,258,259,260,261,262,263
24,2018-01-01,12,0,0,0,0,0,0,0,2,...,0,4,0,1,2,0,1,1,3,5
25,2018-01-01,12,30,1,0,0,2,0,0,4,...,0,1,3,0,0,0,1,1,4,8
26,2018-01-01,13,0,0,0,0,1,1,0,1,...,0,3,1,0,0,0,3,1,1,1
27,2018-01-01,13,30,1,0,0,1,0,0,2,...,0,3,1,3,0,0,0,1,7,3
28,2018-01-01,14,0,1,0,0,2,0,1,4,...,0,8,10,0,1,0,0,3,5,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17513,2018-12-31,21,30,0,0,0,1,0,0,9,...,0,7,2,5,2,0,1,1,5,9
17514,2018-12-31,22,0,0,0,2,5,0,0,0,...,2,3,3,2,1,1,0,0,9,4
17515,2018-12-31,22,30,0,0,1,0,0,0,2,...,0,4,5,2,1,1,0,3,4,5
17516,2018-12-31,23,0,0,0,2,1,0,0,0,...,0,6,0,3,3,0,1,0,2,4


In [43]:
# Label each prediction row with the Date/Hour of the observed row at the same position
# (observed rows start after the first `maxlag` slots). Values are assigned by position.
# NOTE(review): 'Min' is not written, so sub-hourly rows share the same (Date, Hour) —
# confirm this is intended.
networkPrediction['Date'] = data.index.get_level_values('Date')[maxlag:].values
networkPrediction['Hour'] = data.index.get_level_values('Hour')[maxlag:].values
networkPrediction.to_csv(
    '/home/mingyi/Dropbox/UrbanTemporalNetworks/prediction/%sPCA%sComp%s.csv'%(hub,pca_comps,granularity),
    index=False,
)

In [90]:
# Average held-out R2 across months: first in PCA-component space, then in the
# original per-location space.
print(np.mean(CommR2List))
print(np.mean(EdgeR2List))

0.7798155583935585
0.16654425971215614


In [45]:
# Stack the per-month residual frames and write them out for later analysis.
res_df = pd.concat(residualDf_list, axis = 0)
res_df.to_csv('../../Resid/%sPCA%sComp%s.csv'%(hub,pca_comps,granularity),index=False)