In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor

### Helper Functions

In [2]:
def loadData(file):
    """Read the CSV at ``file`` and return it with ``Date`` parsed to datetimes.

    As a quick sanity check the raw shape and the number of distinct dates
    are printed before the frame is returned.
    """
    frame = pd.read_csv(file)
    print('Raw shape: ', frame.shape)

    # Parse the date strings once so later cells can use the .dt accessor.
    frame['Date'] = pd.to_datetime(frame.Date)

    distinct_days = set(frame.Date)
    print('Days: ', len(distinct_days))
    return frame

In [3]:
def getTimeSeries(df):
    """Pivot long-format counts into a (Date, Hour) x DOLocationID table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Date', 'Hour', 'DOLocationID' and
        'vehicle_count'.

    Returns
    -------
    pandas.DataFrame
        One row per (Date, Hour) pair and one column per DOLocationID;
        each cell is the summed 'vehicle_count', with 0 where a combination
        has no rows.
    """
    # The string 'sum' selects the same aggregation as np.sum without the
    # deprecation warning newer pandas emits for NumPy callables.
    table = pd.pivot_table(df, values='vehicle_count', index=['Date', 'Hour'],
                           columns=['DOLocationID'], aggfunc='sum', fill_value=0)
    return table

In [4]:
def zscoreNormalizeSpatial(matrix):
    """Z-score every row of a 2-D array independently.

    Each row has its own mean subtracted and is divided by its own
    (population) standard deviation plus 1e-10, so constant rows map to
    zeros instead of dividing by zero.

    Parameters
    ----------
    matrix : 2-D array-like of numbers

    Returns
    -------
    numpy.ndarray
        New float64 array of the same shape; the input is not modified.
    """
    # Work on a float64 copy: assigning z-scores back into an integer array
    # (as a plain .copy() of integer input would be) silently truncates them.
    m = np.array(matrix, dtype=np.float64)
    row_mean = m.mean(axis=1, keepdims=True)
    row_std = m.std(axis=1, keepdims=True)
    return (m - row_mean) / (row_std + 1e-10)

In [5]:
def standardize(matrix):
    """Fit a StandardScaler on a copy of ``matrix`` and scale that copy.

    Returns
    -------
    (scaler, scaled)
        The fitted scaler (kept so the transform can be inverted or applied
        to other data) and the transformed copy of ``matrix``.
    """
    working = matrix.copy()
    fitted = StandardScaler().fit(working)
    return fitted, fitted.transform(working)

In [6]:
def inverse_standardize(matrix, scaler):
    """Undo ``scaler``'s transform on ``matrix``.

    The scaler is handed a copy, so the caller's ``matrix`` is left untouched.
    """
    return scaler.inverse_transform(matrix.copy())

In [7]:
def getPCAFeatures(matrix, n=10):
    """Fit a PCA with ``n`` components on ``matrix`` and project ``matrix`` onto it.

    Returns
    -------
    (pca, scores)
        The fitted PCA object and a DataFrame of the projected data whose
        columns are named '1', '2', ... (strings, one per component). The
        frame gets a default index; callers attach their own if needed.
    """
    model = PCA(n_components=n)
    scores = model.fit(matrix).transform(matrix)

    named_columns = {}
    for position in range(scores.shape[1]):
        named_columns[str(position + 1)] = scores[:, position]

    return model, pd.DataFrame(named_columns)

In [8]:
def PCA_test(matrix, pca):
    """Project ``matrix`` with an already-fitted ``pca``.

    Returns a DataFrame whose columns are named '1', '2', ... (strings, one
    per output column of ``pca.transform``), with a default index.
    """
    scores = pca.transform(matrix)
    n_components = scores.shape[1]

    named_columns = {}
    for k in range(n_components):
        named_columns[str(k + 1)] = scores[:, k]

    return pd.DataFrame(named_columns)

In [9]:
def inverse_pca(matrix,pca):
    """Map component scores back to the original feature space via ``pca``.

    ``pca.inverse_transform`` receives a copy of ``matrix``, so the caller's
    object is not modified.
    """
    scores = matrix.copy()
    reconstructed = pca.inverse_transform(scores)
    return reconstructed

In [10]:
def addLag(dataset, maxlag, lagColumns=None):
    """Append lagged copies of selected columns to ``dataset``.

    For every lag ``l`` in 1..``maxlag`` and every column ``c`` in
    ``lagColumns`` a new column named ``'<c>_lag_<l>'`` is added that holds
    ``c`` shifted down by ``l`` rows.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows are assumed to be in time order; the shift is purely positional.
    maxlag : int
        Largest lag to add. With 0 no lag columns are added.
    lagColumns : list of column labels, optional
        Columns to lag. Defaults to all columns of ``dataset``.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the lag columns. Every row that
        contains a missing value is dropped, which removes the first
        ``maxlag`` rows (and any row that already had a NaN).
    """
    if lagColumns is None:
        lagColumns = list(dataset.columns)

    # Select once, outside the loop, so only the needed columns are shifted.
    source = dataset[lagColumns]

    frames = [dataset]
    for lag in range(1, maxlag + 1):
        shifted = source.shift(lag)
        # str(c) keeps this working for non-string labels (e.g. integer IDs).
        shifted.columns = [str(c) + '_lag_' + str(lag) for c in source.columns]
        frames.append(shifted)

    return pd.concat(frames, axis=1).dropna()

In [11]:
def get_rmse(matrix1, matrix2):
    """Return the root-mean-square error between ``matrix1`` and ``matrix2``.

    The element-wise differences are squared, averaged with ``np.mean`` and
    the square root of that mean is returned.
    """
    squared_error = np.power(matrix1 - matrix2, 2)
    mean_squared_error = np.mean(squared_error)
    return np.power(mean_squared_error, 0.5)

In [12]:
def pca_performance(trainmatrix,testmatrix, components):
    """Measure how well a PCA fitted on the training data reconstructs the test data.

    For each ``n`` in ``components`` a PCA with ``n`` components is fitted on
    the standardized training matrix; the standardized test matrix is
    projected, reconstructed, un-standardized and compared with
    ``testmatrix``.

    Parameters
    ----------
    trainmatrix, testmatrix : 2-D arrays with the same columns
    components : iterable of int
        Numbers of PCA components to evaluate.

    Returns
    -------
    list of float
        Variance-weighted R^2 of the reconstruction, one per entry of
        ``components``, in the same order.
    """
    # The scaling does not depend on the number of components: fit it once
    # instead of refitting inside the loop.
    scaler, s_train_matrix = standardize(trainmatrix)
    s_test_matrix = scaler.transform(testmatrix)

    r2List = []
    for n in components:
        pca, _ = getPCAFeatures(s_train_matrix, n=n)
        pcaTest = PCA_test(s_test_matrix, pca)

        # Back to standardized feature space, then to original units.
        network_prediction = inverse_pca(pcaTest, pca)
        network_prediction = inverse_standardize(network_prediction, scaler)

        r2Score = r2_score(testmatrix, network_prediction, multioutput='variance_weighted')
        r2List.append(r2Score)

    return r2List

In [13]:
def nonlinearperformance(trainmatrix,testmatrix,components, maxlag=12):
    """Score a lag-based random-forest forecast made in PCA space.

    For each ``n`` in ``components``: the standardized training matrix is
    reduced to ``n`` PCA components, ``maxlag`` lagged copies of every
    component are added, and a RandomForestRegressor is fitted to predict the
    current components from the lagged ones. Test-set predictions are mapped
    back through the inverse PCA and the inverse scaling and compared with
    ``testmatrix``.

    Parameters
    ----------
    trainmatrix, testmatrix : 2-D arrays with the same columns
        Rows are assumed to be consecutive time steps.
    components : iterable of int
        Numbers of PCA components to evaluate; each value is printed as
        progress output.
    maxlag : int, default 12
        Number of past rows used as predictors.

    Returns
    -------
    list of float
        Variance-weighted R^2 on the test rows that have a full lag history
        (``testmatrix[maxlag:]``), one per entry of ``components``.
    """
    # The scaling does not depend on the number of components: fit it once.
    scaler, s_train_matrix = standardize(trainmatrix)
    s_test_matrix = scaler.transform(testmatrix)

    r2List = []
    for n in components:
        print(n)

        pca, pcaTrain = getPCAFeatures(s_train_matrix, n=n)
        pcaTest = PCA_test(s_test_matrix, pca)

        DateColumns = ['Date', 'Hour']
        lagColumns = [c for c in pcaTrain.columns if c not in DateColumns]

        # addLag needs the columns to lag; the original call omitted them.
        dataset_train = addLag(pcaTrain, maxlag, lagColumns)
        dataset_test = addLag(pcaTest, maxlag, lagColumns)

        # Predictors are the lagged columns only; targets are the current components.
        X_train = dataset_train.drop(lagColumns, axis=1)
        X_test = dataset_test.drop(lagColumns, axis=1)
        y_train = dataset_train[lagColumns]

        rf2 = RandomForestRegressor(random_state=0, n_estimators=200,
                                    min_samples_split=10,
                                    min_samples_leaf=3,
                                    max_features='sqrt',
                                    max_depth=30,
                                    bootstrap=True)
        rf2.fit(X_train, y_train)

        # A single-target forest returns a 1-D prediction; keep it 2-D so the
        # inverse PCA also works for n == 1.
        pca_prediction = np.asarray(rf2.predict(X_test)).reshape(len(X_test), -1)

        network_prediction = inverse_pca(pca_prediction, pca)
        network_prediction = inverse_standardize(network_prediction, scaler)

        # addLag dropped the first maxlag rows, so skip them in the truth too.
        r2Score = r2_score(testmatrix[maxlag:], network_prediction,
                           multioutput='variance_weighted')

        r2List.append(r2Score)
    return r2List

#### Preparing Data

In [134]:
# Run configuration for this notebook.
hub = 'EWR'  # hub code; used to build the input and output file names
tune_hyp_params = False  # not referenced by any cell visible in this notebook
pca_comps = 24  # number of PCA components kept for the network representation

In [135]:
# NOTE(review): absolute, machine-specific path — point this at your own data directory.
dataDir = '/Users/hemingyi/Documents/UrbanTemporalNetworks/processedData/'
# 'Vehice' is the spelling used in the file name — presumably matches the file on disk; confirm before "fixing" it.
file = dataDir + hub + 'VehiceByHour.csv'

In [136]:
# file = '/home/urwa/Documents/Projects/NYU Remote/project/data/JfkVehiceByHour.csv'

In [137]:
# Read the hub's CSV; loadData prints the raw shape and the number of distinct dates.
data = loadData(file)

Raw shape:  (2260080, 4)
Days:  365


In [138]:
# Pivot to one row per (Date, Hour) and one column per DOLocationID.
# NOTE(review): this rebinds `data`, so the cell cannot be re-run without re-running the load cell first.
data = getTimeSeries(data)

In [139]:
# (rows, columns) of the pivoted table.
data.shape

(8760, 258)

In [140]:
# Plain float64 array of the pivoted counts, used as input for scaling and PCA.
matrix = data.values.astype(np.float64)

In [141]:
# Standardize the columns; the scaler is kept to map predictions back to original units later.
# NOTE(review): fitted on the full matrix, i.e. including the months later held out for testing.
scaler, s_matrix = standardize(matrix)

In [142]:
# Reduce the standardized matrix to pca_comps components; columns of pcaData are named '1', '2', ...
# NOTE(review): like the scaler, the PCA is fitted on all rows, including later test months.
pca,pcaData = getPCAFeatures(s_matrix,n=pca_comps)

In [143]:
# Re-attach the (Date, Hour) index of the pivoted table, then turn it back into ordinary columns.
pcaData.index = data.index
pcaData = pcaData.reset_index()

In [144]:
# Save the PCA scores next to the input data. The target directory is the
# same as dataDir, so reuse it instead of repeating the absolute path.
pcaData.to_csv(dataDir + '%spca%s.csv' % (hub.upper(), pca_comps),
               index=False)

In [145]:
# NOTE(review): absolute, machine-specific path — point this at your own copy of the external data.
externalDataDir = "/Users/hemingyi/Documents/UrbanTemporalNetworks/HongData/"
# One CSV per hub, named after the upper-cased hub code.
extFile = externalDataDir + hub.upper() + ".csv"

In [146]:
# Load the external per-hub CSV and preview its first rows.
extDf = pd.read_csv(extFile)
print(extDf.shape)
extDf.head(2)

(8760, 46)


Unnamed: 0,date,fhv,yellow,vehicle,ifmon,iftue,ifwed,ifthu,iffri,ifsat,...,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow,arrival
0,18/1/1 0:00,263,174,437,1,0,0,0,0,0,...,7,12.5,-21.2,52,0,0.0,0.0,0,0,24.0
1,18/1/1 1:00,138,133,271,1,0,0,0,0,0,...,7,12.5,-21.2,52,0,0.0,0.0,0,0,9.0


In [147]:
# Parse the 'date' strings (e.g. '18/1/1 0:00') with the year first, so '18/1/1' becomes 2018-01-01.
extDf['date'] = pd.to_datetime(extDf['date'], yearfirst=True)
extDf.head(2)

Unnamed: 0,date,fhv,yellow,vehicle,ifmon,iftue,ifwed,ifthu,iffri,ifsat,...,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow,arrival
0,2018-01-01 00:00:00,263,174,437,1,0,0,0,0,0,...,7,12.5,-21.2,52,0,0.0,0.0,0,0,24.0
1,2018-01-01 01:00:00,138,133,271,1,0,0,0,0,0,...,7,12.5,-21.2,52,0,0.0,0.0,0,0,9.0


In [148]:
# Derive the join keys and a day-of-week feature from the timestamp.
extDf['Hour'] = extDf['date'].dt.hour
extDf['Dow'] = extDf['date'].dt.dayofweek  # pandas convention: Monday=0 ... Sunday=6
extDf['Date'] = extDf['date'].dt.date  # plain date objects; converted back to datetimes before the merge

In [149]:
# List the available external columns before choosing a subset.
extDf.columns

Index(['date', 'fhv', 'yellow', 'vehicle', 'ifmon', 'iftue', 'ifwed', 'ifthu',
       'iffri', 'ifsat', 'ifsun', 'if0', 'if1', 'if2', 'if3', 'if4', 'if5',
       'if6', 'if7', 'if8', 'if9', 'if10', 'if11', 'if12', 'if13', 'if14',
       'if15', 'if16', 'if17', 'if18', 'if19', 'if20', 'if21', 'if22', 'if23',
       'maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd', 'cdd',
       'participation', 'newsnow', 'snowdepth', 'ifSnow', 'arrival', 'Hour',
       'Dow', 'Date'],
      dtype='object')

In [150]:
# Join keys (Date, Hour) plus the external features to keep; the per-weekday and
# per-hour indicator columns (ifmon..ifsun, if0..if23) are left out.
selected_columns = ['Date', 'Hour', 'Dow', 'arrival','maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd',
       'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow']

In [151]:
# Keep only the selected columns. Take an explicit copy so that later column
# assignments on extDf act on an independent frame instead of a slice of the
# original (which would raise SettingWithCopyWarning).
extDf = extDf[selected_columns].copy()

In [152]:
# Both frames should have the same number of rows before merging.
print(pcaData.shape)
print(extDf.shape)

(8760, 26)
(8760, 14)


In [153]:
# Make the 'Date' key a datetime column in both frames so the merge keys have matching dtypes.
pcaData['Date'] = pd.to_datetime(pcaData['Date'])
extDf['Date'] = pd.to_datetime(extDf['Date'])

In [154]:
# Inner join on (Date, Hour): rows without a partner in the other frame are dropped,
# so the printed shape shows whether any rows were lost.
pcaData = pd.merge(pcaData,extDf, on=['Date', 'Hour'], how='inner')
print(pcaData.shape)

(8760, 38)


In [155]:
# Column layout after the merge: keys, PCA components, then external features.
pcaData.columns

Index(['Date', 'Hour', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
       '24', 'Dow', 'arrival', 'maxtemp', 'mintemp', 'avgtemp', 'departure',
       'hdd', 'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow'],
      dtype='object')

In [156]:
# The PCA component columns are named '1' .. str(pca_comps); derive the list
# from pca_comps so it stays in sync with the configuration.
targetColumns = [str(i) for i in range(1, pca_comps + 1)]

# Columns that get lagged copies: every PCA component plus the 'arrival' series.
lagColumns = targetColumns + ['arrival']
# lagColumns = ['1', '2', '3', 'arrival']

# Non-feature column(s) removed from the predictors before fitting.
DateColumns = ['Date']

In [157]:
# Number of past rows appended as extra feature columns.
maxlag = 12

# Adds '<col>_lag_1' .. '<col>_lag_<maxlag>' for every column in lagColumns and
# drops the rows left incomplete by the shift.
pcaData_lag = addLag(pcaData, maxlag, lagColumns)

pcaData_lag.shape

(8748, 338)

In [158]:
# Leave-one-month-out evaluation: for each calendar month, fit on all other
# months and test on that month. Scores and residuals are collected per fold.
CommR2List = []        # R2 in PCA-component space, one per month
EdgeR2List = []        # R2 after mapping predictions back to the original columns
residualDf_list = []   # per-month residuals in PCA-component space
rawList = []           # never filled in this cell

for m in range(1,13):
    print()

    print("month: ",m)
    # Boolean mask over pcaData_lag rows that fall in month m.
    month_index  = pd.to_datetime(pcaData_lag.Date).dt.month == m

    dataset_train = pcaData_lag[~month_index]
    dataset_test = pcaData_lag[month_index]
    print("Train Size: ",dataset_train.shape)
    print("Test Size: ",dataset_test.shape)


    # Predictors: everything except the current PCA components and the date,
    # i.e. Hour, Dow, the external features and all lag columns.
    X_train = dataset_train.drop(targetColumns+DateColumns , axis = 1)
    X_test = dataset_test.drop(targetColumns+DateColumns , axis = 1)
    y_train = dataset_train[targetColumns]
    y_test = dataset_test[targetColumns]



    # A fresh forest per fold; the fixed random_state keeps runs repeatable.
    rf2 = RandomForestRegressor(random_state = 2019, n_estimators=150, 
                               min_samples_split=3,
                               min_samples_leaf= 2, 
                               max_features= 'sqrt',
                               max_depth= None, 
                               bootstrap= False)

    rf2.fit(X_train,y_train)

    print("Train R2: ",rf2.score(X_train,y_train))
    test_r2 = rf2.score(X_test,y_test)
    print("Test R2: ",test_r2)


    pca_prediction = rf2.predict(X_test)

    # Residuals in PCA-component space, labelled with the test rows' Date and Hour.
    residual = y_test - pca_prediction
    residual_df = dataset_test[['Date','Hour']]
    residual_df = pd.concat([residual_df,pd.DataFrame(residual)], axis =1)

    # Map predictions back to the original columns using the PCA and scaler
    # fitted earlier on the full matrix.
    network_prediction = inverse_pca(pca_prediction,pca)

    network_prediction = inverse_standardize(network_prediction, scaler)

    # `data` still has the rows that addLag removed, so pad the mask with maxlag
    # leading False entries. Assumes exactly the first maxlag rows were
    # dropped — TODO confirm if the inputs can contain NaN.
    edgeMonthIndex = [False] * maxlag + list(month_index)
    edge_r2 = r2_score(data[edgeMonthIndex], network_prediction, multioutput='variance_weighted')
    print("Edge R2: ",edge_r2)


    CommR2List.append(test_r2)
    EdgeR2List.append(edge_r2)
    residualDf_list.append(residual_df)
#     rawList.append()


month:  1
Train Size:  (8016, 338)
Test Size:  (732, 338)
Train R2:  0.7862261629385332
Test R2:  0.054884631971513484
Edge R2:  0.01736495469004738

month:  2
Train Size:  (8076, 338)
Test Size:  (672, 338)
Train R2:  0.7856854438691951
Test R2:  0.03946819189453797
Edge R2:  0.012366864224609428

month:  3
Train Size:  (8004, 338)
Test Size:  (744, 338)
Train R2:  0.787656000951012
Test R2:  0.02037542217941881
Edge R2:  0.015026476152940215

month:  4
Train Size:  (8028, 338)
Test Size:  (720, 338)
Train R2:  0.7891314965797694
Test R2:  0.024589467587183025
Edge R2:  0.019474267971746503

month:  5
Train Size:  (8004, 338)
Test Size:  (744, 338)
Train R2:  0.7871351366512753
Test R2:  0.04101898313163828
Edge R2:  0.021293560788685384

month:  6
Train Size:  (8028, 338)
Test Size:  (720, 338)
Train R2:  0.7904767458924148
Test R2:  0.012212570911572377
Edge R2:  0.01374040609392579

month:  7
Train Size:  (8004, 338)
Test Size:  (744, 338)
Train R2:  0.7872408635028922
Test R2:  0

In [159]:
# Average of the per-month scores collected by the loop: PCA-component space first, original columns second.
print(np.mean(CommR2List))
print(np.mean(EdgeR2List))

0.030555864948277927
0.016903956456758008


In [160]:
# Stack the per-month residual frames into one table (rows keep their index from pcaData_lag).
res_df = pd.concat(residualDf_list, axis = 0)
print(res_df.shape)
res_df.head()

(8748, 26)


Unnamed: 0,Date,Hour,1,2,3,4,5,6,7,8,...,15,16,17,18,19,20,21,22,23,24
12,2018-01-01,12,0.428823,-0.279002,0.09568,0.092272,-1.149118,-0.393432,0.647156,0.189843,...,0.655442,1.201777,-0.265939,2.096713,-2.68059,0.07428,1.306719,-1.936028,1.101629,0.424078
13,2018-01-01,13,1.377351,-0.386969,-0.050576,0.32585,-0.650234,0.246586,0.357306,-1.081569,...,-0.333504,0.827116,-0.445108,-0.999067,0.582147,-0.557379,-0.142279,-0.269715,-0.851183,0.774503
14,2018-01-01,14,6.566749,0.206498,0.612108,2.013561,-0.157944,0.936609,-1.448901,0.587427,...,-1.723026,-1.389468,-0.377597,-1.022335,-0.624486,1.862095,-0.862569,1.339937,-1.664589,-3.666789
15,2018-01-01,15,0.604799,-0.518688,0.631916,0.973558,-1.694343,-0.427569,0.866404,-0.263501,...,1.000947,0.791932,2.446072,-0.504981,0.413774,0.520218,2.076542,0.372014,-0.395276,0.340551
16,2018-01-01,16,4.132373,8.114114,21.372266,2.007565,6.393181,-8.454324,-3.636599,-1.072241,...,-3.382328,0.424063,-0.46273,-3.011151,1.718864,-5.170183,2.243777,-4.905481,0.73497,0.706662


In [161]:
# Write the residuals; the component count in the file name comes from pca_comps
# instead of a hard-coded 24, so the name always matches the run configuration.
# NOTE(review): the row index is written as an unnamed first column — confirm this is intended.
res_df.to_csv('/Users/hemingyi/Documents/UrbanTemporalNetworks/Resid/%sPCA%sRFCV.csv' % (hub.upper(), pca_comps))