In [55]:
import pandas as pd
import geopandas as gpd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor
import warnings
warnings.filterwarnings('ignore')

### Helper Functions

In [2]:
def loadData(file):
    """Read an hourly vehicle-count CSV and parse its ``Date`` column.

    Prints the raw shape and the number of distinct dates as a quick sanity
    check on the file.

    Parameters
    ----------
    file : str or file-like
        Anything ``pandas.read_csv`` accepts; the table must have a ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        The table with ``Date`` converted by ``pandas.to_datetime``.
    """
    frame = pd.read_csv(file)
    print('Raw shape: ', frame.shape)
    frame['Date'] = pd.to_datetime(frame['Date'])
    distinct_days = set(frame['Date'])
    print('Days: ', len(distinct_days))
    return frame

In [109]:
def getTimeSeries(df,zones):
    """Pivot long-format trip counts into a (Date, Hour) x zone table.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Date``, ``Hour``, ``DOLocationID`` and ``vehicle_count``.
    zones : iterable
        Drop-off zone ids to keep. Rows with any other ``DOLocationID`` are discarded.

    Returns
    -------
    pandas.DataFrame
        Indexed by (Date, Hour), one column per zone in ascending order.
        Counts are summed per cell; zones with no rows in ``df`` get an all-zero column.
    """
    in_zone = df.loc[df['DOLocationID'].isin(zones)]
    # The string 'sum' avoids the pandas 2.x deprecation warning raised for aggfunc=np.sum.
    table = pd.pivot_table(in_zone, values='vehicle_count', index=['Date', 'Hour'],
                           columns=['DOLocationID'], aggfunc='sum', fill_value=0)
    # One reindex adds every missing zone (filled with 0) and sorts the columns,
    # instead of inserting the missing columns one at a time.
    all_columns = sorted(set(table.columns).union(zones))
    return table.reindex(columns=all_columns, fill_value=0)

In [4]:
def zscoreNormalizeSpatial(matrix):
    """Z-score every row of a 2-D array across its columns.

    Each row has its own mean subtracted and is divided by its own standard
    deviation plus 1e-10, so a constant row maps to zeros instead of dividing by zero.

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D array. It is not modified.

    Returns
    -------
    numpy.ndarray
        Floating-point array of the same shape.
    """
    m = np.array(matrix, copy=True)
    # Writing z-scores into an integer array would truncate them, so promote first.
    if not np.issubdtype(m.dtype, np.floating):
        m = m.astype(np.float64)
    row_mean = m.mean(axis=1, keepdims=True)
    row_std = m.std(axis=1, keepdims=True)
    return (m - row_mean) / (row_std + 1e-10)

In [5]:
def standardize(matrix):
    """Fit a ``StandardScaler`` on ``matrix`` and return it with the scaled data.

    Parameters
    ----------
    matrix : array-like of shape (n_samples, n_features)
        Left unmodified; ``StandardScaler`` copies its input by default.

    Returns
    -------
    scaler : StandardScaler
        The fitted scaler, for use with ``inverse_standardize``.
    t : numpy.ndarray
        ``matrix`` with every column scaled to zero mean and unit variance.
    """
    scaler = StandardScaler()
    t = scaler.fit_transform(matrix)
    return scaler, t

In [6]:
def inverse_standardize(matrix, scaler):
    """Map standardized values back to the original scale with a fitted scaler.

    The scaler receives a copy of ``matrix``, so the caller's object is never
    handed to ``scaler.inverse_transform`` directly.
    """
    return scaler.inverse_transform(matrix.copy())

In [7]:
def getPCAFeatures(matrix, n=10):
    """Fit a PCA with ``n`` components on ``matrix`` and project ``matrix`` onto it.

    Returns
    -------
    pca : PCA
        The fitted model.
    reducedDf : pandas.DataFrame
        One column per component, named '1', '2', ... in component order,
        on a default integer index.
    """
    pca = PCA(n_components=n)
    pca.fit(matrix)
    scores = pca.transform(matrix)
    # Transposing iterates the score matrix one component (column) at a time.
    named_scores = {str(rank): column for rank, column in enumerate(scores.T, start=1)}
    return pca, pd.DataFrame(named_scores)

In [8]:
def PCA_test(matrix, pca):
    """Project ``matrix`` with an already-fitted ``pca``.

    Returns a DataFrame with one column per component, named '1', '2', ...
    in component order, on a default integer index.
    """
    projected = pca.transform(matrix)
    by_component = {}
    for position in range(projected.shape[1]):
        by_component[str(position + 1)] = projected[:, position]
    return pd.DataFrame(by_component)

In [9]:
def inverse_pca(matrix,pca):
    """Reconstruct data in the original feature space from component scores.

    ``pca.inverse_transform`` is called on a copy of ``matrix``.
    """
    return pca.inverse_transform(matrix.copy())

In [10]:
def addLag(dataset, maxlag, lagColumns):
    """Append lagged copies of selected columns and drop incomplete rows.

    For every lag ``k`` in 1..maxlag, each column ``c`` in ``lagColumns`` is
    shifted down by ``k`` rows and added as ``c_lag_k``. Lags are counted in
    rows, not in time units. Any row containing a NaN in any column is then
    dropped, which includes the first ``maxlag`` rows.

    Parameters
    ----------
    dataset : pandas.DataFrame
    maxlag : int
    lagColumns : list of str

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the lag columns, grouped by lag.
    """
    def _lagged(step):
        shifted = dataset[lagColumns].shift(step)
        shifted.columns = [name + '_lag_' + str(step) for name in shifted.columns]
        return shifted

    pieces = [dataset] + [_lagged(step) for step in range(1, maxlag + 1)]
    return pd.concat(pieces, axis=1).dropna()

In [11]:
def get_rmse(matrix1, matrix2):
    """Root-mean-square error between two equally shaped arrays.

    Computed as the square root of ``np.mean`` of the squared element-wise difference.
    """
    squared_error = np.power(matrix1 - matrix2, 2)
    return np.power(np.mean(squared_error), 0.5)

#### Train PCA on 2017 data

In [65]:
# Taxi-zone geometries; only the distinct location ids are used, as Python ints.
zone = gpd.read_file('../../Data/NYC Taxi Zones.geojson')
zones = [int(location_id) for location_id in zone['location_id'].unique()]

In [59]:
# Sanity check: number of distinct zone ids read from the geojson.
len(zones)

260

In [137]:
# Run configuration: the hub to model and the number of principal components to keep.
hub = 'JFK'
pca_comps = 6
# NOTE(review): absolute, user-specific path; the notebook only runs where this directory exists.
dataDir = '/home/mingyi/Dropbox/UrbanTemporalNetworks/processedData/'
# Hourly vehicle counts for the hub in 2017 (used to fit the scaler and the PCA).
file = dataDir + hub + 'VehicleByHour2017.csv'

In [140]:
# Load the 2017 counts, then pivot to one row per (Date, Hour) and one column per zone.
# `data2017` is rebound from the raw long table to the pivoted table.
data2017 = loadData(file)
data2017 = getTimeSeries(data2017,zones)
data2017.head()

Raw shape:  (923971, 4)
Days:  365


Unnamed: 0_level_0,DOLocationID,1,2,3,4,5,6,7,8,9,10,...,254,255,256,257,258,259,260,261,262,263
Date,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2017-01-01,0,0,0,0,1,0,0,2,0,0,6,...,0,3,0,2,0,0,1,0,2,0
2017-01-01,1,0,0,0,1,0,0,3,0,0,5,...,0,3,1,0,0,0,0,1,2,3
2017-01-01,2,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,1,0,0,0,0,0
2017-01-01,3,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2017-01-01,4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [141]:
# Standardize the 2017 zone counts column-wise, then fit the PCA on the standardized matrix.
matrix2017 = data2017.values.astype(np.float64)
scaler, s_matrix2017 = standardize(matrix2017)
# Fit on the matrix standardized above; the earlier name `s_network_matrix`
# is not defined anywhere in this notebook and fails on a fresh kernel.
pca,pca_data2017 = getPCAFeatures(s_matrix2017,n=pca_comps)

In [142]:
# Reconstruction R2 of the PCA on the standardized 2017 data it was fitted on.
# Uses the names defined in the fitting cell; `s_network_matrix` and
# `pca_network_data` are not defined anywhere in this notebook.
r2_score(s_matrix2017,pca.inverse_transform(pca_data2017))

0.6871691679742667

#### Preparing Data

In [143]:
# Re-point `file` at the hub's second counts file (the recorded output below shows 2018 dates).
hub = 'JFK'
file = dataDir + hub + 'VehicleByHour.csv'

In [144]:
# Load and pivot exactly as for 2017, so the columns line up with the zones the PCA was fitted on.
data = loadData(file)
data = getTimeSeries(data,zones)
data.shape

Raw shape:  (2260080, 4)
Days:  365


(8760, 260)

In [145]:
# Display the pivoted table (pandas truncates the rendering of large frames).
data

Unnamed: 0_level_0,DOLocationID,1,2,3,4,5,6,7,8,9,10,...,254,255,256,257,258,259,260,261,262,263
Date,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-01-01,0,1,0,0,1,0,0,5,0,0,7,...,0,2,4,0,0,2,3,0,5,6
2018-01-01,1,0,0,1,1,0,0,1,0,1,4,...,0,2,2,1,1,0,2,0,0,1
2018-01-01,2,0,0,0,1,0,0,1,0,0,1,...,0,0,1,0,1,0,1,0,2,0
2018-01-01,3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,2,0,0,0,0,1
2018-01-01,4,0,0,0,0,0,0,0,0,0,3,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-31,19,1,0,0,4,0,1,6,0,1,11,...,1,10,6,4,1,1,3,4,10,16
2018-12-31,20,4,0,0,0,1,0,11,0,0,19,...,0,14,12,3,1,1,6,3,12,14
2018-12-31,21,0,0,0,2,0,0,11,0,0,6,...,0,12,5,6,4,0,1,4,8,14
2018-12-31,22,0,0,3,5,0,0,2,0,1,12,...,2,7,8,4,2,2,0,3,13,9


In [167]:
# Standardize this year's counts and project them onto the components fitted on 2017.
# NOTE(review): `standardize` fits a new scaler on `data` and rebinds `scaler`, replacing the
# 2017 scaler, while `pca` is still the 2017 fit. Confirm that using this year's mean/std is intended.
matrix = data.values.astype(np.float64)
scaler, s_matrix = standardize(matrix)
# Component columns are named '1'..str(pca_comps).
pcaData = pd.DataFrame(pca.transform(s_matrix),columns=[str(i) for i in range(1,pca_comps+1)])

In [168]:
# Reconstruction R2 of the 2017-fitted PCA on this year's standardized data.
r2_score(s_matrix,pca.inverse_transform(pcaData))

0.2545875623420312

In [169]:
# Attach the (Date, Hour) index of `data` by position, then turn it into ordinary columns.
# Not idempotent: a second run fails because Date/Hour already exist as columns.
pcaData.index = data.index
pcaData = pcaData.reset_index()

In [170]:
# Location of the external per-hub feature file.
# NOTE(review): absolute, user-specific path.
externalDataDir = "/home/mingyi/Dropbox/UrbanTemporalNetworks/HongData/"
extFile = externalDataDir + hub.upper() + ".csv"

In [171]:
# Read the external hourly features and preview the first rows.
extDf = pd.read_csv(extFile)
print(extDf.shape)
extDf.head(2)

(8760, 46)


Unnamed: 0,date,arrival,fhv,yellow,vehicle,ifmon,iftue,ifwed,ifthu,iffri,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,18/1/1 0:00,6,263,174,437,1,0,0,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
1,18/1/1 1:00,6,138,133,271,1,0,0,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0


In [172]:
# The raw strings look like '18/1/1 0:00'; yearfirst=True makes the leading field the year.
# NOTE(review): no explicit `format=` is given, so parsing relies on inference — confirm every row uses this layout.
extDf['date'] = pd.to_datetime(extDf['date'], yearfirst=True)
extDf.head(2)

Unnamed: 0,date,arrival,fhv,yellow,vehicle,ifmon,iftue,ifwed,ifthu,iffri,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01 00:00:00,6,263,174,437,1,0,0,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
1,2018-01-01 01:00:00,6,138,133,271,1,0,0,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0


In [173]:
# Derive the merge keys (Date, Hour) and the day of week from the parsed timestamp.
extDf = extDf.assign(
    Hour=extDf['date'].dt.hour,
    Dow=extDf['date'].dt.dayofweek,
    Date=extDf['date'].dt.date,
)

In [174]:
# List the available external columns before choosing which ones to keep.
extDf.columns

Index(['date', 'arrival', 'fhv', 'yellow', 'vehicle', 'ifmon', 'iftue',
       'ifwed', 'ifthu', 'iffri', 'ifsat', 'ifsun', 'if0', 'if1', 'if2', 'if3',
       'if4', 'if5', 'if6', 'if7', 'if8', 'if9', 'if10', 'if11', 'if12',
       'if13', 'if14', 'if15', 'if16', 'if17', 'if18', 'if19', 'if20', 'if21',
       'if22', 'if23', 'maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd',
       'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow', 'Hour', 'Dow',
       'Date'],
      dtype='object')

In [175]:
# External columns kept for modelling: the merge keys, day of week, `arrival` and the weather fields.
# The one-hot day/hour indicator columns and the fhv/yellow/vehicle columns are left out.
selected_columns = ['Date', 'Hour', 'Dow', 'arrival','maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd',
       'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow']

In [176]:
# Take an explicit copy: a later cell assigns to extDf['Date'], and writing into a
# column slice of another frame triggers pandas' SettingWithCopy warning.
extDf = extDf[selected_columns].copy()

In [177]:
# Both frames should have the same number of rows before they are merged on (Date, Hour).
print(pcaData.shape)
print(extDf.shape)

(8760, 8)
(8760, 14)


In [178]:
# Give the Date key the same datetime dtype on both sides so the merge keys compare equal.
pcaData['Date'] = pd.to_datetime(pcaData['Date'])
extDf['Date'] = pd.to_datetime(extDf['Date'])

In [179]:
# Join the external features onto the component scores by (Date, Hour).
# An inner join silently drops unmatched hours, so the printed row count is the check that none were lost.
pcaData = pd.merge(pcaData,extDf, on=['Date', 'Hour'], how='inner')
print(pcaData.shape)

(8760, 20)


In [180]:
# Columns available after the merge.
pcaData.columns

Index(['Date', 'Hour', '1', '2', '3', '4', '5', '6', 'Dow', 'arrival',
       'maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd', 'cdd',
       'participation', 'newsnow', 'snowdepth', 'ifSnow'],
      dtype='object')

In [181]:
# Component columns are named '1'..str(pca_comps) when pcaData is built, so derive
# the names from pca_comps instead of hard-coding six of them.
targetColumns = [str(i) for i in range(1, pca_comps + 1)]

# Columns that receive lagged copies: every component plus the `arrival` column.
lagColumns = targetColumns + ['arrival']

# Dropped from the feature matrix before fitting.
DateColumns = ['Date']

In [182]:
# Number of lags to add, counted in rows (one row per hour in this table).
maxlag = 12

# Adds <col>_lag_k for k = 1..maxlag; the first `maxlag` rows are dropped because their lags are NaN.
pcaData_lag = addLag(pcaData, maxlag, lagColumns)

pcaData_lag.shape

(8748, 104)

In [183]:
# Zone ids that label the edge-level columns; reused to label the reconstructed predictions.
data.columns

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            254, 255, 256, 257, 258, 259, 260, 261, 262, 263],
           dtype='int64', name='DOLocationID', length=260)

In [184]:
# Display the lagged modelling table (pandas truncates the rendering of large frames).
pcaData_lag

Unnamed: 0,Date,Hour,1,2,3,4,5,6,Dow,arrival,...,5_lag_11,6_lag_11,arrival_lag_11,1_lag_12,2_lag_12,3_lag_12,4_lag_12,5_lag_12,6_lag_12,arrival_lag_12
12,2018-01-01,12,-1.110305,-1.342545,-0.623155,-0.558908,1.306811,-0.042053,0,10,...,-3.088888,-0.873562,6.0,-2.419313,4.063260,2.199953,0.891153,0.072451,0.887434,6.0
13,2018-01-01,13,0.854118,-1.361110,-1.258472,-1.013831,0.070432,0.525363,0,18,...,-3.527314,-1.457569,2.0,-5.001748,5.086650,1.925587,2.254057,-3.088888,-0.873562,6.0
14,2018-01-01,14,5.228073,-5.854057,-3.593858,-1.895864,3.079875,0.651756,0,19,...,-3.275441,-1.845805,0.0,-10.426646,4.205531,1.816411,0.904384,-3.527314,-1.457569,2.0
15,2018-01-01,15,6.247122,-5.618696,-4.819201,0.753751,4.695155,3.000407,0,28,...,-2.832837,-2.115050,2.0,-11.093022,4.458995,2.105157,0.659788,-3.275441,-1.845805,0.0
16,2018-01-01,16,4.576129,-7.912886,-2.708895,-0.506824,2.972346,2.339817,0,15,...,0.681130,0.895215,16.0,-10.572210,3.928386,2.443999,0.369240,-2.832837,-2.115050,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2018-12-31,19,10.747024,-0.317093,-0.482881,1.500272,3.569166,3.203719,0,12,...,2.129562,2.083073,13.0,1.058712,-0.326362,-0.816617,1.966757,1.098893,-0.036642,17.0
8756,2018-12-31,20,6.809809,2.038721,-1.794762,0.826252,3.216569,3.084644,0,19,...,0.402391,-0.957854,16.0,3.709604,-1.551246,-0.401087,1.153687,2.129562,2.083073,13.0
8757,2018-12-31,21,7.405710,1.702411,0.760854,3.809354,5.486548,2.297620,0,20,...,0.938336,-1.157004,13.0,-1.477121,1.809927,1.899087,0.690607,0.402391,-0.957854,16.0
8758,2018-12-31,22,7.528901,2.169684,3.911510,2.701177,3.873778,2.316298,0,16,...,0.557315,-0.051526,9.0,-1.452640,0.393830,0.929880,1.803023,0.938336,-1.157004,13.0


In [185]:
# Leave-one-month-out evaluation: for each month, train a random forest on the other
# months to predict the PCA component scores, then map the predictions back to
# per-zone counts and score them against the observed counts.
CommR2List = []        # per-month test R2 in component space
EdgeR2List = []        # per-month variance-weighted R2 in zone (edge) space
residualDf_list = []   # per-month component residuals with Date/Hour
rawList = []
prediction_frames = [] # per-month zone-level predictions, concatenated once after the loop

# addLag drops the first `maxlag` rows, so row k of pcaData_lag is assumed to be
# row k + maxlag of `data`; the edge-level mask below depends on that.
if len(pcaData_lag) + maxlag != len(data):
    raise ValueError("pcaData_lag is not `data` minus its first maxlag rows; "
                     "cannot align predictions with observed counts")

# Month of every lagged row, computed once instead of once per fold.
row_month = pd.to_datetime(pcaData_lag.Date).dt.month

for m in range(1,13):
    print()

    print("month: ",m)
    month_index = row_month == m

    dataset_train = pcaData_lag[~month_index]
    dataset_test = pcaData_lag[month_index]
    print("Train Size: ",dataset_train.shape)
    print("Test Size: ",dataset_test.shape)

    # Features are everything except the current-hour component scores and the date.
    X_train = dataset_train.drop(targetColumns+DateColumns , axis = 1)
    X_test = dataset_test.drop(targetColumns+DateColumns , axis = 1)
    y_train = dataset_train[targetColumns]
    y_test = dataset_test[targetColumns]

    rf2 = RandomForestRegressor(random_state = 2019, n_estimators=150,
                               min_samples_split=3,
                               min_samples_leaf= 2,
                               max_features= 'sqrt',
                               max_depth= None,
                               bootstrap= False)

    rf2.fit(X_train,y_train)

    print("Train R2: ",rf2.score(X_train,y_train))
    test_r2 = rf2.score(X_test,y_test)
    print("Test R2: ",test_r2)

    pca_prediction = rf2.predict(X_test)

    # Component-space residuals, kept with their Date/Hour keys.
    residual = y_test - pca_prediction
    residual_df = dataset_test[['Date','Hour']]
    residual_df = pd.concat([residual_df,pd.DataFrame(residual)], axis =1)

    # Back to zone space: undo the PCA, then undo the standardization.
    network_prediction = inverse_pca(pca_prediction,pca)
    network_prediction = inverse_standardize(network_prediction, scaler)

    # NOTE: predictions are neither clipped nor rounded here, so they may be negative.
    network_prediction_df = pd.DataFrame(network_prediction, columns=data.columns)
    prediction_frames.append(network_prediction_df)

    # Positional mask over `data`: skip the rows removed by addLag, then select this month.
    edgeMonthIndex = [False] * maxlag + list(month_index)
    edge_r2 = r2_score(data[edgeMonthIndex], network_prediction, multioutput='variance_weighted')
    print("Edge R2: ",edge_r2)

    CommR2List.append(test_r2)
    EdgeR2List.append(edge_r2)
    residualDf_list.append(residual_df)

# A single concatenation in fold order (months 1..12) instead of growing a frame inside the loop.
networkPrediction = pd.concat(prediction_frames)


month:  1
Train Size:  (8016, 104)
Test Size:  (732, 104)
Train R2:  0.9855165191428702
Test R2:  0.7692240031897699
Edge R2:  0.4044661954613551

month:  2
Train Size:  (8076, 104)
Test Size:  (672, 104)
Train R2:  0.9853125744135589
Test R2:  0.7761582571298322
Edge R2:  0.44648954318070294

month:  3
Train Size:  (8004, 104)
Test Size:  (744, 104)
Train R2:  0.9851496680882261
Test R2:  0.8123203194920439
Edge R2:  0.4692988337279475

month:  4
Train Size:  (8028, 104)
Test Size:  (720, 104)
Train R2:  0.9853806716511504
Test R2:  0.8119027155583844
Edge R2:  0.4664384366380724

month:  5
Train Size:  (8004, 104)
Test Size:  (744, 104)
Train R2:  0.985145475820985
Test R2:  0.8198180456076628
Edge R2:  0.4710568125615293

month:  6
Train Size:  (8028, 104)
Test Size:  (720, 104)
Train R2:  0.9851799994558399
Test R2:  0.8193104331624613
Edge R2:  0.46820236718558833

month:  7
Train Size:  (8004, 104)
Test Size:  (744, 104)
Train R2:  0.9852876332795035
Test R2:  0.790157026058725


In [186]:
# Label the predictions with the (Date, Hour) of the rows that survived addLag.
# The offset is `maxlag`, not a hard-coded 12, so the two stay in step.
# Assignment is positional (.values): predictions were appended in month order.
networkPrediction['Date'] = data.index.get_level_values('Date')[maxlag:].values
networkPrediction['Hour'] = data.index.get_level_values('Hour')[maxlag:].values
networkPrediction.to_csv('/home/mingyi/Dropbox/UrbanTemporalNetworks/prediction/%sPCA2017%s.csv'%(hub,pca_comps),index=False)

In [187]:
# Average over the held-out months: component-space R2 first, then edge-level R2.
print(np.mean(CommR2List))
print(np.mean(EdgeR2List))

0.7989982498911402
0.45383918802091827


In [58]:
# Stack the per-month residual frames and write them to disk.
# NOTE(review): the file name contains 'Roundup', but the step that rounds negative
# predictions up to 0 is commented out in the evaluation loop — confirm the name is still accurate.
res_df = pd.concat(residualDf_list, axis = 0)
res_df.to_csv('../../Resid/'+hub+'PCARoundup'+str(pca_comps)+'RFCV.csv',index=False)