In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from scipy import sparse

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor
import warnings
warnings.filterwarnings('ignore')

### Helper Functions

In [2]:
def loadData(file):
    """Read a CSV into a DataFrame and parse its ``Date`` column as datetimes.

    Prints the raw shape and the number of distinct dates as a quick sanity check.

    Parameters
    ----------
    file : str or file-like
        Anything accepted by ``pd.read_csv``; the data must contain a ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with ``Date`` converted by ``pd.to_datetime``.
    """
    frame = pd.read_csv(file)
    print('Raw shape: ', frame.shape)
    frame['Date'] = pd.to_datetime(frame['Date'])
    n_days = len(frame['Date'].unique())
    print('Days: ', n_days)
    return frame

In [3]:
def getTimeSeries(df):
    """Pivot long-format counts into a wide table of time step by drop-off zone.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Date``, ``Hour``, ``Min``, ``DOLocationID``
        and ``vehicle_count``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(Date, Hour, Min)`` with one column per ``DOLocationID``.
        Each cell is the sum of ``vehicle_count``; combinations that do not
        occur in ``df`` are filled with 0.
    """
    return pd.pivot_table(
        df,
        values='vehicle_count',
        index=['Date', 'Hour', 'Min'],
        columns=['DOLocationID'],
        # String alias instead of np.sum: passing the NumPy callable is
        # deprecated in recent pandas and yields the same aggregation.
        aggfunc='sum',
        fill_value=0,
    )

In [4]:
def zscoreNormalizeSpatial(matrix):
    """Z-score every row of a 2-D array independently.

    Each row has its own mean subtracted and is divided by its own
    (population) standard deviation plus ``1e-10``, so constant rows map to
    zeros instead of dividing by zero.

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D array; it is not modified.

    Returns
    -------
    numpy.ndarray
        New array of the same shape. Integer and boolean input is promoted to
        ``float64`` first; writing the scores back into an integer array
        would silently truncate them.
    """
    m = np.array(matrix, copy=True)
    if not np.issubdtype(m.dtype, np.inexact):
        m = m.astype(np.float64)

    # keepdims keeps the per-row statistics as column vectors so they broadcast.
    row_mean = m.mean(axis=1, keepdims=True)
    row_std = m.std(axis=1, keepdims=True)
    return (m - row_mean) / (row_std + 1e-10)

In [5]:
def standardize(matrix):
    """Standardize ``matrix`` with a freshly fitted ``StandardScaler``.

    Parameters
    ----------
    matrix : array-like with a ``copy()`` method
        Data to scale; the scaler is fitted on, and applied to, a copy of it.

    Returns
    -------
    tuple
        ``(scaler, transformed)``: the fitted scaler, needed later to undo
        the scaling, and the transformed data.
    """
    scaler = StandardScaler()
    standardized = scaler.fit_transform(matrix.copy())
    return scaler, standardized
def inverse_standardize(matrix, scaler):
    """Undo a scaling by calling ``scaler.inverse_transform`` on a copy of ``matrix``.

    Parameters
    ----------
    matrix : array-like with a ``copy()`` method
        Data in the scaled space.
    scaler : object
        Fitted scaler exposing ``inverse_transform``.

    Returns
    -------
    Whatever ``scaler.inverse_transform`` returns for the copied input.
    """
    return scaler.inverse_transform(matrix.copy())
def getPCAFeatures(matrix, n=10):
    """Fit a PCA on ``matrix`` and return it together with the projected data.

    Parameters
    ----------
    matrix : array-like
        Data to decompose, one observation per row.
    n : int, default 10
        Passed to ``PCA`` as ``n_components``.

    Returns
    -------
    tuple
        ``(pca, reducedDf)``: the fitted ``PCA`` and a DataFrame of the
        projection with one column per component, named with the strings
        ``'1'``, ``'2'``, ... and a default integer index.
    """
    pca = PCA(n_components=n)
    pca.fit(matrix)
    scores = pca.transform(matrix)

    component_names = [str(k) for k in range(1, scores.shape[1] + 1)]
    reducedDf = pd.DataFrame(scores, columns=component_names)
    return pca, reducedDf


In [6]:
def inverse_pca(matrix,pca):
    """Map component scores back to the original feature space.

    Parameters
    ----------
    matrix : array-like with a ``copy()`` method
        Component scores; a copy is passed on, so the input is left untouched.
    pca : object
        Fitted decomposition exposing ``inverse_transform``.

    Returns
    -------
    Whatever ``pca.inverse_transform`` returns for the copied input.
    """
    return pca.inverse_transform(matrix.copy())

In [7]:
def addLag(dataset, maxlag, lagColumns):
    """Append lagged copies of selected columns and drop incomplete rows.

    For every lag ``k`` in ``1..maxlag`` the columns in ``lagColumns`` are
    shifted down by ``k`` rows and added as ``<column>_lag_<k>``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source frame; rows are assumed to already be in time order (shifting
        is purely positional).
    maxlag : int
        Largest lag to add. Values below 1 add no lag columns.
    lagColumns : list
        Labels of the columns to lag.

    Returns
    -------
    pandas.DataFrame
        ``dataset`` plus the lag columns, with every row that contains any
        NaN removed. This always removes the first ``maxlag`` rows, and also
        any row that already had a missing value in ``dataset``.
    """
    # Select once so that only the lagged columns are shifted, not the whole frame.
    lag_source = dataset[lagColumns]

    frames = [dataset]
    for lag in range(1, maxlag + 1):
        # add_suffix formats each label as a string, so non-string column
        # labels (e.g. integers) are handled as well.
        frames.append(lag_source.shift(lag).add_suffix('_lag_' + str(lag)))

    return pd.concat(frames, axis=1).dropna()

In [8]:
def get_rmse(matrix1, matrix2):
    """Root-mean-square error between two inputs of compatible shape.

    The difference is squared element-wise, averaged with ``np.mean`` and the
    square root of that average is returned.
    """
    residual = matrix1 - matrix2
    mean_squared_error = np.mean(np.power(residual, 2))
    return np.power(mean_squared_error, 0.5)

#### Preparing Data

In [9]:
# Run configuration for this notebook.
hub = 'JFK'                       # hub name; used to build the input and output file names
tune_hyp_params = False           # not read by any cell shown in this notebook
pca_comps = 6                     # number of principal components kept
granularity = str(30) + 'Min'     # time-bin label, here '30Min'

In [10]:
# Location of the per-hub vehicle-count CSV for the configured hub and bin width.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
dataDir = '/home/mingyi/Dropbox/UrbanTemporalNetworks/processedData/'
file = dataDir + hub + 'VehicleBy'+granularity+'.csv'

In [11]:
# file = '/home/urwa/Documents/Projects/NYU Remote/project/data/JfkVehiceByHour.csv'

In [12]:
# Load the long-format counts; loadData prints the raw shape and the number of distinct days.
data = loadData(file)

Raw shape:  (1931102, 5)
Days:  365


In [13]:
# Pivot to one row per (Date, Hour, Min) and one column per DOLocationID.
# This rebinds `data` to the pivoted table, so the cell cannot be re-run on its own:
# re-run the load cell first.
data = getTimeSeries(data)

In [14]:
# Pivoted counts as a plain float64 array (rows = time steps, columns = drop-off zones).
matrix = data.values.astype(np.float64)

In [15]:
# Standardize every zone column; `scaler` is kept to map values back to count space later.
scaler, s_matrix = standardize(matrix)

In [34]:
# In-sample PCA reconstruction quality: fit on every row, project, reconstruct,
# undo the standardization and score against the raw counts.
# The PCA gets its own name so that (re-)running this diagnostic cell never replaces
# the global `pca` that the forecasting cells use to invert their predictions.
pca_in_sample = PCA(n_components=pca_comps)

pca_in_sample.fit(s_matrix)
train_matrix = pca_in_sample.transform(s_matrix)
train_matrix_re = inverse_pca(train_matrix, pca_in_sample)
train_matrix_re_re = inverse_standardize(train_matrix_re, scaler)
r2_score(matrix,train_matrix_re_re,multioutput='variance_weighted')

0.53338325606503

In [36]:
# Out-of-sample PCA reconstruction quality: fit on the first 90% of the rows and
# reconstruct the remaining 10%.
# The PCA gets its own name so that (re-)running this diagnostic cell never replaces
# the global `pca` that the forecasting cells use to invert their predictions.
# NOTE(review): `scaler` appears to be fitted on all rows, including the held-out
# 10%, so this check is not strictly out-of-sample — confirm whether that matters.
pca_holdout = PCA(n_components=pca_comps)
train_size = int(s_matrix.shape[0]*0.9)
pca_holdout.fit(s_matrix[:train_size])
test_matrix = pca_holdout.transform(s_matrix[train_size:])
test_matrix_re = inverse_pca(test_matrix, pca_holdout)
test_matrix_re_re = inverse_standardize(test_matrix_re, scaler)
r2_score(matrix[train_size:],test_matrix_re_re,multioutput='variance_weighted')

0.5474155825259369

In [16]:
# Fit the PCA on the full standardized matrix. `pca` is used later to map predicted
# components back to zone space; `pcaData` holds the component scores.
pca,pcaData = getPCAFeatures(s_matrix,n=pca_comps)

In [17]:
# Attach the (Date, Hour, Min) keys of `data` to the component scores and save them.
# Only the component columns are taken from `pcaData`, and the index is set on a
# copy, so re-running this cell gives the same result instead of failing because
# the key columns already exist.
component_cols = [str(i) for i in range(1, pca_comps + 1)]
pca_scores = pcaData[component_cols].copy()
pca_scores.index = data.index
pcaData = pca_scores.reset_index()

pcaData.to_csv('../../processedData/%spca%s.csv'%(hub.upper(),pca_comps),
               index=False)

In [18]:
# Location of the external-features CSV for this hub at the chosen bin width.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
externalDataDir = "/home/mingyi/Dropbox/UrbanTemporalNetworks/HongData/"+granularity+'/'
extFile = externalDataDir + hub.upper() + ".csv"

In [19]:
# Load the external features and preview the first two rows.
extDf = pd.read_csv(extFile)
print(extDf.shape)
extDf.head(2)

(17520, 48)


Unnamed: 0,Date,Hour,Min,arrival,fhv,yellow,vehicle,ifmon,iftue,ifwed,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01,0,0,3.0,263,174,437,1,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
1,2018-01-01,0,30,3.0,263,174,437,1,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0


In [20]:
# Parse the Date strings as datetimes, reading ambiguous strings year-first.
extDf['Date'] = pd.to_datetime(extDf['Date'], yearfirst=True)
extDf.head(2)

Unnamed: 0,Date,Hour,Min,arrival,fhv,yellow,vehicle,ifmon,iftue,ifwed,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01,0,0,3.0,263,174,437,1,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
1,2018-01-01,0,30,3.0,263,174,437,1,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0


In [21]:
# List the available external columns to choose the model inputs from.
extDf.columns

Index(['Date', 'Hour', 'Min', 'arrival', 'fhv', 'yellow', 'vehicle', 'ifmon',
       'iftue', 'ifwed', 'ifthu', 'iffri', 'ifsat', 'ifsun', 'if0', 'if1',
       'if2', 'if3', 'if4', 'if5', 'if6', 'if7', 'if8', 'if9', 'if10', 'if11',
       'if12', 'if13', 'if14', 'if15', 'if16', 'if17', 'if18', 'if19', 'if20',
       'if21', 'if22', 'if23', 'maxtemp', 'mintemp', 'avgtemp', 'departure',
       'hdd', 'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow'],
      dtype='object')

In [22]:
# External columns carried into the model: the merge keys, the `arrival` column and
# the weather-related columns.
selected_columns = [
    'Date', 'Hour', 'Min',
    'arrival',
    'maxtemp', 'mintemp', 'avgtemp', 'departure',
    'hdd', 'cdd',
    'participation', 'newsnow', 'snowdepth', 'ifSnow',
]

In [23]:
# Keep only the selected columns. The explicit copy makes `extDf` an independent
# frame, so the later in-place assignment to extDf['Date'] does not write into a
# slice of the original frame (SettingWithCopyWarning).
extDf = extDf[selected_columns].copy()

In [24]:
# Compare row counts of the two frames before merging them.
print(pcaData.shape)
print(extDf.shape)

(17518, 9)
(17520, 14)


In [25]:
# Bring both Date keys to midnight-normalized datetimes so the merge keys compare equal.
# .dt.normalize() zeroes the time-of-day directly, replacing the slower round trip
# through Python `date` objects and back.
pcaData['Date'] = pd.to_datetime(pcaData['Date'])
extDf['Date'] = pd.to_datetime(extDf['Date']).dt.normalize()

In [26]:
# Preview of the merge only: the result is displayed, not assigned.
pcaData.merge(extDf, on=['Date', 'Hour','Min'], how='left').sort_values(by=['Date','Hour','Min'])

Unnamed: 0,Date,Hour,Min,1,2,3,4,5,6,arrival,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01,0,0,-2.057730,-4.711828,1.521514,1.063268,0.006574,-0.322672,3.0,18,7,12.5,-21.2,52,0,0.00,0.0,0,0
1,2018-01-01,0,30,-4.418915,-2.983973,0.540287,0.318801,-0.517975,-1.387351,3.0,18,7,12.5,-21.2,52,0,0.00,0.0,0,0
2,2018-01-01,1,0,-5.189840,-4.089339,-0.444477,1.567658,-1.777515,0.692551,4.0,18,7,12.5,-21.2,52,0,0.00,0.0,0,0
3,2018-01-01,1,30,-8.386220,-1.333953,1.023051,0.447744,-1.087052,-0.217918,2.0,18,7,12.5,-21.2,52,0,0.00,0.0,0,0
4,2018-01-01,2,0,-9.390652,-1.140547,1.554639,0.399773,-0.720378,-0.001228,1.0,18,7,12.5,-21.2,52,0,0.00,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17513,2018-12-31,21,30,8.100723,-6.820958,2.324746,-0.626444,-0.270728,-1.883929,7.0,49,30,39.5,5.6,25,0,1.23,0.0,0,0
17514,2018-12-31,22,0,4.929631,-6.131226,-0.128409,0.424989,0.534639,-0.013831,6.0,49,30,39.5,5.6,25,0,1.23,0.0,0,0
17515,2018-12-31,22,30,6.135447,-7.281046,0.503391,0.614796,-0.983932,-0.665315,8.0,49,30,39.5,5.6,25,0,1.23,0.0,0,0
17516,2018-12-31,23,0,4.234694,-7.937970,-1.334994,3.634372,-0.539482,-2.135490,2.0,49,30,39.5,5.6,25,0,1.23,0.0,0,0


In [27]:
# Left-join the external features onto the component scores.
merge_keys = ['Date', 'Hour', 'Min']
ext_feature_cols = [c for c in extDf.columns if c not in merge_keys]
pcaData = (
    pcaData
    # Drop external columns left by a previous run of this cell, so a re-run
    # does not produce suffixed duplicate columns.
    .drop(columns=ext_feature_cols, errors='ignore')
    # validate: raise if a key occurs more than once in extDf, which would
    # duplicate rows and break the positional alignment with `data` used later.
    .merge(extDf, on=merge_keys, how='left', validate='many_to_one')
    .sort_values(by=merge_keys)
)
print(pcaData.shape)

(17518, 20)


In [28]:
# Column groups used by the lagging and modelling cells.
DateColumns = ['Date', 'Hour', 'Min']

# PCA component columns are named '1'..str(pca_comps); they are the targets.
targetColumns = [str(k) for k in range(1, pca_comps + 1)]

# Lag features are built from the components plus the `arrival` column.
lagColumns = targetColumns + ['arrival']

In [29]:
# Number of lag steps covering 12 hours at the configured bin width.
bin_minutes = int(granularity.replace('Min',''))
maxlag = int(12 * 60 / bin_minutes)

pcaData_lag = addLag(pcaData, maxlag, lagColumns)

pcaData_lag.shape

(17494, 188)

In [32]:
# Leave-one-month-out evaluation: for each calendar month, fit a random forest on
# all other rows, predict the held-out month's PCA components, and map the
# prediction back to per-zone counts through the inverse PCA and inverse scaling.
# NOTE(review): `pca` and `scaler` are whatever earlier cells left in the kernel;
# they appear to be fitted on the full matrix, held-out month included — confirm
# that this is intended.
CommR2List = []       # held-out R2 of the forest on the component targets, one per month
EdgeR2List = []       # variance-weighted R2 of the reconstructed counts, one per month
residualDf_list = []  # per-month frames of Date/Hour/Min plus component residuals
rawList = []          # never filled in this cell; kept so the same names stay defined
monthly_network_predictions = []

# `data` is masked positionally below: the mask is `maxlag` leading False values
# followed by the month mask of pcaData_lag. Fail early, with a clear message, if
# the two frames do not line up that way.
assert len(data) == maxlag + len(pcaData_lag), \
    "pcaData_lag must be `data` minus its first `maxlag` rows"

row_month = pd.to_datetime(pcaData_lag.Date).dt.month  # loop-invariant, computed once
lag_padding = [False] * maxlag

for m in range(1,13):
    print()

    print("month: ",m)
    month_index = row_month == m

    dataset_train = pcaData_lag[~month_index]
    dataset_test = pcaData_lag[month_index]
    print("Train Size: ",dataset_train.shape)
    print("Test Size: ",dataset_test.shape)

    # Features: everything except the current components and the date keys.
    X_train = dataset_train.drop(targetColumns+DateColumns , axis = 1)
    X_test = dataset_test.drop(targetColumns+DateColumns , axis = 1)
    y_train = dataset_train[targetColumns]
    y_test = dataset_test[targetColumns]

    rf2 = RandomForestRegressor(random_state = 2019, n_estimators=150,
                               min_samples_split=3,
                               min_samples_leaf= 2,
                               max_features= 'sqrt',
                               max_depth= None,
                               bootstrap= False)

    rf2.fit(X_train,y_train)

    print("Train R2: ",rf2.score(X_train,y_train))
    test_r2 = rf2.score(X_test,y_test)
    print("Test R2: ",test_r2)

    pca_prediction = rf2.predict(X_test)

    # Residuals in component space, kept alongside their time keys.
    residual = y_test - pca_prediction
    residual_df = dataset_test[['Date','Hour','Min']]
    residual_df = pd.concat([residual_df,pd.DataFrame(residual)], axis =1)

    # Components -> standardized zone values -> raw counts.
    # Predictions are not clipped, so negative counts are possible.
    network_prediction = inverse_pca(pca_prediction,pca)
    network_prediction = inverse_standardize(network_prediction, scaler)

    network_prediction_df = pd.DataFrame(network_prediction)
    network_prediction_df.columns = data.columns
    monthly_network_predictions.append(network_prediction_df)

    edgeMonthIndex = lag_padding + list(month_index)
    edge_r2 = r2_score(data[edgeMonthIndex], network_prediction, multioutput='variance_weighted')
    print("Edge R2: ",edge_r2)

    CommR2List.append(test_r2)
    EdgeR2List.append(edge_r2)
    residualDf_list.append(residual_df)

# One concat after the loop instead of re-copying the accumulated frame every month.
networkPrediction = pd.concat(monthly_network_predictions)


month:  1
Train Size:  (16030, 188)
Test Size:  (1464, 188)
Train R2:  0.9840219738014176
Test R2:  0.718796539792179
Edge R2:  0.39325689827650834

month:  2
Train Size:  (16150, 188)
Test Size:  (1344, 188)
Train R2:  0.9838777518847927
Test R2:  0.7370834276458296
Edge R2:  0.4199872093521893

month:  3
Train Size:  (16008, 188)
Test Size:  (1486, 188)
Train R2:  0.9838312790969316
Test R2:  0.7881190240055549
Edge R2:  0.4400925329914415

month:  4
Train Size:  (16054, 188)
Test Size:  (1440, 188)
Train R2:  0.984002416090671
Test R2:  0.7634757484348242
Edge R2:  0.45787417248452134

month:  5
Train Size:  (16006, 188)
Test Size:  (1488, 188)
Train R2:  0.9837378484485242
Test R2:  0.7895447806400643
Edge R2:  0.4665577374872785

month:  6
Train Size:  (16054, 188)
Test Size:  (1440, 188)
Train R2:  0.9838065919389605
Test R2:  0.7883601883533851
Edge R2:  0.4308644632496977

month:  7
Train Size:  (16006, 188)
Test Size:  (1488, 188)
Train R2:  0.9838516871979379
Test R2:  0.763

In [41]:
# Display the stacked per-month predictions (one column per DOLocationID).
networkPrediction

DOLocationID,1,2,3,4,5,6,7,8,9,10,...,254,255,256,257,258,259,260,261,262,263
0,0.524073,0.001316,0.141263,0.651923,0.006039,0.059001,1.742082,0.019440,0.232477,2.482570,...,0.147316,2.910346,1.641870,0.475509,0.682196,0.185961,0.543583,1.658890,1.546873,1.988314
1,0.599961,0.002257,0.156526,0.871217,0.021931,0.052465,2.209131,0.015840,0.285590,2.940575,...,0.175218,3.744718,2.144423,0.629075,0.760526,0.205354,0.635560,1.995869,2.032622,2.660907
2,0.611432,0.002831,0.150824,1.090923,0.033899,0.054468,2.710191,0.016644,0.324522,3.446197,...,0.226413,4.514959,2.666638,0.748452,0.870857,0.263748,0.745045,2.219103,2.523505,3.335058
3,0.731271,0.003717,0.179181,1.252991,0.026655,0.076642,3.053300,0.016653,0.381165,3.826889,...,0.263393,5.410343,3.099787,0.838651,0.890028,0.303154,0.788185,2.789346,2.949660,3.886605
4,0.762024,0.002922,0.180573,1.319592,0.037626,0.079343,3.195326,0.021606,0.387240,3.992496,...,0.261690,5.565546,3.217789,0.882850,0.950185,0.310825,0.841841,2.847107,3.055416,4.041199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1483,0.520671,0.001528,0.241973,1.643538,0.014787,0.067901,3.897051,0.021631,0.508474,5.115286,...,0.294288,5.763523,3.823612,1.028319,1.378204,0.338942,1.017055,2.144344,3.388558,4.645837
1484,0.451308,0.002011,0.221701,1.558902,0.012055,0.061723,3.675499,0.017611,0.471433,4.855683,...,0.275241,5.409156,3.630065,0.962979,1.334572,0.321301,0.963939,2.019584,3.167058,4.381377
1485,0.393088,0.001352,0.232465,1.485567,0.022017,0.056647,3.523315,0.023123,0.404873,4.844549,...,0.293656,5.007810,3.410682,0.892753,1.406123,0.313598,0.930320,1.878363,2.940816,4.077461
1486,0.420605,0.001939,0.237991,1.328653,0.018520,0.054839,3.197485,0.016966,0.386593,4.443257,...,0.287034,4.517919,3.075262,0.816477,1.299058,0.288818,0.846807,1.736997,2.632742,3.637346


In [42]:
# Observed counts with their time keys, skipping the first `maxlag` rows.
data.reset_index().iloc[maxlag:]

DOLocationID,Date,Hour,Min,1,2,3,4,5,6,7,...,254,255,256,257,258,259,260,261,262,263
24,2018-01-01,12,0,0,0,0,0,0,0,2,...,0,4,0,1,2,0,1,1,3,5
25,2018-01-01,12,30,1,0,0,2,0,0,4,...,0,1,3,0,0,0,1,1,4,8
26,2018-01-01,13,0,0,0,0,1,1,0,1,...,0,3,1,0,0,0,3,1,1,1
27,2018-01-01,13,30,1,0,0,1,0,0,2,...,0,3,1,3,0,0,0,1,7,3
28,2018-01-01,14,0,1,0,0,2,0,1,4,...,0,8,10,0,1,0,0,3,5,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17513,2018-12-31,21,30,0,0,0,1,0,0,9,...,0,7,2,5,2,0,1,1,5,9
17514,2018-12-31,22,0,0,0,2,5,0,0,0,...,2,3,3,2,1,1,0,0,9,4
17515,2018-12-31,22,30,0,0,1,0,0,0,2,...,0,4,5,2,1,1,0,3,4,5
17516,2018-12-31,23,0,0,0,2,1,0,0,0,...,0,6,0,3,3,0,1,0,2,4


In [43]:
# Attach the time keys to the predictions and save them.
# The keys are assigned positionally: networkPrediction is assumed to hold the
# months in chronological order, matching `data` after its first `maxlag` rows.
time_keys = data.reset_index().iloc[maxlag:]
networkPrediction['Date'] = time_keys['Date'].values
networkPrediction['Hour'] = time_keys['Hour'].values
# 'Min' is written as well: at sub-hourly bin widths Date and Hour alone do not
# identify a row.
networkPrediction['Min'] = time_keys['Min'].values
networkPrediction.to_csv('/home/mingyi/Dropbox/UrbanTemporalNetworks/prediction/%sPCA%sComp%s.csv'%(hub,pca_comps,granularity),index=False)

In [37]:
# Average of the per-month scores: component-space R2 first, then count-space R2.
print(np.mean(CommR2List))
print(np.mean(EdgeR2List))

0.7674292599350928
0.4372554573024243


In [45]:
# Stack the per-month residual frames row-wise and save them.
res_df = pd.concat(residualDf_list, axis = 0)
res_df.to_csv('../../Resid/%sPCA%sComp%s.csv'%(hub,pca_comps,granularity),index=False)