In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from scipy import sparse

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV

from sklearn.multioutput import MultiOutputRegressor
import warnings
warnings.filterwarnings('ignore')

### Helper Functions

In [2]:
def loadData(file):
    data = pd.read_csv(file)
    print('Raw shape: ',data.shape)
    data['Date'] = pd.to_datetime(data.Date)
    print('Days: ',len(set(data.Date)))
    return data

In [3]:
def getTimeSeries(df):
    table = pd.pivot_table(df, values='vehicle_count', index=['Date','Hour','Min'],
                    columns=['DOLocationID'], aggfunc=np.sum, fill_value=0)
    return table

In [4]:
def zscoreNormalizeSpatial(matrix):
    m = matrix.copy()
    for i in range(m.shape[0]):
        m[i, :] = (m[i, :] - m[i, :].mean()) / (m[i, :].std()+1e-10)
        
    return m

In [5]:
def standardize(matrix):
    m = matrix.copy()
    scaler = StandardScaler()
    scaler.fit(m)
    t = scaler.transform(m)
    return scaler, t
def inverse_standardize(matrix, scaler):
    t = matrix.copy()
    return scaler.inverse_transform(t)
def getPCAFeatures(matrix, n=10):
    pca = PCA(n_components=n)
    pca.fit(matrix)
    reducedMatrixPCA = pca.transform(matrix)
    reducedMatrixPCA.shape

    reducedDict = {str(i+1):reducedMatrixPCA[:,i] for i in range(reducedMatrixPCA.shape[1])}
    reducedDf = pd.DataFrame(reducedDict)
    #reducedDf.index = index
    return pca,reducedDf


In [6]:
def inverse_pca(matrix,pca):
    m = matrix.copy()
    return pca.inverse_transform(m)

In [7]:
def addLag(dataset, maxlag, lagColumns):
    dataset_list = [dataset]

    for l in range(1, maxlag+1):
        df = dataset.shift(l)
        df = df[lagColumns]
        df.columns = [c+'_lag_'+str(l) for c in df.columns]
        dataset_list.append(df)

    dataset = pd.concat(dataset_list, axis=1).dropna()
    return dataset

In [8]:
def get_rmse(matrix1, matrix2):
    sumSquareError = np.mean(np.power(matrix1 - matrix2,2))
    rmse = np.power(sumSquareError,0.5)
    return rmse

#### Preparing Data

In [9]:
hub = 'JFK'
tune_hyp_params = False
pca_comps = 6
granularity = 30
granularity = str(granularity)+'Min'

In [10]:
dataDir = '/home/mingyi/Dropbox/UrbanTemporalNetworks/processedData/'
file = dataDir + hub + 'VehicleBy'+granularity+'.csv'

In [11]:
# file = '/home/urwa/Documents/Projects/NYU Remote/project/data/JfkVehiceByHour.csv'

In [12]:
data = loadData(file)

Raw shape:  (1931102, 5)
Days:  365


In [13]:
data = getTimeSeries(data)
data = data.reset_index()

In [14]:
data.shape

(17518, 262)

In [15]:
data

DOLocationID,Date,Hour,Min,1,2,3,4,5,6,7,...,254,255,256,257,258,259,260,261,262,263
0,2018-01-01,0,0,0,0,0,1,0,0,3,...,0,2,3,0,0,1,2,0,4,6
1,2018-01-01,0,30,1,0,0,0,0,0,2,...,0,0,1,0,0,1,1,0,1,0
2,2018-01-01,1,0,0,0,1,1,0,0,1,...,0,1,1,0,1,0,1,0,0,1
3,2018-01-01,1,30,0,0,0,0,0,0,0,...,0,1,1,1,0,0,1,0,0,0
4,2018-01-01,2,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17513,2018-12-31,21,30,0,0,0,1,0,0,9,...,0,7,2,5,2,0,1,1,5,9
17514,2018-12-31,22,0,0,0,2,5,0,0,0,...,2,3,3,2,1,1,0,0,9,4
17515,2018-12-31,22,30,0,0,1,0,0,0,2,...,0,4,5,2,1,1,0,3,4,5
17516,2018-12-31,23,0,0,0,2,1,0,0,0,...,0,6,0,3,3,0,1,0,2,4


pcaData.to_csv('../../processedData/%spca%s.csv'%(hub.upper(),pca_comps),
               index=False)

In [16]:
externalDataDir = "/home/mingyi/Dropbox/UrbanTemporalNetworks/HongData/"+granularity+'/'
extFile = externalDataDir + hub.upper() + ".csv"

In [17]:
extDf = pd.read_csv(extFile)
print(extDf.shape)
extDf.head(2)

(17520, 48)


Unnamed: 0,Date,Hour,Min,arrival,fhv,yellow,vehicle,ifmon,iftue,ifwed,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01,0,0,3.0,263,174,437,1,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
1,2018-01-01,0,30,3.0,263,174,437,1,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0


In [18]:
extDf['Date'] = pd.to_datetime(extDf['Date'], yearfirst=True)
extDf['Date'] = extDf['Date'].dt.date
extDf['Date'] = pd.to_datetime(extDf['Date'])
extDf.head(2)

Unnamed: 0,Date,Hour,Min,arrival,fhv,yellow,vehicle,ifmon,iftue,ifwed,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01,0,0,3.0,263,174,437,1,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
1,2018-01-01,0,30,3.0,263,174,437,1,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0


In [19]:
selected_columns = ['Date', 'Hour', 'Min', 'arrival','maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd',
       'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow']

In [20]:
extDf = extDf[selected_columns]

In [21]:
lagColumns = [str(i) for i in range(1,pca_comps+1)]+['arrival']

DateColumns = ['Date','Hour','Min']

targetColumns = [str(i) for i in range(1,pca_comps+1)]

maxlag = int(12 * 60 / int(granularity.replace('Min','')))

In [29]:
CommR2List = []
EdgeR2List = []
residualDf_list = []
rawList = []
networkPrediction = pd.DataFrame()

for m in range(1,13):

    
    month_index  = pd.to_datetime(data.Date).dt.month == m

    dataset_train = data[~month_index]
    dataset_test = data[month_index]
    print("Train Size: ",dataset_train.shape)
    print("Test Size: ",dataset_test.shape)

    matrix_train = dataset_train.drop(columns=DateColumns).values.astype(np.float64)
    matrix_test = dataset_test.drop(columns=DateColumns).values.astype(np.float64)
    scaler, s_matrix_train = standardize(matrix_train)
    s_matrix_test = scaler.transform(matrix_test)
    
    pca_train_model,pca_data_train = getPCAFeatures(s_matrix_train,n=pca_comps)
    pca_data_test = pca_train_model.transform(s_matrix_test)
    pca_data_test = pd.DataFrame(pca_data_test)
    pca_data_test.columns = [str(i+1) for i in pca_data_test]
    
    for col in DateColumns:
        pca_data_train[col] = dataset_train[col].values
        pca_data_test[col] = dataset_test[col].values
    
    pca_data_train = pca_data_train.merge(extDf, on=['Date', 'Hour','Min'], how='left').sort_values(by=['Date','Hour','Min'])
    pca_data_train_lag = addLag(pca_data_train, maxlag, lagColumns)
    
    pca_data_test = pca_data_test.merge(extDf, on=['Date', 'Hour','Min'], how='left').sort_values(by=['Date','Hour','Min'])
    pca_data_test_lag = addLag(pca_data_test, maxlag, lagColumns)
    
    X_train = pca_data_train_lag.drop(targetColumns+DateColumns , axis = 1)
    X_test = pca_data_test_lag.drop(targetColumns+DateColumns , axis = 1)
    y_train = pca_data_train_lag[targetColumns]
    y_test = pca_data_test_lag[targetColumns]
    
    
    
    
    val_size = int(matrix_train.shape[0]*0.8)
    val_fold = list(-1*np.ones(X_train.shape[0]-val_size)) + list(np.zeros(val_size))
    ps = PredefinedSplit(val_fold)
    param_grid = [{
        "n_estimators": np.arange(10, 500, 50),
        "min_samples_split": np.arange(2, 50, 20),
        'min_samples_leaf': np.arange(2, 50, 20), 
        'max_features': ['sqrt'],
        'max_depth': np.arange(10, 50, 10),
    }]
    # fit_inverse_transform=True to make sure inverse transform available
    rf = RandomForestRegressor(random_state = 2019) 
    rf_grid_search = GridSearchCV(rf, param_grid, cv=ps, scoring='r2')
    rf_grid_search.fit(X_train, y_train)

    print("Train R2: ",rf_grid_search.best_estimator_.score(X_train.values,y_train))
    test_r2 = rf_grid_search.best_estimator_.score(X_test.values,y_test)
    print("Test R2: ",test_r2)


    pca_prediction = rf_grid_search.best_estimator_.predict(X_test)

    residual = y_test - pca_prediction
    residual_df = dataset_test[['Date','Hour','Min']]
    residual_df = pd.concat([residual_df,pd.DataFrame(residual)], axis =1)

    network_prediction = inverse_pca(pca_prediction,pca_train_model)

    network_prediction = inverse_standardize(network_prediction, scaler)
    
    # relu to convert all prediction to positive
#     network_prediction = np.log(1+np.e**network_prediction)
    # round up negative values to 0
#     network_prediction = np.where(network_prediction<0,0,network_prediction)
    network_prediction_df = pd.DataFrame(network_prediction)
    network_prediction_df.columns = data.drop(columns=DateColumns).columns
    networkPrediction = pd.concat([networkPrediction,network_prediction_df])
    edgeMonthIndex = [False] * maxlag + list(month_index)
    edge_r2 = r2_score(dataset_test[maxlag:].drop(columns=DateColumns).values, network_prediction )
    print("Edge R2: ",edge_r2)


    CommR2List.append(test_r2)
    EdgeR2List.append(edge_r2)
    residualDf_list.append(residual_df)
#     rawList.append()

Train Size:  (16030, 262)
Test Size:  (1488, 262)
Train R2:  0.9344777370114292
Test R2:  0.7189529845387146
Edge R2:  0.16044491206191738
Train Size:  (16174, 262)
Test Size:  (1344, 262)
Train R2:  0.9338472388564307
Test R2:  0.7302057179120036
Edge R2:  0.1583149831361268
Train Size:  (16032, 262)
Test Size:  (1486, 262)
Train R2:  0.9338292599244391
Test R2:  0.7851810930977016
Edge R2:  0.17476633401491998
Train Size:  (16078, 262)
Test Size:  (1440, 262)
Train R2:  0.9346457801754023
Test R2:  0.7577618811209649
Edge R2:  0.17336723673842092
Train Size:  (16030, 262)
Test Size:  (1488, 262)
Train R2:  0.9371115480947197
Test R2:  0.7852148477790241
Edge R2:  0.18455530935902686
Train Size:  (16078, 262)
Test Size:  (1440, 262)
Train R2:  0.937086903391866
Test R2:  0.7868260769649087
Edge R2:  0.17415814020365042
Train Size:  (16030, 262)
Test Size:  (1488, 262)
Train R2:  0.9374055296199101
Test R2:  0.7582997517493687
Edge R2:  0.1675862560311437
Train Size:  (16030, 262)
Test

In [30]:
data['day'] = data['Date'].astype(str).apply(lambda x: x.split('-')[-1])
indexNames = data.loc[(data['day']=='01')&(data['Hour']<12)].index
dataPredictMatch = data.drop(indexNames)

In [31]:
networkPrediction['Date'] = dataPredictMatch.reset_index()['Date'].values
networkPrediction['Hour'] = dataPredictMatch.reset_index()['Hour'].values
networkPrediction.to_csv('/home/mingyi/Dropbox/UrbanTemporalNetworks/prediction/%sPCA%sComp%s.csv'%(hub,pca_comps,granularity),index=False)

In [32]:
print(np.mean(CommR2List))
print(np.mean(EdgeR2List))

0.763761794030693
0.17482511043133228


In [45]:
res_df = pd.concat(residualDf_list, axis = 0)
res_df.to_csv('../../Resid/%sPCA%sComp%s.csv'%(hub,pca_comps,granularity),index=False)