In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor
import warnings
warnings.filterwarnings('ignore')

### Helper Functions

In [2]:
def loadData(file):
    data = pd.read_csv(file)
    print('Raw shape: ',data.shape)
#     data['Date'] = pd.to_datetime(data.Date)
    print('Days: ',len(set(data.Date)))
    return data

In [3]:
def getTimeSeries(df,freq):
    table = pd.pivot_table(df, values='vehicle_count', index=['Date','Hour','Min'],
                    columns=['DOLocationID'], aggfunc=np.sum, fill_value=0)
    time = pd.date_range('2018-01-01', '2018-12-31',freq=freq)
    time = pd.DataFrame(time, columns=['time'])
    
    time['Date'] = time['time'].dt.date.astype('str')
    time['Hour'] = time['time'].dt.hour.astype('int')
    time['Min'] = time['time'].dt.minute.astype('int')
    del time['time']

    table = table.merge(time, on=['Date','Hour','Min'], how='right')
    table.fillna(0, inplace=True)
    return table

In [4]:
def zscoreNormalizeSpatial(matrix):
    m = matrix.copy()
    for i in range(m.shape[0]):
        m[i, :] = (m[i, :] - m[i, :].mean()) / (m[i, :].std()+1e-10)
        
    return m

In [5]:
def standardize(matrix):
    m = matrix.copy()
    scaler = StandardScaler()
    scaler.fit(m)
    t = scaler.transform(m)
    return scaler, t

In [6]:
def addLag(dataset, maxlag, lagColumns):
    dataset_list = [dataset]

    for l in range(1, maxlag+1):
        df = dataset.shift(l)
        df = df[lagColumns]
        df.columns = [c+'_lag_'+str(l) for c in df.columns]
        dataset_list.append(df)

    dataset = pd.concat(dataset_list, axis=1)
    dataset = dataset.iloc[maxlag:]
    dataset = dataset.fillna(0)
    return dataset

### Community Level Aggregation

In [7]:
def communityAggregation(rawdata,community=6):
    filePath = rootDir + 'Data/ZonetoComm.csv'
    zones = pd.read_csv(filePath)
    if community == 6:
        zones['start_community'] = zones.start_community.astype(int)
    elif community == 24:
        zones['start_community'] = zones.start_community.astype(str)
    zontoComm = dict(zip(zones.start_id.values,zones.start_community.values))
    agg_data = rawdata.copy(deep=True)
    agg_data['DOLocationID'] = agg_data['DOLocationID'].apply(lambda x:zontoComm[x])
    agg_data = getTimeSeries(agg_data)
    agg_data = agg_data.reset_index()
    zone_weights = get_weights(rawdata, zontoComm)
    targetColumns = sorted(zones['start_community'].unique().tolist())
    lagColumns = targetColumns + ['arrival']
    DateColumns = ['Date']
    return (agg_data, zone_weights,lagColumns,targetColumns,DateColumns)

In [8]:
def get_weights(rawdata, zontoBorough):
    
    rawdata['Borough'] = rawdata['DOLocationID'].apply(lambda x:zontoBorough[x])
    
    borough_df = rawdata[['vehicle_count','Borough']].groupby(by='Borough').sum().reset_index()

    zone_df = rawdata[['vehicle_count','DOLocationID']].groupby(by='DOLocationID').sum().reset_index()

    zone_df['Borough'] = zone_df['DOLocationID'].apply(lambda x:zontoBorough[x])

    zone_df = pd.merge(borough_df, zone_df, on=['Borough'], how='inner')

    zone_df['zone_weight'] = zone_df.vehicle_count_y / zone_df.vehicle_count_x

    zone_df = zone_df[['Borough', 'DOLocationID', 'zone_weight']]

    return zone_df

### Merge External Data Features

In [9]:
def externalFeatures(hub,agg_data, maxlag, lagColumns):
    externalDataDir = rootDir+'HongData/'
    extFile = externalDataDir + hub.upper() + ".csv"
    extDf = pd.read_csv(extFile)
    extDf['date'] = pd.to_datetime(extDf['date'], yearfirst=True)
    extDf['Hour'] = extDf['date'].dt.hour
    extDf['Dow'] = extDf['date'].dt.dayofweek
    extDf['Date'] = extDf['date'].dt.date
    extDf['Min'] = extDf['date'].dt.minute
    selected_columns = ['Date', 'Hour', 'Dow', 'arrival','maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd',
       'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow']
    extDf = extDf[selected_columns]
    agg_data['Date'] = pd.to_datetime(agg_data['Date'])
    extDf['Date'] = pd.to_datetime(extDf['Date'])
    agg_data = pd.merge(agg_data,extDf, on=['Date', 'Hour'], how='inner')
    agg_data['Date'] = agg_data['Date'].dt.date
    agg_data_lag = addLag(agg_data, maxlag, lagColumns)
    return agg_data_lag

In [10]:
hub = 'LGA'
granularity = 15
tune_hyp_params = False

In [11]:
rootDir = '/home/mingyi/Dropbox/UrbanTemporalNetworks/'
dataDir = rootDir + 'processedData/'

file = dataDir + hub + 'VehicleBy'+str(granularity)+'Min.csv'

In [12]:
rawdata = loadData(file)

Raw shape:  (2201234, 5)
Days:  365


In [13]:
freq = str(granularity)+'min'
edge_data = getTimeSeries(rawdata,freq=freq)
edge_data = edge_data.set_index(['Date','Hour','Min'])
edge_data.sort_values(by=['Date','Hour','Min'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,1,2,3,4,5,6,7,8,9,10,...,254,255,256,257,258,259,260,261,262,263
Date,Hour,Min,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2018-01-01,0,0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0
2018-01-01,0,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2018-01-01,0,30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
2018-01-01,0,45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2018-01-01,1,0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-30,23,0,0.0,0.0,0.0,2.0,0.0,0.0,6.0,0.0,0.0,0.0,...,0.0,1.0,5.0,0.0,1.0,2.0,1.0,0.0,2.0,4.0
2018-12-30,23,15,0.0,0.0,0.0,6.0,0.0,0.0,5.0,0.0,0.0,0.0,...,1.0,11.0,1.0,0.0,1.0,0.0,4.0,6.0,8.0,12.0
2018-12-30,23,30,0.0,0.0,0.0,1.0,0.0,0.0,6.0,0.0,0.0,1.0,...,0.0,2.0,8.0,1.0,0.0,1.0,0.0,0.0,1.0,9.0
2018-12-30,23,45,0.0,0.0,2.0,0.0,0.0,0.0,9.0,0.0,1.0,0.0,...,0.0,6.0,1.0,1.0,0.0,0.0,2.0,1.0,3.0,0.0


In [15]:
freq = str(granularity)+'min'
comm = 24
maxlag = 24
zones = pd.read_csv('/home/mingyi/Dropbox/UrbanTemporalNetworks/Data/ZonetoComm.csv')
if comm == 6:
    zones['start_community'] = zones.start_community.astype(int)
elif comm == 24:
    zones['start_community'] = zones.start_community.astype(str)
zontoComm = dict(zip(zones.start_id.values,zones.start_community.values))

comm_data = rawdata.copy(deep=True)
comm_data['DOLocationID'] = comm_data['DOLocationID'].apply(lambda x:zontoComm[x])

comm_data = getTimeSeries(comm_data, '15min')
comm_data = comm_data.reset_index()
zone_weights = get_weights(rawdata, zontoComm)

In [26]:
freq = str(granularity)+'min'
comm = 24
maxlag = 24
zones = pd.read_csv('/home/mingyi/Dropbox/UrbanTemporalNetworks/Data/ZonetoComm.csv')
if comm == 6:
    zones['start_community'] = zones.start_community.astype(int)
elif comm == 24:
    zones['start_community'] = zones.start_community.astype(str)
zontoComm = dict(zip(zones.start_id.values,zones.start_community.values))

comm_data = rawdata.copy(deep=True)
comm_data['DOLocationID'] = comm_data['DOLocationID'].apply(lambda x:zontoComm[x])

comm_data = getTimeSeries(comm_data, '15min')
comm_data = comm_data.reset_index()
zone_weights = get_weights(rawdata, zontoComm)

externalDataDir = "/HongData/"+str(granularity)+'min/'
extFile = rootDir + externalDataDir + hub.upper() + ".csv"
extDf = pd.read_csv(extFile)
# extDf['date'] = pd.to_datetime(extDf['date'], yearfirst=True)
extDf['Hour'] = extDf['Hour'].astype('int')
extDf['Min'] = extDf['Min'].astype('int')

selected_columns = ['Date', 'Hour','Min', 'Dow', 'arrival','maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd',
       'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow']



extDf = extDf[selected_columns]

comm_data = pd.merge(comm_data,extDf, on=['Date', 'Hour','Min'], how='inner')
comm_data = comm_data.sort_values(by=['Date','Hour','Min'])
if comm == 6:
    lagColumns = ['0','1', '2', '3', '4', '5','arrival']
else:
    lagColumns = ['0.0', '0.1', '0.2', '1.0', '1.1', '1.2', '1.3', '2.0',
       '2.1', '2.2', '2.3', '3.0', '3.1', '3.2', '4.0', '4.1', '4.2', '4.3',
       '4.4', '4.5', '5.0', '5.1', '5.2', '5.3','arrival']
targetColumns = [col for col in lagColumns if col!='arrival']
DateColumns = ['Date','Hour','Min']
comm_data_lag = addLag(comm_data, maxlag, lagColumns)
comm_data_lag = comm_data_lag.sort_values(by=['Date','Hour','Min'])

maxlag = 12
agg_data, zone_weights, lagColumns,targetColumns,DateColumns = communityAggregation(rawdata,community=24)
comm_data_lag = externalFeatures(hub,agg_data, maxlag, lagColumns)

### Train Test split

In [None]:
CommR2List = []
EdgeR2List = []
# residualDf_list = []
networkPrediction = pd.DataFrame()

for m in range(1,13):
    print()
    print("month: ",m)
    month_index  = pd.to_datetime(comm_data_lag.Date).dt.month == m

    dataset_train = comm_data_lag[~month_index]
    dataset_test = comm_data_lag[month_index]
    print("Train Size: ",dataset_train.shape)
    print("Test Size: ",dataset_test.shape)

    edgeMonthIndex = [False] * maxlag + list(month_index)
    edge_testData = edge_data[edgeMonthIndex]
    select_cols = [c for c in edge_testData.columns if c not in ['Date','Hour']]
    edge_testData = edge_testData[select_cols]
    print("edge test data shape: ",edge_testData.shape)


    X_train = dataset_train.drop(targetColumns+DateColumns , axis = 1)
    X_test = dataset_test.drop(targetColumns+DateColumns , axis = 1)
    y_train = dataset_train[targetColumns]
    y_test = dataset_test[targetColumns]

    rf2 = RandomForestRegressor(random_state = 2019, n_estimators=150, 
                               min_samples_split=3,
                               min_samples_leaf= 2, 
                               max_features= 'sqrt',
                               max_depth= None, 
                               bootstrap= False)

    rf2.fit(X_train,y_train)

    print("Train R2: ",rf2.score(X_train,y_train))
    test_r2 = rf2.score(X_test,y_test)
    print("Test R2: ",test_r2)


    comm_prediction = rf2.predict(X_test)
    edge_prediction_df = pd.DataFrame(comm_prediction)
    edge_prediction_df.columns = y_test.columns

    residual = y_test - comm_prediction
    residual_df = dataset_test[['Date','Hour']]
    residual_df = pd.concat([residual_df,pd.DataFrame(residual)], axis =1)

    boroughs = list(edge_prediction_df.columns)
    for bor in boroughs:
    #     print(bor)

        weight_df = zone_weights[zone_weights.Borough == bor]

    #     print(len(weight_df.DOLocationID))

        for b_zone,z_weight in zip(weight_df.DOLocationID.values,weight_df.zone_weight.values):        
            edge_prediction_df[b_zone] = edge_prediction_df[bor] * z_weight


    select_cols = [c for c in edge_prediction_df.columns if c not in boroughs]
    edge_prediction_df = edge_prediction_df[select_cols]


    edge_prediction_df = edge_prediction_df[edge_testData.columns]
    
    networkPrediction = pd.concat([networkPrediction,edge_prediction_df])
    edge_r2 = r2_score(edge_testData.values, edge_prediction_df.values, multioutput='variance_weighted')
    print("Edge R2: ",edge_r2)

    CommR2List.append(test_r2)
    EdgeR2List.append(edge_r2)
    residualDf_list.append(residual_df)


month:  1
Train Size:  (31969, 640)
Test Size:  (2952, 640)
edge test data shape:  (2952, 258)


In [234]:
networkPrediction = pd.DataFrame()
networkPredictionStd = pd.DataFrame()
PCAPredictedDF = pd.DataFrame()
for m in range(1,13):
    print()

    print("month: ",m)
    month_index  = pd.to_datetime(pcaData_lag.Date).dt.month == m

    dataset_train = pcaData_lag[~month_index]
    dataset_test = pcaData_lag[month_index]
    print("Train Size: ",dataset_train.shape)
    print("Test Size: ",dataset_test.shape)


    X_train = dataset_train.drop(targetColumns+DateColumns , axis = 1)
    X_test = dataset_test.drop(targetColumns+DateColumns , axis = 1)
    y_train = dataset_train[targetColumns]
    y_test = dataset_test[targetColumns]


    rf2 = RandomForestRegressor(random_state = 2019, n_estimators=150, 
                               min_samples_split=3,
                               min_samples_leaf= 2, 
                               max_features= 'sqrt',
                               max_depth= None, 
                               bootstrap= False)

    rf2.fit(X_train,y_train)
    
    PCAPredicted = rf2.predict(X_test)
    PCAPredictedDF = pd.concat([PCAPredictedDF,pd.DataFrame(PCAPredicted)])
    for no_tree in range(rf2.n_estimators):
        predict = rf2.estimators_[no_tree].predict(X_test)
        if no_tree == 0:
            predict_values = predict
        else:
            predict_values = np.vstack((predict_values,predict))

    predict_values_inversePCA = inverse_pca(predict_values,pca)
    predict_values_inverseStandard = inverse_standardize(predict_values_inversePCA, scaler)
    predict_values_inverseStandard_df = pd.DataFrame(predict_values_inverseStandard)
    predict_values_inverseStandard_df['Date'] = np.tile(dataset_test['Date'],rf2.n_estimators)
    predict_values_inverseStandard_df['Hour'] = np.tile(dataset_test['Hour'],rf2.n_estimators)
    predict_values_inverseStandard_df['Min'] = np.tile(dataset_test['Min'],rf2.n_estimators)
#     network_prediction_mean = predict_values_inverseStandard.mean(axis=1)
#     network_prediction_std = predict_values_inverseStandard.std(axis=1)
    network_prediction_mean_df = predict_values_inverseStandard_df.groupby(['Date','Hour','Min']).mean()
    network_prediction_std_df = predict_values_inverseStandard_df.groupby(['Date','Hour','Min']).std()
    
    network_prediction_mean_df.columns = [str(col) + '_mean' for col in edge_data.columns]
    networkPrediction = pd.concat([networkPrediction,network_prediction_mean_df])
    
    network_prediction_std_df.columns = [str(col) + '_std' for col in edge_data.columns]
    networkPredictionStd = pd.concat([networkPredictionStd,network_prediction_std_df])
    



month:  1
Train Size:  (15985, 639)
Test Size:  (1464, 639)

month:  2
Train Size:  (16105, 639)
Test Size:  (1344, 639)

month:  3
Train Size:  (15961, 639)
Test Size:  (1488, 639)

month:  4
Train Size:  (16009, 639)
Test Size:  (1440, 639)

month:  5
Train Size:  (15961, 639)
Test Size:  (1488, 639)

month:  6
Train Size:  (16009, 639)
Test Size:  (1440, 639)

month:  7
Train Size:  (15961, 639)
Test Size:  (1488, 639)

month:  8
Train Size:  (15961, 639)
Test Size:  (1488, 639)

month:  9
Train Size:  (16009, 639)
Test Size:  (1440, 639)

month:  10
Train Size:  (15961, 639)
Test Size:  (1488, 639)

month:  11
Train Size:  (16009, 639)
Test Size:  (1440, 639)

month:  12
Train Size:  (16008, 639)
Test Size:  (1441, 639)


In [235]:
# aggregated R2
r2_score(pcaData_lag[targetColumns], PCAPredictedDF, multioutput='variance_weighted')

0.6521540440924711

In [224]:
networkPrediction.to_csv(rootDir+'/prediction/%sPCA'%hub+str(pca_comps)+'Mean'+
                         str(granularity)+'Min'+str(maxlag)+'lag.csv',index=False)

In [225]:
networkPredictionStd.to_csv(rootDir+'/prediction/%sPCA'%hub+str(pca_comps)+'Std'+
                            str(granularity)+'Min'+str(maxlag)+'lag.csv',index=False)

In [236]:
r2_score(edge_data.iloc[maxlag:], networkPrediction, multioutput='variance_weighted')

0.673448837970505

In [210]:
networkPrediction

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0_mean,1_mean,2_mean,3_mean,4_mean,5_mean,6_mean,7_mean,8_mean,9_mean,...,256_mean,257_mean,258_mean,259_mean,260_mean,261_mean,262_mean,263_mean,264_mean,265_mean
Date,Hour,Min,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2018-01-01,6,0,0.017662,0.251026,-0.000107,0.101448,0.586587,0.019011,0.016041,1.587028,0.009788,0.202507,...,1.355020,0.423774,0.507407,0.140172,0.436238,0.837516,1.493704,1.971721,0.229733,21.341498
2018-01-01,6,15,0.017839,0.262316,-0.000002,0.096212,0.578203,0.018456,0.017548,1.548158,0.009548,0.193304,...,1.340479,0.414223,0.477188,0.134107,0.421787,0.892589,1.476419,1.944990,0.241409,21.004804
2018-01-01,6,30,0.018468,0.266052,0.000081,0.088442,0.535803,0.018882,0.020040,1.443214,0.009287,0.178794,...,1.239302,0.379203,0.442348,0.121607,0.393708,0.836771,1.357219,1.793928,0.241458,19.439839
2018-01-01,6,45,0.018353,0.236522,0.000302,0.079399,0.438120,0.013203,0.016620,1.201683,0.006795,0.149416,...,1.026139,0.314855,0.421064,0.106670,0.352381,0.755024,1.104780,1.443657,0.206547,16.853532
2018-01-01,7,0,0.017700,0.274786,0.000275,0.091593,0.503290,0.016807,0.022940,1.380106,0.008262,0.178308,...,1.179243,0.359990,0.475349,0.122300,0.395501,0.869601,1.254781,1.646822,0.246688,18.816239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-30,23,0,0.022957,0.180919,0.000485,0.165186,1.386548,0.007888,0.039228,3.153349,0.016357,0.347246,...,3.166451,0.863148,1.005159,0.220152,0.734949,1.709667,2.997537,4.113963,0.262693,35.458590
2018-12-30,23,15,0.022963,0.217225,0.000437,0.178056,1.518671,0.011552,0.045846,3.454453,0.018654,0.382558,...,3.468031,0.945388,1.060805,0.239188,0.791983,1.903305,3.302366,4.532938,0.309564,38.698505
2018-12-30,23,30,0.022407,0.219712,0.000402,0.180470,1.491726,0.012110,0.045439,3.419003,0.018429,0.384830,...,3.408581,0.933979,1.071634,0.241237,0.793314,1.860946,3.245837,4.451902,0.307307,38.403394
2018-12-30,23,45,0.022211,0.208126,0.000415,0.180145,1.465121,0.010822,0.043686,3.363481,0.017804,0.380367,...,3.349450,0.919150,1.076603,0.239807,0.787455,1.816628,3.179967,4.360025,0.293550,37.830441


In [223]:
predictedValueFromRF = inverse_pca(PCAPredictedDF.values,pca)
predictedValueFromRF = inverse_standardize(predictedValueFromRF, scaler)
r2_score(edge_data.iloc[maxlag:], predictedValueFromRF, multioutput='variance_weighted')


0.6850171408413117

In [222]:
'/prediction/%sPCA'%hub+str(pca_comps)+'Std'+str(granularity)+'Min'+str(maxlag)+'lag.csv'

'/prediction/LGAPCA5Std30Min24lag.csv'