In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor
import warnings
warnings.filterwarnings('ignore')

### Helper Functions

In [94]:
def loadData(file):
    """Read an hourly vehicle-count CSV and parse its ``Date`` column.

    Parameters
    ----------
    file : str or file-like
        Anything ``pd.read_csv`` accepts; the table must contain a ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        The loaded table with ``Date`` converted by ``pd.to_datetime``.

    The raw shape and the number of distinct dates are printed as a quick
    sanity check.
    """
    frame = pd.read_csv(file)
    print('Raw shape: ',frame.shape)

    frame['Date'] = pd.to_datetime(frame['Date'])
    distinct_days = frame['Date'].unique()
    print('Days: ',len(distinct_days))

    return frame

In [95]:
def getTimeSeries(df):
    """Pivot long-format counts into a (Date, Hour) x drop-off-zone table.

    Parameters
    ----------
    df : pandas.DataFrame
        Long table with columns ``Date``, ``Hour``, ``DOLocationID`` and
        ``vehicle_count``.

    Returns
    -------
    pandas.DataFrame
        Indexed by (``Date``, ``Hour``) with one column per ``DOLocationID``.
        Each cell is the summed ``vehicle_count``; combinations absent from
        ``df`` are filled with 0.
    """
    # The string 'sum' selects the same aggregation as np.sum, without the
    # deprecation warning recent pandas versions emit for NumPy callables.
    table = pd.pivot_table(
        df,
        values='vehicle_count',
        index=['Date', 'Hour'],
        columns=['DOLocationID'],
        aggfunc='sum',
        fill_value=0,
    )
    return table

In [96]:
def zscoreNormalizeSpatial(matrix):
    """Z-score every row of a 2-D array across its columns.

    Each row has its own mean subtracted and is divided by its own
    (population) standard deviation; ``1e-10`` is added to the denominator so
    constant rows yield zeros instead of a division by zero.

    Parameters
    ----------
    matrix : array-like, shape (n_rows, n_cols)
        Input values. The input is never modified.

    Returns
    -------
    numpy.ndarray
        Floating-point array of the same shape holding the row-wise z-scores.
    """
    m = np.asarray(matrix)
    # Work in floating point: assigning z-scores back into an integer array
    # would silently truncate them (e.g. -1.22 -> -1).
    if not np.issubdtype(m.dtype, np.floating):
        m = m.astype(float)

    row_mean = m.mean(axis=1, keepdims=True)
    row_std = m.std(axis=1, keepdims=True)
    return (m - row_mean) / (row_std + 1e-10)

In [97]:
def standardize(matrix):
    """Fit a ``StandardScaler`` on ``matrix`` and return it with the scaled data.

    Parameters
    ----------
    matrix : object with ``.copy()``
        Data handed to ``StandardScaler``; a copy is used so the caller's
        object is not touched.

    Returns
    -------
    tuple
        ``(scaler, transformed)``: the fitted scaler and the result of
        ``scaler.transform`` applied to the same data.
    """
    data = matrix.copy()
    fitted = StandardScaler().fit(data)
    return fitted, fitted.transform(data)

In [98]:
def inverse_standardize(matrix, scaler):
    """Undo a scaler's transformation.

    Parameters
    ----------
    matrix : object with ``.copy()``
        Scaled values; a copy is passed to the scaler.
    scaler : object
        Anything exposing ``inverse_transform``.

    Returns
    -------
    Whatever ``scaler.inverse_transform`` returns for the copied input.
    """
    working_copy = matrix.copy()
    restored = scaler.inverse_transform(working_copy)
    return restored

In [99]:
def addLag(dataset, maxlag, lagColumns):
    """Append lagged copies of selected columns to ``dataset``.

    For every lag ``l`` in ``1..maxlag`` the columns in ``lagColumns`` are
    shifted down by ``l`` rows and added under the name ``<column>_lag_<l>``.
    Rows containing any missing value after the concatenation are dropped.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source table; rows are shifted in their existing order.
    maxlag : int
        Largest lag to generate.
    lagColumns : list of str
        Columns to lag (names must be strings, since the suffix is
        concatenated onto them).

    Returns
    -------
    pandas.DataFrame
        ``dataset`` plus the lag columns, without rows that contain NaN.
    """
    lagged_frames = [
        dataset.shift(lag)[lagColumns].rename(
            columns=lambda name, lag=lag: name + '_lag_' + str(lag)
        )
        for lag in range(1, maxlag + 1)
    ]
    combined = pd.concat([dataset] + lagged_frames, axis=1)
    return combined.dropna()

In [100]:
def get_rmse(matrix1, matrix2):
    """Root-mean-square error between two equally shaped inputs.

    The element-wise difference is squared, averaged with ``np.mean`` and the
    square root of that average is returned.
    """
    difference = matrix1 - matrix2
    mean_squared_error = np.mean(difference ** 2)
    return mean_squared_error ** 0.5

In [101]:
def get_weights(rawdata, zontoBorough):
    """Compute each zone's share of its borough's total vehicle count.

    Parameters
    ----------
    rawdata : pandas.DataFrame
        Must contain ``DOLocationID`` and ``vehicle_count``. It is not
        modified (no ``Borough`` column is added to the caller's frame).
    zontoBorough : mapping
        Maps every ``DOLocationID`` to a borough label; a missing zone raises
        ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Borough``, ``DOLocationID`` and ``zone_weight``, where
        ``zone_weight`` is the zone total divided by the borough total.
    """
    # Work on a local copy so the borough label never leaks into the caller's data.
    counts = rawdata[['vehicle_count', 'DOLocationID']].copy()
    counts['Borough'] = counts['DOLocationID'].apply(lambda x: zontoBorough[x])

    borough_df = counts[['vehicle_count', 'Borough']].groupby(by='Borough').sum().reset_index()

    zone_df = counts[['vehicle_count', 'DOLocationID']].groupby(by='DOLocationID').sum().reset_index()
    zone_df['Borough'] = zone_df['DOLocationID'].apply(lambda x: zontoBorough[x])

    # Explicit suffixes make it clear which total is the numerator and which the denominator.
    merged = pd.merge(borough_df, zone_df, on=['Borough'], how='inner',
                      suffixes=('_borough', '_zone'))
    merged['zone_weight'] = merged['vehicle_count_zone'] / merged['vehicle_count_borough']

    return merged[['Borough', 'DOLocationID', 'zone_weight']]

### Load Raw Data

In [141]:
# Airport hub to model; the value is spliced into the input and output file names used below.
hub = 'LGA'
# Hyper-parameter tuning switch.
# NOTE(review): no visible cell reads this flag — confirm it is still needed.
tune_hyp_params = True

In [142]:
# Folder with the pre-aggregated hourly counts, and the file belonging to the selected hub.
dataDir = '/home/mingyi/Dropbox/UrbanTemporalNetworks/processedData/'
file = '{}{}VehicleByHour.csv'.format(dataDir, hub)

In [143]:
# Load the hub's hourly counts; loadData also prints the raw shape and the number of distinct days.
rawdata = loadData(file)

Raw shape:  (2251320, 4)
Days:  365


In [144]:
# Peek at the first two raw rows.
rawdata.head(2)

Unnamed: 0,DOLocationID,Date,Hour,vehicle_count
0,1,2018-01-01,0,0.0
1,2,2018-01-01,0,0.0


In [145]:
# Pivot to one row per (Date, Hour) with one column per drop-off zone, then flatten the index.
edge_data = getTimeSeries(rawdata).reset_index()

# Column labels become strings; addLag later concatenates a '_lag_<n>' suffix onto them.
edge_data.columns = list(map(str, edge_data.columns))

In [146]:
# Inspect the first rows of the pivoted zone-by-hour table.
edge_data.head()

Unnamed: 0,Date,Hour,1,2,3,4,5,6,7,8,...,254,255,256,257,258,259,260,261,262,263
0,2018-01-01,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,2,1,0,4,1
1,2018-01-01,1,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,1
2,2018-01-01,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2018-01-01,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2018-01-01,4,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Merge External Data Features

In [147]:
# External (non-taxi) features are stored in one CSV per hub, named with the upper-cased hub code.
externalDataDir = "/home/mingyi/Dropbox/UrbanTemporalNetworks/HongData/"
extFile = "{}{}.csv".format(externalDataDir, hub.upper())

In [148]:
# Load the external hourly features for this hub, then show their size and first two rows.
extDf = pd.read_csv(extFile)
print(extDf.shape)
extDf.head(2)

(8760, 46)


Unnamed: 0,date,arrival,fhv,yellow,vehicle,ifmon,iftue,ifwed,ifthu,iffri,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,18/1/1 0:00,3,89,67,156,1,0,0,0,0,...,19,8,13.5,-20.5,51,0,0.0,0.0,0,0
1,18/1/1 1:00,0,17,8,25,1,0,0,0,0,...,19,8,13.5,-20.5,51,0,0.0,0.0,0,0


In [149]:
# Parse the timestamp strings (raw values look like '18/1/1 0:00'); yearfirst=True makes
# pandas read the leading field as the year.
extDf['date'] = pd.to_datetime(extDf['date'], yearfirst=True)
extDf.head(2)

Unnamed: 0,date,arrival,fhv,yellow,vehicle,ifmon,iftue,ifwed,ifthu,iffri,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01 00:00:00,3,89,67,156,1,0,0,0,0,...,19,8,13.5,-20.5,51,0,0.0,0.0,0,0
1,2018-01-01 01:00:00,0,17,8,25,1,0,0,0,0,...,19,8,13.5,-20.5,51,0,0.0,0.0,0,0


In [150]:
# Sanity check: earliest and latest timestamps covered by the external data.
min(extDf.date), max(extDf.date)

(Timestamp('2018-01-01 00:00:00'), Timestamp('2018-12-31 23:00:00'))

In [151]:
# Derive the merge keys (Date, Hour) and a day-of-week feature from the parsed timestamp.
extDf = extDf.assign(
    Hour=extDf['date'].dt.hour,
    Dow=extDf['date'].dt.dayofweek,
    Date=extDf['date'].dt.date,
)

In [152]:
# List every available external column before choosing the subset to keep.
extDf.columns

Index(['date', 'arrival', 'fhv', 'yellow', 'vehicle', 'ifmon', 'iftue',
       'ifwed', 'ifthu', 'iffri', 'ifsat', 'ifsun', 'if0', 'if1', 'if2', 'if3',
       'if4', 'if5', 'if6', 'if7', 'if8', 'if9', 'if10', 'if11', 'if12',
       'if13', 'if14', 'if15', 'if16', 'if17', 'if18', 'if19', 'if20', 'if21',
       'if22', 'if23', 'maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd',
       'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow', 'Hour', 'Dow',
       'Date'],
      dtype='object')

In [153]:
# Merge keys first, then the external features carried into the model.
selected_columns = [
    'Date', 'Hour', 'Dow',
    'arrival',
    'maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd', 'cdd',
    'participation', 'newsnow', 'snowdepth', 'ifSnow',
]

In [154]:
# Keep only the selected columns. An explicit copy is taken because a later cell assigns a new
# 'Date' column into this frame; writing into a plain slice is a chained-assignment hazard.
extDf = extDf[selected_columns].copy()

In [155]:
# Compare the sizes of the two tables before merging them.
print(edge_data.shape)
print(extDf.shape)

(8760, 259)
(8760, 14)


In [156]:
# Convert both Date columns with pd.to_datetime before they are used as merge keys.
edge_data['Date'] = pd.to_datetime(edge_data['Date'])
extDf['Date'] = pd.to_datetime(extDf['Date'])

In [157]:
# Attach the external features to every (Date, Hour) row of the zone table.
edge_data = edge_data.merge(extDf, how='inner', on=['Date', 'Hour'])
print(edge_data.shape)

# Store Date as plain date objects again after the merge.
edge_data['Date'] = edge_data['Date'].dt.date
edge_data.head()

(8760, 271)


Unnamed: 0,Date,Hour,1,2,3,4,5,6,7,8,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01,0,0,0,0,0,0,0,1,0,...,19,8,13.5,-20.5,51,0,0.0,0.0,0,0
1,2018-01-01,1,0,0,0,0,0,0,2,0,...,19,8,13.5,-20.5,51,0,0.0,0.0,0,0
2,2018-01-01,2,0,0,0,0,0,0,0,0,...,19,8,13.5,-20.5,51,0,0.0,0.0,0,0
3,2018-01-01,3,0,0,0,0,0,0,0,0,...,19,8,13.5,-20.5,51,0,0.0,0.0,0,0
4,2018-01-01,4,1,0,0,0,0,0,0,0,...,19,8,13.5,-20.5,51,0,0.0,0.0,0,0


In [158]:
# Confirm the merged column set: keys, zone columns, then external features.
edge_data.columns

Index(['Date', 'Hour', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       'maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd', 'cdd',
       'participation', 'newsnow', 'snowdepth', 'ifSnow'],
      dtype='object', length=271)

In [159]:
DateColumns = ['Date']

ext_columns = [
    'Dow', 'arrival',
    'maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd', 'cdd',
    'participation', 'newsnow', 'snowdepth', 'ifSnow',
]

# Targets are whatever remains once date, hour and external feature columns are excluded,
# i.e. the per-zone count columns.
non_target = set(ext_columns) | set(DateColumns) | {'Hour'}
targetColumns = [c for c in edge_data.columns if c not in non_target]

# Lag every target plus the 'arrival' feature.
lagColumns = [*targetColumns, 'arrival']

In [160]:
# Number of previous rows (hours) whose values are appended as lag features.
maxlag = 12

# Adds maxlag shifted copies of every column in lagColumns; rows left with NaN are dropped.
edge_data_lag = addLag(edge_data, maxlag, lagColumns)

edge_data_lag.shape

(8748, 3367)

In [161]:
# Leave-one-month-out evaluation: for each calendar month, fit a random forest on the other
# eleven months and predict every target column for the held-out month.
CommR2List = []        # per-month R2 reported by rf2.score on the held-out month
EdgeR2List = []        # per-month variance-weighted R2 over all target columns
residualDf_list = []   # per-month frames: Date, Hour and (actual - predicted) per target
prediction_frames = [] # per-month prediction frames, concatenated once after the loop

# The month of each row does not change between iterations, so derive it once.
row_month = pd.to_datetime(edge_data_lag.Date).dt.month

for m in range(1,13):
    print()
    print("month: ",m)
    month_index = row_month == m

    dataset_train = edge_data_lag[~month_index]
    dataset_test = edge_data_lag[month_index]
    print("Train Size: ",dataset_train.shape)
    print("Test Size: ",dataset_test.shape)

    # Features are everything except the current-hour targets and the Date column.
    X_train = dataset_train.drop(targetColumns+DateColumns , axis = 1)
    X_test = dataset_test.drop(targetColumns+DateColumns , axis = 1)
    y_train = dataset_train[targetColumns]
    y_test = dataset_test[targetColumns]

    rf2 = RandomForestRegressor(random_state = 2019, n_estimators=150,
                               min_samples_split=3,
                               min_samples_leaf= 2,
                               max_features= 'sqrt',
                               max_depth= None,
                               bootstrap= False)

    rf2.fit(X_train,y_train)

    print("Train R2: ",rf2.score(X_train,y_train))
    test_r2 = rf2.score(X_test,y_test)
    print("Test R2: ",test_r2)

    prediction = rf2.predict(X_test)
    # Collect now and concatenate once at the end; growing a DataFrame with pd.concat on
    # every iteration re-copies all earlier rows each time.
    prediction_frames.append(pd.DataFrame(prediction))

    # y_test is a DataFrame, so the residual keeps its index and column labels and lines up
    # with the Date/Hour columns when concatenated side by side.
    residual = y_test - prediction
    residual_df = pd.concat([dataset_test[['Date','Hour']], residual], axis =1)

    edge_r2 = r2_score(y_test, prediction, multioutput='variance_weighted')
    print("Edge R2: ",edge_r2)

    CommR2List.append(test_r2)
    EdgeR2List.append(edge_r2)
    residualDf_list.append(residual_df)

# Same content as the former incremental concat: months stacked in loop order, integer columns.
networkPrediction = pd.concat(prediction_frames)


month:  1
Train Size:  (8016, 3367)
Test Size:  (732, 3367)
Train R2:  0.9590862507390181
Test R2:  0.7398002487297443
Edge R2:  0.7398002487297443

month:  2
Train Size:  (8076, 3367)
Test Size:  (672, 3367)
Train R2:  0.9588404549745673
Test R2:  0.7627386680249988
Edge R2:  0.7627386680249988

month:  3
Train Size:  (8004, 3367)
Test Size:  (744, 3367)
Train R2:  0.9587355913308016
Test R2:  0.7795912378456176
Edge R2:  0.7795912378456176

month:  4
Train Size:  (8028, 3367)
Test Size:  (720, 3367)
Train R2:  0.9586372916030289
Test R2:  0.779856004133306
Edge R2:  0.779856004133306

month:  5
Train Size:  (8004, 3367)
Test Size:  (744, 3367)
Train R2:  0.9584610767540824
Test R2:  0.7901840196475141
Edge R2:  0.7901840196475141

month:  6
Train Size:  (8028, 3367)
Test Size:  (720, 3367)
Train R2:  0.9585303997089091
Test R2:  0.781856320475927
Edge R2:  0.781856320475927

month:  7
Train Size:  (8004, 3367)
Test Size:  (744, 3367)
Train R2:  0.9590460381121997
Test R2:  0.7535458

In [162]:
# Display the merged (un-lagged) frame for reference.
edge_data

Unnamed: 0,Date,Hour,1,2,3,4,5,6,7,8,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01,0,0,0,0,0,0,0,1,0,...,19,8,13.5,-20.5,51,0,0.00,0.0,0,0
1,2018-01-01,1,0,0,0,0,0,0,2,0,...,19,8,13.5,-20.5,51,0,0.00,0.0,0,0
2,2018-01-01,2,0,0,0,0,0,0,0,0,...,19,8,13.5,-20.5,51,0,0.00,0.0,0,0
3,2018-01-01,3,0,0,0,0,0,0,0,0,...,19,8,13.5,-20.5,51,0,0.00,0.0,0,0
4,2018-01-01,4,1,0,0,0,0,0,0,0,...,19,8,13.5,-20.5,51,0,0.00,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2018-12-31,19,0,0,1,0,0,0,8,0,...,47,35,41.0,6.8,24,0,1.31,0.0,0,0
8756,2018-12-31,20,0,0,0,2,0,0,12,0,...,47,35,41.0,6.8,24,0,1.31,0.0,0,0
8757,2018-12-31,21,0,0,1,1,0,0,18,0,...,47,35,41.0,6.8,24,0,1.31,0.0,0,0
8758,2018-12-31,22,0,0,0,0,0,0,14,1,...,47,35,41.0,6.8,24,0,1.31,0.0,0,0


In [163]:
# Label every prediction row with its date. The predictions were generated from edge_data_lag,
# one block per month in month order, so the dates are taken from that same frame instead of
# re-deriving them from edge_data with a hard-coded lag offset.
# NOTE(review): the positional assignment assumes edge_data_lag is in chronological order
# within a single year, so that month-by-month stacking preserves row order — confirm.
networkPrediction['Date'] = edge_data_lag['Date'].values
networkPrediction.to_csv('/home/mingyi/Dropbox/UrbanTemporalNetworks/prediction/%sEdgewise.csv'%hub,index=False)

In [139]:
# Display the stacked per-month predictions.
networkPrediction

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,248,249,250,251,252,253,254,255,256,257
0,1.231111,0.007778,0.226667,1.324444,0.016667,0.080000,3.943333,0.032222,0.443333,5.321111,...,0.341111,6.473333,3.676667,1.093333,1.592222,0.420000,1.253333,3.615556,3.515556,4.660000
1,1.760000,0.000000,0.293333,1.883333,0.055556,0.123333,5.536667,0.022222,0.682222,6.912222,...,0.418889,9.396667,5.138889,1.516667,1.817778,0.531111,1.613333,4.784444,5.223333,7.048889
2,1.852222,0.005556,0.337778,2.262222,0.045556,0.120000,5.915556,0.021111,0.738889,7.083333,...,0.517778,10.607778,5.726667,1.724444,1.738889,0.722222,1.542222,5.514444,5.672222,7.552222
3,1.550000,0.000000,0.328889,2.934444,0.053333,0.188889,7.065556,0.022222,0.787778,8.524444,...,0.521111,12.545556,7.376667,2.007778,1.753333,0.771111,1.922222,6.316667,6.911111,9.705556
4,1.562222,0.002222,0.417778,3.032222,0.038889,0.175556,7.008889,0.031111,0.854444,8.340000,...,0.543333,12.547778,7.590000,2.066667,1.943333,0.798889,1.706667,6.217778,7.057778,9.896667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,1.177778,0.002222,0.372222,2.476667,0.062222,0.094444,6.070000,0.034444,0.674444,8.725556,...,0.496667,8.503333,5.493333,1.570000,2.117778,0.596667,1.554444,3.724444,5.098889,6.933333
740,1.260000,0.000000,0.401111,3.340000,0.042222,0.131111,7.080000,0.040000,0.862222,9.676667,...,0.528889,11.907778,7.155556,1.944444,2.434444,0.628889,1.796667,4.696667,6.904444,9.343333
741,1.128889,0.003333,0.436667,2.967778,0.042222,0.133333,6.981111,0.018889,0.826667,9.470000,...,0.536667,10.681111,6.714444,1.948889,2.403333,0.583333,1.802222,4.217778,6.518889,8.955556
742,1.033333,0.003333,0.487778,2.665556,0.045556,0.097778,6.516667,0.033333,0.631111,9.535556,...,0.456667,9.477778,6.080000,1.853333,2.418889,0.592222,1.734444,3.701111,5.896667,7.750000


In [66]:
# Rebuild the raw zone-by-hour table for the hub selected in the configuration cell, so it has
# the same zone columns as the residuals it is compared with (the file name was previously
# fixed to JFK regardless of `hub`).
rawValue = pd.read_csv(dataDir + hub + 'VehicleByHour.csv')
rawValue = getTimeSeries(rawValue)
# Skip the first `maxlag` rows: they have no complete lag history and were dropped by addLag.
rawValue = rawValue.iloc[maxlag:]

In [67]:
# Flatten the (Date, Hour) index into columns and use string labels throughout.
rawValue = rawValue.reset_index()
rawValue.columns = list(map(str, rawValue.columns))

In [68]:
# Stack the per-month residual frames into one table and check its size.
res_df = pd.concat(residualDf_list, axis = 0)
print(res_df.shape)
res_df.head()

(8748, 260)


Unnamed: 0,Date,Hour,1,10,100,101,102,106,107,108,...,90,91,92,93,94,95,96,97,98,99
12,2018-01-01,12,-0.217778,-0.307778,-3.355556,0.341111,0.293333,-0.825556,3.883333,-0.497778,...,0.158889,-0.723333,-0.654444,2.502222,-0.096667,0.58,-0.083333,4.07,-0.911111,-0.006667
13,2018-01-01,13,-0.574444,0.243333,-1.045556,1.111111,-0.983333,-0.134444,1.901111,1.417778,...,4.341111,-1.188889,1.73,0.416667,-0.117778,-1.981111,-0.12,4.102222,-1.515556,-0.005556
14,2018-01-01,14,0.246667,-2.325556,-5.365556,1.931111,-1.085556,-1.304444,8.923333,-0.443333,...,4.944444,1.526667,-1.41,1.38,-0.148889,2.321111,0.89,3.497778,-0.722222,-0.004444
15,2018-01-01,15,1.306667,2.084444,-2.641111,-0.103333,-1.156667,2.561111,-2.141111,0.428889,...,5.97,-0.682222,-0.027778,-0.604444,-0.18,3.507778,-0.172222,3.798889,-0.67,-0.013333
16,2018-01-01,16,0.448889,0.614444,-2.744444,-1.033333,-0.21,-1.437778,3.762222,-0.552222,...,8.92,-0.516667,2.277778,0.375556,-0.154444,-0.677778,-0.121111,1.645556,-1.474444,0.0


In [69]:
# Put the residual columns in the same order as the raw table, then extract both as arrays.
valueCols = [col for col in rawValue.columns if col not in ['Date','Hour']]

res_df = res_df[rawValue.columns]
res_dfV = res_df[valueCols].values
# The raw values come from rawValue; they were previously read from res_df by mistake,
# which made rawValueV an exact duplicate of res_dfV.
rawValueV = rawValue[valueCols].values


In [33]:
# Average the per-month scores collected during the month-by-month evaluation.
print(np.mean(CommR2List))
print(np.mean(EdgeR2List))

0.5449130903193496
0.5449130903193496


In [37]:
dateCols = ['Date', 'Hour']
otherCols = [c for c in res_df.columns if c not in dateCols]

# Collapse the per-column residuals into a single summed residual per row.
res_df['residual'] = res_df[otherCols].to_numpy().sum(axis=1)

res_df = res_df.loc[:, dateCols+['residual']]
print(res_df.shape)

            Date  Hour    residual
12    2018-01-01    12   58.634444
13    2018-01-01    13  -51.757778
14    2018-01-01    14  119.351111
15    2018-01-01    15  105.615556
16    2018-01-01    16   84.226667
17    2018-01-01    17   55.403333
18    2018-01-01    18  -79.283333
19    2018-01-01    19  101.992222
20    2018-01-01    20 -194.812222
21    2018-01-01    21  340.793333
22    2018-01-01    22  372.237778
23    2018-01-01    23  151.171111
24    2018-01-02     0 -199.312222
25    2018-01-02     1 -153.616667
26    2018-01-02     2 -112.723333
27    2018-01-02     3 -106.292222
28    2018-01-02     4   26.444444
29    2018-01-02     5  170.775556
30    2018-01-02     6  285.501111
31    2018-01-02     7    1.186667
32    2018-01-02     8   87.631111
33    2018-01-02     9  -13.704444
34    2018-01-02    10  145.344444
35    2018-01-02    11  -86.628889
36    2018-01-02    12  -44.345556
37    2018-01-02    13   19.070000
38    2018-01-02    14  -12.728889
39    2018-01-02    

In [39]:
# Name the output after the hub that was actually modelled; the file name was previously fixed
# to 'jfk_edge.csv', so a run for another hub would overwrite the JFK residuals.
res_df.to_csv('/home/urwa/Documents/Projects/NYU Remote/project/data/residuals/%s_edge.csv' % hub.lower())