In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor

### Helper Functions

In [2]:
def loadData(file):
    """Read the hourly vehicle-count CSV and normalise its key columns.

    Parameters
    ----------
    file : str or file-like
        Anything accepted by ``pandas.read_csv``. The table must contain
        ``Date`` and ``DOLocationID`` columns.

    Returns
    -------
    pandas.DataFrame
        The loaded table with ``Date`` converted by ``pd.to_datetime`` and
        ``DOLocationID`` cast to ``str``.

    Notes
    -----
    Prints the raw shape and the number of distinct ``Date`` values as a
    quick sanity check.
    """
    frame = pd.read_csv(file)
    print('Raw shape: ',frame.shape)
    frame = frame.assign(
        Date=pd.to_datetime(frame['Date']),
        DOLocationID=frame['DOLocationID'].astype(str),
    )
    print('Days: ',len(set(frame['Date'])))
    return frame

In [3]:
def getTimeSeries(df):
    """Pivot long-format counts into a wide (Date, Hour) x zone table.

    Parameters
    ----------
    df : pandas.DataFrame
        Long table with ``Date``, ``Hour``, ``DOLocationID`` and
        ``vehicle_count`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per (``Date``, ``Hour``) pair (as a MultiIndex) and one
        column per ``DOLocationID``. Each cell is the summed
        ``vehicle_count``; combinations absent from ``df`` are filled with 0.
    """
    # The string 'sum' selects pandas' built-in grouped sum. Passing the
    # np.sum callable gives the same numbers but raises a FutureWarning on
    # recent pandas releases.
    table = pd.pivot_table(df, values='vehicle_count', index=['Date','Hour'],
                    columns=['DOLocationID'], aggfunc='sum', fill_value=0)
    return table

In [4]:
def zscoreNormalizeSpatial(matrix):
    """Z-score every row of a 2-D array independently.

    Each row has its own mean subtracted and is divided by its own
    (population) standard deviation plus ``1e-10``; the small constant keeps
    constant rows from dividing by zero (they come out as all zeros).

    Parameters
    ----------
    matrix : array-like, 2-D
        Input values. The input is never modified.

    Returns
    -------
    numpy.ndarray
        Floating-point array of the same shape holding the row-wise z-scores.
    """
    m = np.array(matrix)  # np.array always copies, so the caller's data is untouched
    if not np.issubdtype(m.dtype, np.inexact):
        # Integer/bool input must be promoted first; writing z-scores back
        # into an integer array would silently truncate them.
        m = m.astype(float)

    row_mean = m.mean(axis=1, keepdims=True)
    row_std = m.std(axis=1, keepdims=True)
    return (m - row_mean) / (row_std + 1e-10)

In [5]:
def standardize(matrix):
    """Fit a ``StandardScaler`` on ``matrix`` and return it with the scaled data.

    Parameters
    ----------
    matrix : object with a ``.copy()`` method accepted by ``StandardScaler``
        The data to scale; a copy is scaled, so the input is left as is.

    Returns
    -------
    (scaler, transformed)
        The fitted ``StandardScaler`` and the result of applying it to the
        copied input. Keep the scaler to undo the scaling later.
    """
    values = matrix.copy()
    scaler = StandardScaler().fit(values)
    return scaler, scaler.transform(values)

In [6]:
def inverse_standardize(matrix, scaler):
    """Map scaled values back through ``scaler.inverse_transform``.

    Parameters
    ----------
    matrix : object with a ``.copy()`` method
        Scaled data; a copy is handed to the scaler.
    scaler : object exposing ``inverse_transform``
        Typically the scaler returned by ``standardize``.

    Returns
    -------
    Whatever ``scaler.inverse_transform`` returns for the copied input.
    """
    return scaler.inverse_transform(matrix.copy())

In [7]:
def addLag(dataset, maxlag, lagColumns):
    """Append row-shifted copies of selected columns as lag features.

    For every lag ``k`` in ``1..maxlag`` the columns in ``lagColumns`` are
    shifted down by ``k`` rows and added as ``<column>_lag_<k>``. Rows that
    contain any missing value afterwards are dropped, which includes the
    first ``maxlag`` rows.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source table; rows are shifted in their existing order.
    maxlag : int
        Largest lag to generate.
    lagColumns : list of str
        Names of the columns to lag.

    Returns
    -------
    pandas.DataFrame
        ``dataset`` followed by the lagged columns, without incomplete rows.
    """
    def lagged(step):
        # Shift the whole frame, keep the requested columns, tag the lag.
        return dataset.shift(step)[lagColumns].rename(
            columns=lambda name: name+'_lag_'+str(step))

    pieces = [dataset] + [lagged(step) for step in range(1, maxlag+1)]
    return pd.concat(pieces, axis=1).dropna()

In [8]:
def get_rmse(matrix1, matrix2):
    """Root-mean-square error between two inputs.

    Squares the element-wise difference ``matrix1 - matrix2``, averages it
    with ``np.mean`` and returns the square root of that mean.
    """
    diff = matrix1 - matrix2
    mean_squared_error = np.mean(diff ** 2)
    return mean_squared_error ** 0.5

In [9]:
def get_weights(rawdata, zontoBorough):
    """Compute each zone's share of its borough's total vehicle count.

    Parameters
    ----------
    rawdata : pandas.DataFrame
        Long table with ``DOLocationID`` and ``vehicle_count`` columns.
        Side effect: a ``Borough`` column is added to this frame in place.
    zontoBorough : mapping
        Maps every ``DOLocationID`` value to its borough; a missing key
        raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Borough``, ``DOLocationID`` and ``zone_weight``, where
        ``zone_weight`` is the zone total divided by the borough total.
    """
    # Tag every record with its borough (mutates the caller's frame).
    rawdata['Borough'] = rawdata['DOLocationID'].map(lambda zone: zontoBorough[zone])

    # Totals per borough and per zone.
    borough_totals = rawdata.groupby('Borough', as_index=False)['vehicle_count'].sum()
    zone_totals = rawdata.groupby('DOLocationID', as_index=False)['vehicle_count'].sum()
    zone_totals['Borough'] = zone_totals['DOLocationID'].map(lambda zone: zontoBorough[zone])

    # After the merge, vehicle_count_x is the borough total and
    # vehicle_count_y is the zone total.
    merged = pd.merge(borough_totals, zone_totals, on=['Borough'], how='inner')
    merged['zone_weight'] = merged['vehicle_count_y'] / merged['vehicle_count_x']

    return merged[['Borough', 'DOLocationID', 'zone_weight']]

#### Load Raw Data

In [11]:
# Hub code: used to build the input file names and the output file name.
hub = 'EWR'
# NOTE(review): not referenced by any cell visible in this notebook section;
# presumably toggles a hyper-parameter search elsewhere - confirm or remove.
tune_hyp_params = True

In [12]:
# Location of the processed hourly vehicle counts.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
dataDir = '/Users/hemingyi/Documents/UrbanTemporalNetworks/processedData/'
# NOTE(review): 'Vehice' looks like a typo but presumably matches the file
# name on disk - confirm before changing it.
file = ''.join([dataDir, hub, 'VehiceByHour.csv'])

In [13]:
# Load the long-format hourly vehicle counts for the selected hub.
rawdata = loadData(file)

Raw shape:  (2260080, 4)
Days:  365


In [14]:
# Preview the first two records of the raw table.
rawdata.head(2)

Unnamed: 0,DOLocationID,Date,Hour,vehicle_count
0,2,2018-01-01,0,0.0
1,3,2018-01-01,0,0.0


In [15]:
# Pivot to one row per (Date, Hour) with one column per drop-off zone, then
# turn the (Date, Hour) index levels back into ordinary columns.
edge_data = getTimeSeries(rawdata).reset_index()
edge_data.head(3)

DOLocationID,Date,Hour,10,100,101,102,106,107,108,109,...,90,91,92,93,94,95,96,97,98,99
0,2018-01-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2018-01-01,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2018-01-01,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Merge External Data Features

In [18]:
# Location of the external (weather / arrival) feature files, one CSV per hub.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
externalDataDir = "/Users/hemingyi/Documents/UrbanTemporalNetworks/HongData/"
extFile = ''.join([externalDataDir, hub.upper(), ".csv"])

In [19]:
# Read the external feature table for the hub and check its size.
extDf = pd.read_csv(extFile)
print(extDf.shape)
extDf.head(2)

(8760, 46)


Unnamed: 0,date,fhv,yellow,vehicle,ifmon,iftue,ifwed,ifthu,iffri,ifsat,...,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow,arrival
0,18/1/1 0:00,263,174,437,1,0,0,0,0,0,...,7,12.5,-21.2,52,0,0.0,0.0,0,0,24.0
1,18/1/1 1:00,138,133,271,1,0,0,0,0,0,...,7,12.5,-21.2,52,0,0.0,0.0,0,0,9.0


In [20]:
# Parse the 'date' strings into timestamps. yearfirst=True makes the leading
# two-digit field be read as the year (e.g. '18/1/1 0:00' -> 2018-01-01 00:00).
extDf['date'] = pd.to_datetime(extDf['date'], yearfirst=True)
extDf.head(2)

Unnamed: 0,date,fhv,yellow,vehicle,ifmon,iftue,ifwed,ifthu,iffri,ifsat,...,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow,arrival
0,2018-01-01 00:00:00,263,174,437,1,0,0,0,0,0,...,7,12.5,-21.2,52,0,0.0,0.0,0,0,24.0
1,2018-01-01 01:00:00,138,133,271,1,0,0,0,0,0,...,7,12.5,-21.2,52,0,0.0,0.0,0,0,9.0


In [21]:
# Date range covered by the external features. The Series reductions are
# vectorised, whereas the Python built-ins min()/max() iterate the column
# element by element.
extDf['date'].min(), extDf['date'].max()

(Timestamp('2018-01-01 00:00:00'), Timestamp('2018-12-31 23:00:00'))

In [22]:
# Derive the calendar keys from the parsed timestamp: hour of day,
# day of week (Monday = 0) and the calendar date.
extDf = extDf.assign(
    Hour=extDf['date'].dt.hour,
    Dow=extDf['date'].dt.dayofweek,
    Date=extDf['date'].dt.date,
)

In [23]:
# List every available external column before choosing the ones to keep.
extDf.columns

Index(['date', 'fhv', 'yellow', 'vehicle', 'ifmon', 'iftue', 'ifwed', 'ifthu',
       'iffri', 'ifsat', 'ifsun', 'if0', 'if1', 'if2', 'if3', 'if4', 'if5',
       'if6', 'if7', 'if8', 'if9', 'if10', 'if11', 'if12', 'if13', 'if14',
       'if15', 'if16', 'if17', 'if18', 'if19', 'if20', 'if21', 'if22', 'if23',
       'maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd', 'cdd',
       'participation', 'newsnow', 'snowdepth', 'ifSnow', 'arrival', 'Hour',
       'Dow', 'Date'],
      dtype='object')

In [24]:
# Columns kept from the external table: the ('Date', 'Hour') merge keys plus
# the features fed to the model. The fhv/yellow/vehicle counts and the
# one-hot day/hour indicator columns are left out.
selected_columns = ['Date', 'Hour', 'Dow', 'arrival','maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd',
       'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow']

In [25]:
# Keep only the merge keys and the external features used downstream.
# .copy() makes the result an independent frame, so the later
# extDf['Date'] = ... assignment does not trigger SettingWithCopyWarning.
extDf = extDf[selected_columns].copy()

In [26]:
# Both tables should have the same number of rows before they are merged.
print(edge_data.shape)
print(extDf.shape)

(8760, 260)
(8760, 14)


In [27]:
# Bring the 'Date' merge key to the same datetime dtype on both sides so the
# join below compares like with like.
edge_data['Date'] = pd.to_datetime(edge_data['Date'])
extDf['Date'] = pd.to_datetime(extDf['Date'])

In [28]:
# Attach the external features to every (Date, Hour) row. The join is inner,
# so rows without a match on either side are dropped - the printed shape
# shows whether any were lost.
edge_data = edge_data.merge(extDf, how='inner', on=['Date', 'Hour'])
print(edge_data.shape)
# Store plain calendar dates again now that the join is done.
edge_data['Date'] = edge_data['Date'].dt.date
edge_data.head()

(8760, 272)


Unnamed: 0,Date,Hour,10,100,101,102,106,107,108,109,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01,0,0,0,0,0,0,0,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
1,2018-01-01,1,0,0,0,0,0,0,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
2,2018-01-01,2,0,0,0,0,0,0,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
3,2018-01-01,3,0,0,0,0,0,0,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
4,2018-01-01,4,0,0,0,0,0,0,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0


In [29]:
# Column layout after the merge: keys, one column per zone, external features.
edge_data.columns

Index(['Date', 'Hour', '10', '100', '101', '102', '106', '107', '108', '109',
       ...
       'maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd', 'cdd',
       'participation', 'newsnow', 'snowdepth', 'ifSnow'],
      dtype='object', length=272)

In [30]:
# Column roles used when building the model matrices.
DateColumns = ['Date']

# External (non-target) features merged in from the external table.
ext_columns = ['Dow', 'arrival','maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd',
       'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow']

# Targets are every remaining column, i.e. the per-zone vehicle counts.
targetColumns = [c for c in edge_data.columns
                 if c not in {'Hour', *DateColumns, *ext_columns}]

# Lagged copies are built for all targets and for 'arrival'.
lagColumns = targetColumns + ['arrival']

In [31]:
# Number of lags to generate; each lag is a shift by one more row.
maxlag = 12

# Adds <column>_lag_1 .. <column>_lag_12 for every lag column and drops the
# rows that have no complete history.
edge_data_lag = addLag(edge_data, maxlag, lagColumns)

edge_data_lag.shape

(8748, 3380)

In [49]:
# Leave-one-month-out cross-validation of a multi-output random forest.
# For each calendar month m the model is fit on all other months and
# evaluated on month m; per-fold scores and residuals are collected.
CommR2List = []        # rf2.score() on the held-out month, one entry per fold
EdgeR2List = []        # variance-weighted r2_score on the held-out month
residualDf_list = []   # per fold: Date, Hour and (actual - predicted) per target

# Month of every row. It is the same for all folds, so compute it once
# instead of re-parsing the Date column on every iteration.
row_month = pd.to_datetime(edge_data_lag.Date).dt.month

for m in range(1,13):
    print()
    print("month: ",m)
    month_index = row_month == m

    dataset_train = edge_data_lag[~month_index]
    dataset_test = edge_data_lag[month_index]
    print("Train Size: ",dataset_train.shape)
    print("Test Size: ",dataset_test.shape)

    # Features are everything except the current-hour targets and the Date
    # key: Hour, the external features and all lagged columns.
    X_train = dataset_train.drop(targetColumns+DateColumns , axis = 1)
    X_test = dataset_test.drop(targetColumns+DateColumns , axis = 1)
    y_train = dataset_train[targetColumns]
    y_test = dataset_test[targetColumns]

    # Fixed random_state keeps the folds reproducible.
    rf2 = RandomForestRegressor(random_state = 2019, n_estimators=150,
                               min_samples_split=3,
                               min_samples_leaf= 2,
                               max_features= 'sqrt',
                               max_depth= None,
                               bootstrap= False)

    rf2.fit(X_train,y_train)

    print("Train R2: ",rf2.score(X_train,y_train))
    test_r2 = rf2.score(X_test,y_test)
    print("Test R2: ",test_r2)

    prediction = rf2.predict(X_test)

    # y_test is a DataFrame, so the subtraction keeps its index and column
    # labels and the result lines up with the Date/Hour keys below.
    residual = y_test - prediction
    residual_df = pd.concat([dataset_test[['Date','Hour']], residual], axis =1)

    # NOTE(review): in the recorded run "Test R2" and "Edge R2" print the
    # same value, which suggests the installed scikit-learn's score() also
    # weights outputs by variance - confirm before treating the two lists
    # as different metrics.
    edge_r2 = r2_score(y_test, prediction, multioutput='variance_weighted')
    print("Edge R2: ",edge_r2)

    CommR2List.append(test_r2)
    EdgeR2List.append(edge_r2)
    residualDf_list.append(residual_df)


month:  1
Train Size:  (8016, 3380)
Test Size:  (732, 3380)
Train R2:  0.6000200244803663
Test R2:  0.013295092265019277
Edge R2:  0.013295092265019277

month:  2
Train Size:  (8076, 3380)
Test Size:  (672, 3380)
Train R2:  0.6011224569477656
Test R2:  0.01027793170003448
Edge R2:  0.01027793170003448

month:  3
Train Size:  (8004, 3380)
Test Size:  (744, 3380)
Train R2:  0.6021350996524343
Test R2:  0.011544836976088158
Edge R2:  0.011544836976088158

month:  4
Train Size:  (8028, 3380)
Test Size:  (720, 3380)
Train R2:  0.5998475509689539
Test R2:  0.017141094343128216
Edge R2:  0.017141094343128216

month:  5
Train Size:  (8004, 3380)
Test Size:  (744, 3380)
Train R2:  0.6003577754430864
Test R2:  0.017859716086422853
Edge R2:  0.017859716086422853

month:  6
Train Size:  (8028, 3380)
Test Size:  (720, 3380)
Train R2:  0.6023827690537995
Test R2:  0.011563613965211382
Edge R2:  0.011563613965211382

month:  7
Train Size:  (8004, 3380)
Test Size:  (744, 3380)
Train R2:  0.6025429483

In [50]:
# Average of the per-fold held-out scores collected by the cross-validation loop.
print(np.mean(CommR2List))
print(np.mean(EdgeR2List))

0.01331702346818172
0.01331702346818172


In [51]:
# Stack the per-fold residual frames row-wise into one table covering every
# held-out month.
res_df = pd.concat(residualDf_list, axis = 0)
print(res_df.shape)
res_df.head()

(8748, 260)


Unnamed: 0,Date,Hour,10,100,101,102,106,107,108,109,...,90,91,92,93,94,95,96,97,98,99
12,2018-01-01,12,0.0,-0.040241,0.0,0.0,-0.006219,-0.081037,0.0,0.0,...,-0.031309,0.0,-0.000833,0.0,0.0,0.0,0.0,-0.016487,0.0,0.0
13,2018-01-01,13,0.0,-0.046587,0.0,0.0,-0.003256,-0.098067,0.0,-0.000167,...,-0.076567,-0.001667,-0.004203,0.0,0.0,0.0,0.0,-0.010152,-0.001852,0.0
14,2018-01-01,14,0.0,0.957984,0.0,0.0,-0.006458,0.923161,0.0,-0.000171,...,0.931465,0.0,-0.010573,0.0,0.0,0.0,0.0,-0.019282,0.0,0.0
15,2018-01-01,15,-0.002167,-0.054173,0.0,0.0,0.0,-0.105475,0.0,-0.002563,...,-0.097126,-0.003333,-0.004222,0.0,0.0,-0.004444,0.0,-0.013734,-0.002063,0.0
16,2018-01-01,16,-0.001852,-0.076211,0.0,0.0,-0.007817,-0.14564,0.0,0.0,...,-0.104341,0.0,-0.001333,0.0,-0.000952,-0.003889,0.0,0.966442,-0.002222,0.0


In [44]:
# Collapse the per-target residuals into a single 'residual' value per
# (Date, Hour) row by summing across all non-key columns.
# NOTE(review): the recorded execution count of this cell (44) is lower than
# that of the cell that rebuilds res_df (51) and of the cell that saves it
# (52); re-run the notebook top to bottom so this step is applied before saving.
dateCols = ['Date', 'Hour']
otherCols = [c for c in res_df.columns if c not in dateCols]

res_df['residual'] = np.sum(res_df[otherCols].values, axis=1)

res_df = res_df[dateCols+['residual']]
print(res_df.shape)

(8748, 3)


In [52]:
# Write the residual table for this hub to disk (the row index is written too).
# NOTE(review): absolute, machine-specific output path. What is saved depends
# on whether the aggregation cell has been run on the current res_df -
# confirm by running the notebook top to bottom.
res_df.to_csv('/Users/hemingyi/Documents/UrbanTemporalNetworks/Resid/%sedgeRFCV.csv'%hub.upper())

In [None]:
# Show only the first rows instead of rendering the whole residual table.
res_df.head()