In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor

### Helper Functions

In [2]:
def loadData(file):
    data = pd.read_csv(file)
    print('Raw shape: ',data.shape)
    data['Date'] = pd.to_datetime(data.Date)
    print('Days: ',len(set(data.Date)))
    return data

In [3]:
def getTimeSeries(df):
    table = pd.pivot_table(df, values='vehicle_count', index=['Date','Hour'],
                    columns=['DOLocationID'], aggfunc=np.sum, fill_value=0)
    return table

In [4]:
def zscoreNormalizeSpatial(matrix):
    m = matrix.copy()
    for i in range(m.shape[0]):
        m[i, :] = (m[i, :] - m[i, :].mean()) / (m[i, :].std()+1e-10)
        
    return m

In [5]:
def standardize(matrix):
    m = matrix.copy()
    scaler = StandardScaler()
    scaler.fit(m)
    t = scaler.transform(m)
    return scaler, t

In [6]:
def inverse_standardize(matrix, scaler):
    t = matrix.copy()
    return scaler.inverse_transform(t)

In [7]:
def addLag(dataset, maxlag, lagColumns):
    dataset_list = [dataset]

    for l in range(1, maxlag+1):
        df = dataset.shift(l)
        df = df[lagColumns]
        df.columns = [c+'_lag_'+str(l) for c in df.columns]
        dataset_list.append(df)

    dataset = pd.concat(dataset_list, axis=1).dropna()
    return dataset

In [8]:
def get_rmse(matrix1, matrix2):
    sumSquareError = np.mean(np.power(matrix1 - matrix2,2))
    rmse = np.power(sumSquareError,0.5)
    return rmse

In [9]:
def get_weights(rawdata, zontoBorough):
    
    rawdata['Borough'] = rawdata['DOLocationID'].apply(lambda x:zontoBorough[x])
    
    borough_df = rawdata[['vehicle_count','Borough']].groupby(by='Borough').sum().reset_index()

    zone_df = rawdata[['vehicle_count','DOLocationID']].groupby(by='DOLocationID').sum().reset_index()

    zone_df['Borough'] = zone_df['DOLocationID'].apply(lambda x:zontoBorough[x])

    zone_df = pd.merge(borough_df, zone_df, on=['Borough'], how='inner')

    zone_df['zone_weight'] = zone_df.vehicle_count_y / zone_df.vehicle_count_x

    zone_df = zone_df[['Borough', 'DOLocationID', 'zone_weight']]

    return zone_df

#### Preparing Data

In [10]:
hub = 'Penn'
tune_hyp_params = False

In [11]:
dataDir = '/Users/hemingyi/Documents/UrbanTemporalNetworks/processedData/'
file = dataDir + hub + 'VehiceByHour2015.csv'

In [12]:
rawdata = loadData(file)

Raw shape:  (2277600, 4)
Days:  365


In [13]:
edge_data = getTimeSeries(rawdata)
edge_data = edge_data.reset_index()
edge_data.head(3)

DOLocationID,Date,Hour,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,...,254.0,255.0,256.0,257.0,258.0,259.0,260.0,261.0,262.0,263.0
0,2015-01-01,0,0,0,0,2,0,0,4,0,...,0,6,6,0,0,0,0,2,0,2
1,2015-01-01,1,0,0,2,2,0,0,12,0,...,0,4,14,0,0,0,0,0,0,10
2,2015-01-01,2,4,0,0,16,0,0,4,0,...,0,4,6,0,0,0,0,0,0,4


### Community Level Aggregation

In [14]:
zones = pd.read_csv('/Users/hemingyi/Documents/UrbanTemporalNetworks/Data/ZonetoComm.csv')
zones.head(2)

Unnamed: 0,start_id,start_community
0,1,0.0
1,2,4.2


In [15]:
zones['start_community'] = zones.start_community.astype(int).astype(str)

zontoComm = dict(zip(zones.start_id.values,zones.start_community.values))

In [16]:
comm_data = rawdata.copy(deep=True)
comm_data['DOLocationID'] = comm_data['DOLocationID'].apply(lambda x:zontoComm[x])
comm_data.head(2)

Unnamed: 0,DOLocationID,Date,Hour,vehicle_count
0,0,2015-01-01,0,0.0
1,4,2015-01-01,0,0.0


In [17]:
comm_data = getTimeSeries(comm_data)
comm_data = comm_data.reset_index()
comm_data.head(2)

DOLocationID,Date,Hour,0,1,2,3,4,5
0,2015-01-01,0,364,26,6,26,16,0
1,2015-01-01,1,416,84,4,48,28,0


In [18]:
zone_weights = get_weights(rawdata, zontoComm)
zone_weights.head(2)

Unnamed: 0,Borough,DOLocationID,zone_weight
0,0,1.0,0.004323
1,0,4.0,0.008518


### Merge External Data Features

In [19]:
externalDataDir = "/Users/hemingyi/Documents/UrbanTemporalNetworks/HongData/"
extFile = externalDataDir + hub.upper() + "2015.csv"

In [20]:
extDf = pd.read_csv(extFile)
print(extDf.shape)
extDf.head(2)

(8760, 41)


Unnamed: 0,Date,Hour,arrival,ifmon,iftue,ifwed,ifthu,iffri,ifsat,ifsun,...,if21,if22,if23,participation,SNOW,snowdepth,avgtemp,maxtemp,mintemp,ifSnow
0,2015-01-01,0,0.0,0,0,0,1,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,39.0,27.0,0
1,2015-01-01,1,1.0,0,0,0,1,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,39.0,27.0,0


In [21]:
extDf['date'] = pd.to_datetime(extDf['Date'], yearfirst=True)
del extDf['Date']
extDf.head(2)

Unnamed: 0,Hour,arrival,ifmon,iftue,ifwed,ifthu,iffri,ifsat,ifsun,if0,...,if22,if23,participation,SNOW,snowdepth,avgtemp,maxtemp,mintemp,ifSnow,date
0,0,0.0,0,0,0,1,0,0,0,1,...,0,0,0.0,0.0,0.0,0.0,39.0,27.0,0,2015-01-01
1,1,1.0,0,0,0,1,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,39.0,27.0,0,2015-01-01


In [22]:
min(extDf.date), max(extDf.date)

(Timestamp('2015-01-01 00:00:00'), Timestamp('2015-12-31 00:00:00'))

In [23]:
extDf['Hour'] = extDf['date'].dt.hour
extDf['Dow'] = extDf['date'].dt.dayofweek
extDf['Date'] = extDf['date'].dt.date

In [24]:
extDf.columns

Index(['Hour', 'arrival', 'ifmon', 'iftue', 'ifwed', 'ifthu', 'iffri', 'ifsat',
       'ifsun', 'if0', 'if1', 'if2', 'if3', 'if4', 'if5', 'if6', 'if7', 'if8',
       'if9', 'if10', 'if11', 'if12', 'if13', 'if14', 'if15', 'if16', 'if17',
       'if18', 'if19', 'if20', 'if21', 'if22', 'if23', 'participation', 'SNOW',
       'snowdepth', 'avgtemp', 'maxtemp', 'mintemp', 'ifSnow', 'date', 'Dow',
       'Date'],
      dtype='object')

In [25]:
selected_columns = ['Date', 'Hour', 'Dow', 'arrival','maxtemp', 'mintemp', 'avgtemp', 
       'participation', 'snowdepth', 'ifSnow']

In [26]:
extDf = extDf[selected_columns]

In [27]:
print(comm_data.shape)
print(extDf.shape)

(8760, 8)
(8760, 10)


In [28]:
comm_data['Date'] = pd.to_datetime(comm_data['Date'])
extDf['Date'] = pd.to_datetime(extDf['Date'])

In [29]:
comm_data = pd.merge(comm_data,extDf, on=['Date', 'Hour'], how='inner')
print(comm_data.shape)
comm_data['Date'] = comm_data['Date'].dt.date
comm_data.head()

(8760, 16)


Unnamed: 0,Date,Hour,0,1,2,3,4,5,Dow,arrival,maxtemp,mintemp,avgtemp,participation,snowdepth,ifSnow
0,2015-01-01,0,364,26,6,26,16,0,3,0.0,39.0,27.0,0.0,0.0,0.0,0
1,2015-01-01,0,364,26,6,26,16,0,3,1.0,39.0,27.0,0.0,0.0,0.0,0
2,2015-01-01,0,364,26,6,26,16,0,3,1.0,39.0,27.0,0.0,0.0,0.0,0
3,2015-01-01,0,364,26,6,26,16,0,3,0.0,39.0,27.0,0.0,0.0,0.0,0
4,2015-01-01,0,364,26,6,26,16,0,3,0.0,39.0,27.0,0.0,0.0,0.0,0


In [30]:
comm_data.columns

Index(['Date', 'Hour', '0', '1', '2', '3', '4', '5', 'Dow', 'arrival',
       'maxtemp', 'mintemp', 'avgtemp', 'participation', 'snowdepth',
       'ifSnow'],
      dtype='object')

In [31]:
lagColumns = ['0', '1', '2', '3', '4', '5', 'arrival']

DateColumns = ['Date']

targetColumns = ['0', '1', '2', '3', '4', '5']

In [32]:
maxlag = 12

comm_data_lag = addLag(comm_data, maxlag, lagColumns)

comm_data_lag.shape

(8748, 100)

In [33]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 150, stop = 300, num = 3)]
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(50, 110, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = range(2,10)
# Minimum number of samples required at each leaf node
min_samples_leaf = range(2,10)
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [150, 225, 300], 'max_features': ['sqrt'], 'max_depth': [50, 65, 80, 95, 110, None], 'min_samples_split': range(2, 10), 'min_samples_leaf': range(2, 10), 'bootstrap': [True, False]}


In [34]:
if tune_hyp_params:
    rf2 = RandomForestRegressor()
    rf_random = RandomizedSearchCV(estimator = rf2, param_distributions = random_grid, n_iter = 5, \
                                   cv = 5, verbose=2, random_state=42, n_jobs = -1)
    rf_random.fit(X_train, y_train)
    print(rf_random.best_params_)

In [None]:
CommR2List = []
EdgeR2List = []
residualDf_list = []

for m in range(1,13):
    print()
    print("month: ",m)
    month_index  = pd.to_datetime(comm_data_lag.Date).dt.month == m

    dataset_train = comm_data_lag[~month_index]
    dataset_test = comm_data_lag[month_index]
    print("Train Size: ",dataset_train.shape)
    print("Test Size: ",dataset_test.shape)

    edgeMonthIndex = [False] * maxlag + list(month_index)
    edge_testData = edge_data[edgeMonthIndex]
    select_cols = [c for c in edge_testData.columns if c not in ['Date','Hour']]
    edge_testData = edge_testData[select_cols]
    print("edge test data shape: ",edge_testData.shape)


    X_train = dataset_train.drop(targetColumns+DateColumns , axis = 1)
    X_test = dataset_test.drop(targetColumns+DateColumns , axis = 1)
    y_train = dataset_train[targetColumns]
    y_test = dataset_test[targetColumns]

#     rf2 = RandomForestRegressor(random_state = 2019, n_estimators=150, 
#                                min_samples_split=3,
#                                min_samples_leaf= 2, 
#                                max_features= 'sqrt',
#                                max_depth= None, 
#                                bootstrap= False)
#     rf2.fit(X_train,y_train)
    rf = RandomForestRegressor()
    rf2 = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 5, \
                                   cv = 5, verbose=2, random_state=42, n_jobs = -1)
    rf2.fit(X_train,y_train).fit(X_train, y_train)

   

    print("Train R2: ",rf2.score(X_train,y_train))
    test_r2 = rf2.score(X_test,y_test)
    print("Test R2: ",test_r2)


    comm_prediction = rf2.predict(X_test)
    edge_prediction_df = pd.DataFrame(comm_prediction)
    edge_prediction_df.columns = y_test.columns

    residual = y_test - comm_prediction
    residual_df = dataset_test[['Date','Hour']]
    residual_df = pd.concat([residual_df,pd.DataFrame(residual)], axis =1)

    boroughs = list(edge_prediction_df.columns)
    for bor in boroughs:
    #     print(bor)

        weight_df = zone_weights[zone_weights.Borough == bor]

    #     print(len(weight_df.DOLocationID))

        for b_zone,z_weight in zip(weight_df.DOLocationID.values,weight_df.zone_weight.values):        
            edge_prediction_df[b_zone] = edge_prediction_df[bor] * z_weight


    select_cols = [c for c in edge_prediction_df.columns if c not in boroughs]
    edge_prediction_df = edge_prediction_df[select_cols]


    edge_prediction_df = edge_prediction_df[edge_testData.columns]

    edge_r2 = r2_score(edge_testData.values, edge_prediction_df.values, multioutput='variance_weighted')
    print("Edge R2: ",edge_r2)

    CommR2List.append(test_r2)
    EdgeR2List.append(edge_r2)
    residualDf_list.append(residual_df)


month:  1
Train Size:  (8016, 100)
Test Size:  (732, 100)
edge test data shape:  (732, 260)
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
