In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor
import warnings
warnings.filterwarnings('ignore')

### Helper Functions

In [3]:
def loadData(file):
    data = pd.read_csv(file)
    print('Raw shape: ',data.shape)
    data['Date'] = pd.to_datetime(data.Date)
    print('Days: ',len(set(data.Date)))
    return data

In [4]:
def getTimeSeries(df):
    table = pd.pivot_table(df, values='vehicle_count', index=['Date','Hour'],
                    columns=['DOLocationID'], aggfunc=np.sum, fill_value=0)
    return table

In [5]:
def zscoreNormalizeSpatial(matrix):
    m = matrix.copy()
    for i in range(m.shape[0]):
        m[i, :] = (m[i, :] - m[i, :].mean()) / (m[i, :].std()+1e-10)
        
    return m

In [6]:
def standardize(matrix):
    m = matrix.copy()
    scaler = StandardScaler()
    scaler.fit(m)
    t = scaler.transform(m)
    return scaler, t

In [7]:
def inverse_standardize(matrix, scaler):
    t = matrix.copy()
    return scaler.inverse_transform(t)

In [8]:
def addLag(dataset, maxlag, lagColumns):
    dataset_list = [dataset]

    for l in range(1, maxlag+1):
        df = dataset.shift(l)
        df = df[lagColumns]
        df.columns = [c+'_lag_'+str(l) for c in df.columns]
        dataset_list.append(df)

    dataset = pd.concat(dataset_list, axis=1).dropna()
    return dataset

In [9]:
def get_rmse(matrix1, matrix2):
    sumSquareError = np.mean(np.power(matrix1 - matrix2,2))
    rmse = np.power(sumSquareError,0.5)
    return rmse

In [10]:
def get_weights(rawdata, zontoBorough):
    
    rawdata['Borough'] = rawdata['DOLocationID'].apply(lambda x:zontoBorough[x])
    
    borough_df = rawdata[['vehicle_count','Borough']].groupby(by='Borough').sum().reset_index()

    zone_df = rawdata[['vehicle_count','DOLocationID']].groupby(by='DOLocationID').sum().reset_index()

    zone_df['Borough'] = zone_df['DOLocationID'].apply(lambda x:zontoBorough[x])

    zone_df = pd.merge(borough_df, zone_df, on=['Borough'], how='inner')

    zone_df['zone_weight'] = zone_df.vehicle_count_y / zone_df.vehicle_count_x

    zone_df = zone_df[['Borough', 'DOLocationID', 'zone_weight']]

    return zone_df

#### Preparing Data

In [11]:
hub = 'JFK'
tune_hyp_params = False

In [12]:
dataDir = '/home/mingyi/Dropbox/UrbanTemporalNetworks/processedData/'
file = dataDir + hub + 'VehicleByHour2019fromHub.csv'

In [13]:
rawdata = loadData(file)

Raw shape:  (2295120, 4)
Days:  365


In [14]:
edge_data = getTimeSeries(rawdata)
edge_data = edge_data.reset_index()
edge_data.head(3)

DOLocationID,Date,Hour,1,2,3,4,5,6,7,8,...,256,257,258,259,260,261,262,263,264,265
0,2019-01-01,0,1,0,0,1,0,0,4,0,...,5,2,3,1,1,0,6,5,0,74
1,2019-01-01,1,0,0,0,0,0,0,2,0,...,4,0,0,0,1,0,0,3,0,43
2,2019-01-01,2,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,3,0,0,16


### Community Level Aggregation

In [15]:
zones = pd.read_csv('/home/mingyi/Dropbox/UrbanTemporalNetworks/Data/ZonetoComm.csv')
zones.head(2)

Unnamed: 0,start_id,start_community
0,1,0.0
1,2,4.2


In [16]:
zones['start_community'] = zones.start_community.astype(str)

zontoComm = dict(zip(zones.start_id.values,zones.start_community.values))

In [17]:
edge_data = edge_data[[i for i in edge_data.columns if i in zones.start_id]]

In [18]:
comm_data = rawdata.copy(deep=True)
comm_data = comm_data.loc[comm_data['DOLocationID'].isin(zones.start_id)]
comm_data['DOLocationID'] = comm_data['DOLocationID'].apply(lambda x:zontoComm[x])
comm_data.head(2)

Unnamed: 0,Date,DOLocationID,vehicle_count,Hour
0,2019-01-01,0.0,1.0,0
1,2019-01-01,0.0,0.0,1


In [19]:
comm_data = getTimeSeries(comm_data)
comm_data = comm_data.reset_index()
comm_data.head(2)

DOLocationID,Date,Hour,0.0,0.1,0.2,1.0,1.1,1.2,1.3,2.0,...,4.0,4.1,4.2,4.3,4.4,4.5,5.0,5.1,5.2,5.3
0,2019-01-01,0,30,9,38,68,39,15,37,3,...,0,24,75,18,5,24,1,0,0,0
1,2019-01-01,1,12,6,12,29,9,8,20,4,...,0,4,50,8,2,10,1,0,1,0


In [20]:
rawdata = rawdata.loc[rawdata['DOLocationID'].isin(zones.start_id)]
zone_weights = get_weights(rawdata, zontoComm)
zone_weights.head(2)

Unnamed: 0,Borough,DOLocationID,zone_weight
0,0.0,1,0.007641
1,0.0,48,0.103086


### Merge External Data Features

In [21]:
externalDataDir = "/home/mingyi/Dropbox/UrbanTemporalNetworks/HongData/"+hub+'2019/'
extFile = externalDataDir+"external.csv"

In [22]:
extDf = pd.read_csv(extFile)
print(extDf.shape)
extDf.head(2)

(8760, 14)


Unnamed: 0,Date,Hour,arrival,PRCP,SNOW,SNWD,TMAX,DOW,Tue,Wed,Thur,Fri,Sat,Sun
0,2019-01-01,0,7.0,0.06,0.0,0.0,58.0,1,1,0,0,0,0,0
1,2019-01-01,1,1.0,0.06,0.0,0.0,58.0,1,1,0,0,0,0,0


In [23]:
selected_columns = ['Date', 'Hour', 'arrival', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'DOW', 'Tue',
       'Wed', 'Thur', 'Fri', 'Sat', 'Sun']
extDf = extDf[selected_columns]

In [24]:
print(comm_data.shape)
print(extDf.shape)

(8760, 26)
(8760, 14)


In [25]:
comm_data['Date'] = pd.to_datetime(comm_data['Date'])
extDf['Date'] = pd.to_datetime(extDf['Date'])

In [26]:
comm_data = pd.merge(comm_data,extDf, on=['Date', 'Hour'], how='inner')
print(comm_data.shape)
comm_data['Date'] = comm_data['Date'].dt.date
comm_data.head()

(8760, 38)


Unnamed: 0,Date,Hour,0.0,0.1,0.2,1.0,1.1,1.2,1.3,2.0,...,SNOW,SNWD,TMAX,DOW,Tue,Wed,Thur,Fri,Sat,Sun
0,2019-01-01,0,30,9,38,68,39,15,37,3,...,0.0,0.0,58.0,1,1,0,0,0,0,0
1,2019-01-01,1,12,6,12,29,9,8,20,4,...,0.0,0.0,58.0,1,1,0,0,0,0,0
2,2019-01-01,2,7,0,11,18,10,0,7,3,...,0.0,0.0,58.0,1,1,0,0,0,0,0
3,2019-01-01,3,7,3,5,27,8,7,9,2,...,0.0,0.0,58.0,1,1,0,0,0,0,0
4,2019-01-01,4,15,2,9,9,6,3,5,2,...,0.0,0.0,58.0,1,1,0,0,0,0,0


In [27]:
comm_data.columns

Index(['Date', 'Hour', '0.0', '0.1', '0.2', '1.0', '1.1', '1.2', '1.3', '2.0',
       '2.1', '2.2', '2.3', '3.0', '3.1', '3.2', '4.0', '4.1', '4.2', '4.3',
       '4.4', '4.5', '5.0', '5.1', '5.2', '5.3', 'arrival', 'PRCP', 'SNOW',
       'SNWD', 'TMAX', 'DOW', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun'],
      dtype='object')

In [28]:
lagColumns = ['0.0', '0.1', '0.2', '1.0', '1.1', '1.2', '1.3', '2.0',
       '2.1', '2.2', '2.3', '3.0', '3.1', '3.2', '4.0', '4.1', '4.2', '4.3',
       '4.4', '4.5', '5.0', '5.1', '5.2', '5.3', 'arrival']

DateColumns = ['Date']

targetColumns = ['0.0', '0.1', '0.2', '1.0', '1.1', '1.2', '1.3', '2.0',
       '2.1', '2.2', '2.3', '3.0', '3.1', '3.2', '4.0', '4.1', '4.2', '4.3',
       '4.4', '4.5', '5.0', '5.1', '5.2', '5.3']

In [29]:
maxlag = 12

comm_data_lag = addLag(comm_data, maxlag, lagColumns)

comm_data_lag.shape

(8748, 338)

### Train Test split

In [30]:
# sep = int(0.75*len(comm_data_lag))
# sep

In [31]:
CommR2List = []
EdgeR2List = []
residualDf_list = []
networkPrediction = pd.DataFrame()

for m in range(1,13):
    print()
    print("month: ",m)
    month_index  = pd.to_datetime(comm_data_lag.Date).dt.month == m

    dataset_train = comm_data_lag[~month_index]
    dataset_test = comm_data_lag[month_index]
    print("Train Size: ",dataset_train.shape)
    print("Test Size: ",dataset_test.shape)

    edgeMonthIndex = [False] * maxlag + list(month_index)
    edge_testData = edge_data[edgeMonthIndex]
    select_cols = [c for c in edge_testData.columns if c not in ['Date','Hour']]
    edge_testData = edge_testData[select_cols]
    print("edge test data shape: ",edge_testData.shape)


    X_train = dataset_train.drop(targetColumns+DateColumns , axis = 1)
    X_test = dataset_test.drop(targetColumns+DateColumns , axis = 1)
    y_train = dataset_train[targetColumns]
    y_test = dataset_test[targetColumns]

    rf2 = RandomForestRegressor(random_state = 2019, n_estimators=150, 
                               min_samples_split=3,
                               min_samples_leaf= 2, 
                               max_features= 'sqrt',
                               max_depth= None, 
                               bootstrap= False)

    rf2.fit(X_train,y_train)

    print("Train R2: ",rf2.score(X_train,y_train))
    test_r2 = rf2.score(X_test,y_test)
    print("Test R2: ",test_r2)


    comm_prediction = rf2.predict(X_test)
    edge_prediction_df = pd.DataFrame(comm_prediction)
    edge_prediction_df.columns = y_test.columns

    residual = y_test - comm_prediction
    residual_df = dataset_test[['Date','Hour']]
    residual_df = pd.concat([residual_df,pd.DataFrame(residual)], axis =1)

    boroughs = list(edge_prediction_df.columns)
    for bor in boroughs:
    #     print(bor)

        weight_df = zone_weights[zone_weights.Borough == bor]

    #     print(len(weight_df.DOLocationID))

        for b_zone,z_weight in zip(weight_df.DOLocationID.values,weight_df.zone_weight.values):        
            edge_prediction_df[b_zone] = edge_prediction_df[bor] * z_weight


    select_cols = [c for c in edge_prediction_df.columns if c not in boroughs]
    edge_prediction_df = edge_prediction_df[select_cols]


    edge_prediction_df = edge_prediction_df[edge_testData.columns]
    
    networkPrediction = pd.concat([networkPrediction,edge_prediction_df])
    edge_r2 = r2_score(edge_testData.values, edge_prediction_df.values, multioutput='variance_weighted')
    print("Edge R2: ",edge_r2)

    CommR2List.append(test_r2)
    EdgeR2List.append(edge_r2)
    residualDf_list.append(residual_df)


month:  1
Train Size:  (8016, 338)
Test Size:  (732, 338)
edge test data shape:  (732, 259)
Train R2:  0.983816702689855
Test R2:  0.7670552028280698
Edge R2:  0.46372985585224785

month:  2
Train Size:  (8076, 338)
Test Size:  (672, 338)
edge test data shape:  (672, 259)
Train R2:  0.9835998486111844
Test R2:  0.7957927392649893
Edge R2:  0.5122785921251406

month:  3
Train Size:  (8004, 338)
Test Size:  (744, 338)
edge test data shape:  (744, 259)
Train R2:  0.9837193731088295
Test R2:  0.8313007772379023
Edge R2:  0.5381049822357898

month:  4
Train Size:  (8028, 338)
Test Size:  (720, 338)
edge test data shape:  (720, 259)
Train R2:  0.9838260037766997
Test R2:  0.8163035325049883
Edge R2:  0.5031230799936616

month:  5
Train Size:  (8004, 338)
Test Size:  (744, 338)
edge test data shape:  (744, 259)
Train R2:  0.9845779838123064
Test R2:  0.786814116172603
Edge R2:  0.44321446211901394

month:  6
Train Size:  (8028, 338)
Test Size:  (720, 338)
edge test data shape:  (720, 259)
Tr

In [32]:
networkPrediction['Date'] = comm_data.iloc[12:]['Date'].values
networkPrediction['Hour'] = comm_data.iloc[12:]['Hour'].values
networkPrediction.to_csv('/home/mingyi/Dropbox/UrbanTemporalNetworks/prediction/%sComm242019.csv'%hub,index=False)

In [33]:
networkPrediction

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,255,256,257,258,259,260,261,262,Date,Hour
0,0.988397,0.004959,0.298125,1.577287,0.036553,0.076812,4.326217,0.025170,0.715470,9.757071,...,6.608404,4.092201,1.052024,2.030441,0.404548,1.565552,3.152765,3.359019,2019-01-01,12
1,1.417862,0.006205,0.411047,2.382985,0.058926,0.121529,6.148103,0.035770,0.877367,12.207759,...,9.438483,5.844704,1.528811,2.540428,0.557780,1.885401,4.763238,5.369709,2019-01-01,13
2,1.718999,0.006844,0.477797,2.941281,0.056809,0.142806,6.901481,0.040153,0.917133,13.466874,...,11.073697,6.857298,1.836691,2.802450,0.648358,1.977364,5.879191,6.575278,2019-01-01,14
3,1.812798,0.006999,0.512523,3.222548,0.061044,0.160837,7.209383,0.041945,0.952219,13.770464,...,12.214881,7.563966,2.053595,2.865626,0.695480,2.057923,6.441401,7.030767,2019-01-01,15
4,1.855766,0.007015,0.490658,3.477825,0.074395,0.190408,7.495373,0.043609,0.968966,13.801459,...,12.604915,7.805492,2.136058,2.872076,0.665811,2.053879,6.951663,7.468744,2019-01-01,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,1.061047,0.006622,0.494774,2.207767,0.074947,0.177065,6.775429,0.039420,0.972436,13.029012,...,10.925528,6.765545,1.637678,2.711331,0.671396,2.282906,4.413003,5.483542,2019-12-31,19
740,1.184977,0.007313,0.542875,2.678780,0.080563,0.174901,7.975411,0.046401,1.073427,14.388051,...,13.401030,8.298479,2.053387,2.994146,0.736668,2.567920,5.354490,6.639231,2019-12-31,20
741,1.007890,0.006841,0.511429,2.337361,0.070988,0.172737,7.305391,0.042503,0.972245,13.459704,...,12.469423,7.721590,1.824925,2.800957,0.693996,2.430338,4.672042,5.705423,2019-12-31,21
742,0.917045,0.006951,0.518889,2.164505,0.069423,0.188965,7.252078,0.042193,0.969380,13.675859,...,12.271336,7.598925,1.745057,2.845939,0.704119,2.466729,4.326530,5.142042,2019-12-31,22


In [34]:
print(np.mean(CommR2List))
print(np.mean(EdgeR2List))

0.8043197734951697
0.49008317330535506


In [39]:
res_df = pd.concat(residualDf_list, axis = 0)
print(res_df.shape)
res_df.head()

(8748, 26)


Unnamed: 0,Date,Hour,0.0,0.1,0.2,1.0,1.1,1.2,1.3,2.0,...,4.0,4.1,4.2,4.3,4.4,4.5,5.0,5.1,5.2,5.3
12,2019-01-01,12,-7.352222,8.62,16.466667,3.608889,9.221111,6.332222,4.932222,-0.715556,...,-0.034444,2.031111,-15.738889,0.222222,-1.336667,-8.278889,0.631111,2.558889,0.65,-0.236667
13,2019-01-01,13,-24.556667,6.544444,22.502222,4.138889,22.912222,-7.87,-4.942222,-0.676667,...,-0.013333,-1.618889,-51.283333,-5.205556,1.325556,-1.24,-0.67,-0.711111,0.583333,-0.374444
14,2019-01-01,14,-25.966667,15.902222,22.66,8.822222,16.026667,9.457778,4.885556,4.831111,...,-0.023333,-3.006667,-35.38,-3.191111,-0.334444,10.054444,-0.703333,1.314444,-0.531111,-0.44
15,2019-01-01,15,-58.242222,12.194444,15.152222,12.531111,13.062222,-1.754444,1.484444,-2.082222,...,-0.01,-2.231111,-36.296667,2.945556,-1.441111,0.54,-0.931111,2.263333,-0.712222,-0.495556
16,2019-01-01,16,-24.865556,17.11,52.707778,-0.295556,31.414444,2.886667,25.296667,-2.898889,...,-0.008889,9.184444,-52.594444,-10.011111,2.007778,14.133333,0.183333,2.102222,1.354444,-0.586667


In [42]:

res_df.groupby(['Date']).sum().drop(columns='Hour').to_csv('/home/mingyi/Dropbox/DOE_Anomaly_Detection/GMMDataset/selected/RFCV/2019/'+hub+'Comm24RFCVResidDailyAggregated2019.csv')

In [41]:
res_df.groupby(['Date']).sum().drop(columns='Hour')

Unnamed: 0_level_0,0.0,0.1,0.2,1.0,1.1,1.2,1.3,2.0,2.1,2.2,...,4.0,4.1,4.2,4.3,4.4,4.5,5.0,5.1,5.2,5.3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01,-388.984444,170.067778,435.890000,-24.866667,142.832222,-26.102222,104.324444,-4.090000,5.720000,31.608889,...,-0.205556,-40.214444,-537.993333,-55.375556,-1.285556,56.558889,-5.510000,9.743333,2.997778,-2.586667
2019-01-02,-421.238889,103.822222,313.455556,17.160000,259.914444,-2.868889,392.653333,7.625556,38.284444,67.991111,...,-0.436667,-133.146667,-531.390000,-80.778889,-7.611111,78.461111,-5.446667,-1.202222,-3.183333,5.134444
2019-01-03,-460.675556,6.037778,-133.783333,33.735556,63.801111,0.440000,278.252222,3.275556,-4.592222,83.012222,...,-0.532222,-110.614444,-491.488889,-9.683333,-8.561111,-11.020000,-2.487778,-0.946667,-2.948889,4.446667
2019-01-04,-402.726667,-40.283333,-262.070000,73.400000,67.531111,-67.916667,271.614444,-5.286667,0.512222,70.685556,...,0.602222,-131.585556,-473.462222,-29.731111,-2.521111,-10.700000,-1.955556,-1.105556,-4.057778,3.668889
2019-01-05,-653.090000,50.458889,31.317778,30.426667,108.778889,-113.662222,199.544444,-22.436667,-4.995556,80.732222,...,-0.338889,-106.380000,-416.161111,12.905556,-16.461111,14.270000,3.747778,-1.102222,-5.874444,-1.657778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-27,-241.166667,-92.133333,-129.542222,135.064444,174.188889,-54.407778,553.907778,1.670000,-12.905556,21.975556,...,-0.398889,38.426667,167.548889,28.508889,-11.364444,57.328889,-1.453333,-11.766667,1.457778,-1.095556
2019-12-28,-321.327778,-45.972222,51.992222,277.154444,263.712222,-41.538889,583.426667,-20.226667,-22.198889,20.694444,...,0.540000,85.037778,63.302222,15.892222,-27.381111,116.310000,-5.930000,-6.591111,0.578889,-2.446667
2019-12-29,-426.814444,-50.518889,11.081111,333.127778,377.846667,-12.142222,828.895556,-8.668889,-4.953333,66.621111,...,0.550000,123.215556,267.002222,74.996667,1.858889,235.748889,-0.043333,-8.230000,-4.107778,-3.195556
2019-12-30,-533.154444,-35.312222,-89.863333,354.677778,355.378889,-0.606667,564.687778,-25.765556,5.397778,98.597778,...,1.586667,89.750000,257.211111,25.592222,-7.845556,80.496667,-4.127778,-5.578889,-5.107778,5.724444
