In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor
import warnings
warnings.filterwarnings('ignore')

### Helper Functions

In [2]:
def loadData(file):
    data = pd.read_csv(file)
    print('Raw shape: ',data.shape)
    data['Date'] = pd.to_datetime(data.Date)
    print('Days: ',len(set(data.Date)))
    return data

In [3]:
def getTimeSeries(df):
    table = pd.pivot_table(df, values='vehicle_count', index=['Date','Hour'],
                    columns=['DOLocationID'], aggfunc=np.sum, fill_value=0)
    return table

In [4]:
def zscoreNormalizeSpatial(matrix):
    m = matrix.copy()
    for i in range(m.shape[0]):
        m[i, :] = (m[i, :] - m[i, :].mean()) / (m[i, :].std()+1e-10)
        
    return m

In [5]:
def standardize(matrix):
    m = matrix.copy()
    scaler = StandardScaler()
    scaler.fit(m)
    t = scaler.transform(m)
    return scaler, t

In [6]:
def inverse_standardize(matrix, scaler):
    t = matrix.copy()
    return scaler.inverse_transform(t)

In [7]:
def addLag(dataset, maxlag, lagColumns):
    dataset_list = [dataset]

    for l in range(1, maxlag+1):
        df = dataset.shift(l)
        df = df[lagColumns]
        df.columns = [c+'_lag_'+str(l) for c in df.columns]
        dataset_list.append(df)

    dataset = pd.concat(dataset_list, axis=1).dropna()
    return dataset

In [8]:
def get_rmse(matrix1, matrix2):
    sumSquareError = np.mean(np.power(matrix1 - matrix2,2))
    rmse = np.power(sumSquareError,0.5)
    return rmse

In [9]:
def get_weights(rawdata, zontoBorough):
    
    rawdata['Borough'] = rawdata['DOLocationID'].apply(lambda x:zontoBorough[x])
    
    borough_df = rawdata[['vehicle_count','Borough']].groupby(by='Borough').sum().reset_index()

    zone_df = rawdata[['vehicle_count','DOLocationID']].groupby(by='DOLocationID').sum().reset_index()

    zone_df['Borough'] = zone_df['DOLocationID'].apply(lambda x:zontoBorough[x])

    zone_df = pd.merge(borough_df, zone_df, on=['Borough'], how='inner')

    zone_df['zone_weight'] = zone_df.vehicle_count_y / zone_df.vehicle_count_x

    zone_df = zone_df[['Borough', 'DOLocationID', 'zone_weight']]

    return zone_df

#### Preparing Data

In [10]:
hub = 'PENN'
tune_hyp_params = False

In [11]:
dataDir = '/home/mingyi/Dropbox/UrbanTemporalNetworks/processedData/'
file = dataDir + hub + 'VehicleByHour2019fromHub.csv'

In [12]:
rawdata = loadData(file)

Raw shape:  (2286360, 4)
Days:  365


In [13]:
edge_data = getTimeSeries(rawdata)
edge_data = edge_data.reset_index()
edge_data.head(3)

DOLocationID,Date,Hour,1,2,3,4,5,6,7,8,...,256,257,258,259,260,261,262,263,264,265
0,2019-01-01,0,0,0,0,1,0,0,3,0,...,6,0,0,0,1,5,2,2,0,25
1,2019-01-01,1,0,0,0,6,0,0,6,0,...,6,1,0,0,0,4,8,12,0,65
2,2019-01-01,2,2,0,0,7,0,0,12,0,...,12,0,1,0,1,2,6,5,0,65


### Community Level Aggregation

In [14]:
zones = pd.read_csv('/home/mingyi/Dropbox/UrbanTemporalNetworks/Data/ZonetoComm.csv')
zones.head(2)

Unnamed: 0,start_id,start_community
0,1,0.0
1,2,4.2


In [15]:
zones['start_community'] = zones.start_community.astype(str)

zontoComm = dict(zip(zones.start_id.values,zones.start_community.values))

In [16]:
edge_data = edge_data[[i for i in edge_data.columns if i in zones.start_id]]

In [17]:
comm_data = rawdata.copy(deep=True)
comm_data = comm_data.loc[comm_data['DOLocationID'].isin(zones.start_id)]
comm_data['DOLocationID'] = comm_data['DOLocationID'].apply(lambda x:zontoComm[x])
comm_data.head(2)

Unnamed: 0,Date,DOLocationID,vehicle_count,Hour
0,2019-01-01,0.0,0.0,0
1,2019-01-01,0.0,0.0,1


In [18]:
comm_data = getTimeSeries(comm_data)
comm_data = comm_data.reset_index()
comm_data.head(2)

DOLocationID,Date,Hour,0.0,0.1,0.2,1.0,1.1,1.2,1.3,2.0,...,4.0,4.1,4.2,4.3,4.4,4.5,5.0,5.1,5.2,5.3
0,2019-01-01,0,126,53,105,0,18,7,26,5,...,0,4,1,3,1,18,0,0,0,0
1,2019-01-01,1,147,62,212,9,25,4,43,3,...,0,5,0,4,0,17,0,0,0,0


In [19]:
rawdata = rawdata.loc[rawdata['DOLocationID'].isin(zones.start_id)]
zone_weights = get_weights(rawdata, zontoComm)
zone_weights.head(2)

Unnamed: 0,Borough,DOLocationID,zone_weight
0,0.0,1,0.0178
1,0.0,48,0.094066


### Merge External Data Features

In [20]:
externalDataDir = "/home/mingyi/Dropbox/UrbanTemporalNetworks/HongData/"+hub+'2019/'
extFile = externalDataDir+"external.csv"

In [21]:
extDf = pd.read_csv(extFile)
print(extDf.shape)
extDf.head(2)

(8760, 14)


Unnamed: 0,Date,Hour,arrival,PRCP,SNOW,SNWD,TMAX,DOW,Tue,Wed,Thur,Fri,Sat,Sun
0,2019-01-01,0,0.0,0.06,0.0,0.0,58.0,1,1,0,0,0,0,0
1,2019-01-01,1,1.0,0.06,0.0,0.0,58.0,1,1,0,0,0,0,0


In [22]:
selected_columns = ['Date', 'Hour', 'arrival', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'DOW', 'Tue',
       'Wed', 'Thur', 'Fri', 'Sat', 'Sun']
extDf = extDf[selected_columns]

In [23]:
print(comm_data.shape)
print(extDf.shape)

(8760, 26)
(8760, 14)


In [24]:
comm_data['Date'] = pd.to_datetime(comm_data['Date'])
extDf['Date'] = pd.to_datetime(extDf['Date'])

In [25]:
comm_data = pd.merge(comm_data,extDf, on=['Date', 'Hour'], how='inner')
print(comm_data.shape)
comm_data['Date'] = comm_data['Date'].dt.date
comm_data.head()

(8760, 38)


Unnamed: 0,Date,Hour,0.0,0.1,0.2,1.0,1.1,1.2,1.3,2.0,...,SNOW,SNWD,TMAX,DOW,Tue,Wed,Thur,Fri,Sat,Sun
0,2019-01-01,0,126,53,105,0,18,7,26,5,...,0.0,0.0,58.0,1,1,0,0,0,0,0
1,2019-01-01,1,147,62,212,9,25,4,43,3,...,0.0,0.0,58.0,1,1,0,0,0,0,0
2,2019-01-01,2,153,43,185,8,32,12,57,0,...,0.0,0.0,58.0,1,1,0,0,0,0,0
3,2019-01-01,3,136,40,113,11,33,6,43,3,...,0.0,0.0,58.0,1,1,0,0,0,0,0
4,2019-01-01,4,90,21,48,10,7,7,27,3,...,0.0,0.0,58.0,1,1,0,0,0,0,0


In [26]:
comm_data.columns

Index(['Date', 'Hour', '0.0', '0.1', '0.2', '1.0', '1.1', '1.2', '1.3', '2.0',
       '2.1', '2.2', '2.3', '3.0', '3.1', '3.2', '4.0', '4.1', '4.2', '4.3',
       '4.4', '4.5', '5.0', '5.1', '5.2', '5.3', 'arrival', 'PRCP', 'SNOW',
       'SNWD', 'TMAX', 'DOW', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun'],
      dtype='object')

In [27]:
lagColumns = ['0.0', '0.1', '0.2', '1.0', '1.1', '1.2', '1.3', '2.0',
       '2.1', '2.2', '2.3', '3.0', '3.1', '3.2', '4.0', '4.1', '4.2', '4.3',
       '4.4', '4.5', '5.0', '5.1', '5.2', '5.3', 'arrival']

DateColumns = ['Date']

targetColumns = ['0.0', '0.1', '0.2', '1.0', '1.1', '1.2', '1.3', '2.0',
       '2.1', '2.2', '2.3', '3.0', '3.1', '3.2', '4.0', '4.1', '4.2', '4.3',
       '4.4', '4.5', '5.0', '5.1', '5.2', '5.3']

In [28]:
maxlag = 12

comm_data_lag = addLag(comm_data, maxlag, lagColumns)

comm_data_lag.shape

(8748, 338)

### Train Test split

In [29]:
# sep = int(0.75*len(comm_data_lag))
# sep

In [30]:
CommR2List = []
EdgeR2List = []
residualDf_list = []
networkPrediction = pd.DataFrame()

for m in range(1,13):
    print()
    print("month: ",m)
    month_index  = pd.to_datetime(comm_data_lag.Date).dt.month == m

    dataset_train = comm_data_lag[~month_index]
    dataset_test = comm_data_lag[month_index]
    print("Train Size: ",dataset_train.shape)
    print("Test Size: ",dataset_test.shape)

    edgeMonthIndex = [False] * maxlag + list(month_index)
    edge_testData = edge_data[edgeMonthIndex]
    select_cols = [c for c in edge_testData.columns if c not in ['Date','Hour']]
    edge_testData = edge_testData[select_cols]
    print("edge test data shape: ",edge_testData.shape)


    X_train = dataset_train.drop(targetColumns+DateColumns , axis = 1)
    X_test = dataset_test.drop(targetColumns+DateColumns , axis = 1)
    y_train = dataset_train[targetColumns]
    y_test = dataset_test[targetColumns]

    rf2 = RandomForestRegressor(random_state = 2019, n_estimators=150, 
                               min_samples_split=3,
                               min_samples_leaf= 2, 
                               max_features= 'sqrt',
                               max_depth= None, 
                               bootstrap= False)

    rf2.fit(X_train,y_train)

    print("Train R2: ",rf2.score(X_train,y_train))
    test_r2 = rf2.score(X_test,y_test)
    print("Test R2: ",test_r2)


    comm_prediction = rf2.predict(X_test)
    edge_prediction_df = pd.DataFrame(comm_prediction)
    edge_prediction_df.columns = y_test.columns

    residual = y_test - comm_prediction
    residual_df = dataset_test[['Date','Hour']]
    residual_df = pd.concat([residual_df,pd.DataFrame(residual)], axis =1)

    boroughs = list(edge_prediction_df.columns)
    for bor in boroughs:
    #     print(bor)

        weight_df = zone_weights[zone_weights.Borough == bor]

    #     print(len(weight_df.DOLocationID))

        for b_zone,z_weight in zip(weight_df.DOLocationID.values,weight_df.zone_weight.values):        
            edge_prediction_df[b_zone] = edge_prediction_df[bor] * z_weight


    select_cols = [c for c in edge_prediction_df.columns if c not in boroughs]
    edge_prediction_df = edge_prediction_df[select_cols]


    edge_prediction_df = edge_prediction_df[edge_testData.columns]
    
    networkPrediction = pd.concat([networkPrediction,edge_prediction_df])
    edge_r2 = r2_score(edge_testData.values, edge_prediction_df.values, multioutput='variance_weighted')
    print("Edge R2: ",edge_r2)

    CommR2List.append(test_r2)
    EdgeR2List.append(edge_r2)
    residualDf_list.append(residual_df)


month:  1
Train Size:  (8016, 338)
Test Size:  (732, 338)
edge test data shape:  (732, 258)
Train R2:  0.993013728911088
Test R2:  0.8607339243838857
Edge R2:  0.5230947081492228

month:  2
Train Size:  (8076, 338)
Test Size:  (672, 338)
edge test data shape:  (672, 258)
Train R2:  0.9928871974800443
Test R2:  0.8915091534736158
Edge R2:  0.5535823188473191

month:  3
Train Size:  (8004, 338)
Test Size:  (744, 338)
edge test data shape:  (744, 258)
Train R2:  0.9930461663569907
Test R2:  0.8846304577647781
Edge R2:  0.536400004073706

month:  4
Train Size:  (8028, 338)
Test Size:  (720, 338)
edge test data shape:  (720, 258)
Train R2:  0.9930636478542852
Test R2:  0.9035586789598218
Edge R2:  0.5586354008828073

month:  5
Train Size:  (8004, 338)
Test Size:  (744, 338)
edge test data shape:  (744, 258)
Train R2:  0.9929512132076195
Test R2:  0.9149669939351328
Edge R2:  0.5650867321309248

month:  6
Train Size:  (8028, 338)
Test Size:  (720, 338)
edge test data shape:  (720, 258)
Trai

In [35]:
networkPrediction['Date'] = comm_data.iloc[12:]['Date'].values
networkPrediction['Hour'] = comm_data.iloc[12:]['Hour'].values
networkPrediction.to_csv('/home/mingyi/Dropbox/UrbanTemporalNetworks/prediction/%sComm242019.csv'%hub,index=False)

In [36]:
networkPrediction

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,255,256,257,258,259,260,261,262,Date,Hour
0,3.717998,0.000125,0.052066,3.141194,0.004143,0.012244,1.193795,0.005143,0.054015,0.153569,...,1.388547,1.120521,0.250899,0.111287,0.078232,0.228736,3.843767,3.217836,2019-01-01,12
1,4.297120,0.000139,0.066700,3.724470,0.008594,0.010018,1.554326,0.006696,0.067466,0.170283,...,1.896797,1.530666,0.316593,0.123400,0.100220,0.286037,4.557501,3.936044,2019-01-01,13
2,4.742686,0.000160,0.066882,4.106657,0.006215,0.011502,1.628084,0.007014,0.067220,0.195345,...,2.115807,1.707401,0.351693,0.141561,0.100493,0.310863,5.025170,4.231559,2019-01-01,14
3,4.803422,0.000172,0.067910,4.099807,0.005141,0.021520,1.764209,0.007600,0.064270,0.209962,...,2.215252,1.787650,0.372319,0.152154,0.102037,0.306179,5.016787,4.481000,2019-01-01,15
4,4.822864,0.000186,0.076497,4.303295,0.003223,0.007421,1.916281,0.008255,0.065464,0.227874,...,2.534287,2.045103,0.412880,0.165134,0.114940,0.343026,5.265789,4.996343,2019-01-01,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,4.263064,0.000109,0.134550,4.559135,0.014732,0.033023,3.054829,0.013160,0.096018,0.133580,...,3.604186,2.908484,0.489397,0.096802,0.202167,0.574260,5.578851,4.849289,2019-12-31,19
740,3.859545,0.000092,0.153356,4.326858,0.015960,0.033765,3.318250,0.014295,0.094333,0.112430,...,3.761672,3.035571,0.518181,0.081475,0.230425,0.630625,5.294623,4.654253,2019-12-31,20
741,4.203771,0.000099,0.141867,4.634277,0.019336,0.030425,3.307429,0.014248,0.092647,0.120759,...,3.822229,3.084439,0.523576,0.087511,0.213161,0.588625,5.670799,4.875186,2019-12-31,21
742,3.664638,0.000085,0.161157,4.067953,0.021101,0.039331,3.716941,0.016012,0.111717,0.103783,...,4.112243,3.318472,0.545781,0.075209,0.242146,0.665599,4.977810,4.503786,2019-12-31,22


In [37]:
print(np.mean(CommR2List))
print(np.mean(EdgeR2List))

0.8868996542905134
0.5446546024411535


In [40]:
res_df = pd.concat(residualDf_list, axis = 0)
print(res_df.shape)
res_df.head()

(8748, 26)


Unnamed: 0,Date,Hour,0.0,0.1,0.2,1.0,1.1,1.2,1.3,2.0,...,4.0,4.1,4.2,4.3,4.4,4.5,5.0,5.1,5.2,5.3
12,2019-01-01,12,-18.88,-21.906667,-5.937778,0.663333,-0.474444,0.453333,-0.974444,0.192222,...,-0.004444,-0.708889,-0.116667,-0.627778,-0.058889,2.342222,-0.077778,-0.06,-0.015556,-0.036667
13,2019-01-01,13,5.584444,-22.8,-39.251111,2.41,-0.693333,0.351111,-0.893333,-1.458889,...,-0.007778,-1.134444,6.891111,-1.035556,-0.06,5.935556,-0.043333,-0.124444,-0.05,-0.03
14,2019-01-01,14,-21.447778,-14.272222,-8.182222,0.126667,0.121111,0.003333,-2.151111,-1.594444,...,0.0,-0.126667,0.403333,1.787778,-0.1,1.647778,-0.08,-0.09,-0.06,-0.034444
15,2019-01-01,15,-34.86,-21.69,10.103333,3.4,8.424444,0.621111,5.277778,0.85,...,0.0,-2.033333,-1.464444,0.821111,-0.072222,-0.883333,-0.057778,-0.074444,-0.084444,-0.064444
16,2019-01-01,16,-11.952222,-31.844444,-20.378889,3.043333,0.054444,-2.514444,8.445556,-1.243333,...,-0.002222,-0.071111,-2.527778,-2.441111,-0.09,-1.476667,-0.088889,-0.046667,-0.043333,-0.022222


In [42]:
res_df.groupby(['Date']).sum().drop(columns='Hour').to_csv('/home/mingyi/Dropbox/DOE_Anomaly_Detection/GMMDataset/selected/RFCV/2019/'+hub+'Comm24RFCVResidDailyAggregated2019.csv')