In [1]:
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np

In [2]:
def getTimeSeries(df):
    """Pivot long-format counts into a wide (Date, Hour) x DOLocationID table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Date', 'Hour', 'DOLocationID' and
        'vehicle_count'.

    Returns
    -------
    pandas.DataFrame
        Indexed by ('Date', 'Hour') with one column per 'DOLocationID' value.
        Each cell is the summed 'vehicle_count' for that combination, and 0
        where the combination has no rows.
    """
    # The string 'sum' selects the same aggregation np.sum did, without the
    # FutureWarning that pandas >= 2.1 emits when handed a NumPy callable.
    table = pd.pivot_table(df, values='vehicle_count', index=['Date','Hour'],
                    columns=['DOLocationID'], aggfunc='sum', fill_value=0)
    return table

In [3]:
### Comm (community) lookup: zone -> community mapping

In [5]:
# Zone -> community lookup table; the displayed frame has the columns
# start_id and start_community.
# NOTE(review): zoneDict is not referenced by any other cell visible here —
# confirm it is still needed.
zoneDict = pd.read_csv('../processedData/ZonetoComm24.csv')
# del zoneDict['Unnamed: 0']
zoneDict

Unnamed: 0,start_id,start_community
0,1,0.0
1,2,4.2
2,3,2.1
3,4,0.2
4,5,5.1
...,...,...
258,259,2.1
259,260,4.3
260,261,0.2
261,262,3.0


In [6]:
def R2inAnomaly(hub, method, eventsDir='/home/mingyi/Dropbox/DOE_Anomaly_Detection'):
    """Score one hub/method prediction file overall and on the selected event days.

    Reads '../processedData/<hub>VehicleByHour.csv' (observed counts, pivoted
    with getTimeSeries) and '../prediction/<hub><method>.csv' (predicted
    counts), pairs them row by row on ('Date', 'Hour') and prints the overall
    variance-weighted R2. It then keeps only the dates listed in
    '<eventsDir>/<HUB>Events.csv' and scores that subset.

    Parameters
    ----------
    hub : str
        Hub prefix used in the file names, e.g. 'JFK'.
    method : str
        Method suffix of the prediction file, e.g. 'PCA6'.
    eventsDir : str, optional
        Directory holding '<HUB>Events.csv' (which must have a 'Date'
        column). Defaults to the absolute path the notebook was written
        with; pass another directory when running elsewhere.

    Returns
    -------
    tuple
        (r2, mae, rmse) computed on the event-day rows only. r2 is
        variance-weighted across zones, mae is the uniform average across
        zones, rmse is the mean of the per-zone RMSEs.
    """
    keyCols = ['Date', 'Hour']
    label = pd.read_csv('../processedData/'+hub+'VehicleByHour.csv')
    label = getTimeSeries(label).reset_index()
    prediction = pd.read_csv('../prediction/'+hub+method+'.csv')

    # One shared zone list drives both sides, so observed and predicted
    # matrices always have the same columns in the same order.
    # Assumes (as the original indexing did) that the pivoted DOLocationID
    # columns are ints and the prediction CSV header holds the same ids as
    # strings; the two sets therefore do not collide in the merged frame.
    zoneIds = sorted(int(x) for x in prediction.columns if x not in keyCols)
    predCols = [str(z) for z in zoneIds]

    # A single inner merge pairs each observed row with its predicted row,
    # instead of relying on both files being in the same row order.
    paired = label.merge(prediction[keyCols + predCols], on=keyCols, how='inner')
    label_value = paired[zoneIds].values
    prediction_value = paired[predCols].values

    print('overall R2')
    print('R2: ',r2_score(label_value, prediction_value, multioutput='variance_weighted'))
    print('------------------------')

    events = pd.read_csv('%s/%sEvents.csv' % (eventsDir, hub.upper()))
    # Keep only the rows whose Date appears in the events file. Zone columns
    # are then selected explicitly, so any extra columns in the events file
    # are never scored.
    anomaly = events.merge(paired[zoneIds + predCols + ['Date']], how='inner', on='Date')
    label_anomaly_value = anomaly[zoneIds].values
    prediction_anomaly_value = anomaly[predCols].values
    print(label_anomaly_value.shape)
    print(prediction_anomaly_value.shape)
    print('Artificially selected events')
    print('------------------------')
    r2 = r2_score(label_anomaly_value, prediction_anomaly_value, multioutput='variance_weighted')
    print('R2: ',r2)
    print('------------------------')
    mae = mean_absolute_error(label_anomaly_value, prediction_anomaly_value)
    # `squared=False` was deprecated in scikit-learn 1.4 and later removed.
    # Taking the root per zone and then averaging is version-independent and
    # is intended to match the uniform-average RMSE that option produced.
    perZoneMse = mean_squared_error(label_anomaly_value, prediction_anomaly_value,
                                    multioutput='raw_values')
    rmse = np.mean(np.sqrt(perZoneMse))
    return(r2, mae, rmse)
#     print('Clustered Anomaly')
#     count = pd.read_csv('/Users/hemingyi/Documents/DOE_Anomaly_Detection/%sTrueTimesSummary.csv'%hub.upper())
#     count = count[['%s%sRFCVLogResid'%(hub.upper(),method),'Date']]
#     anomaly = count[count['%s%sRFCVLogResid'%(hub.upper(),method)]>90]
#     label_anomaly = anomaly.merge(label,how='left',on='Date')
#     prediction_anomaly = anomaly.merge(prediction,how='left',on='Date')
#     label_anomaly_value = label_anomaly[[x for x in label_anomaly.columns 
#                                                          if x not in ['Date', 'Hour','Type','Name']]].values
#     prediction_anomaly_value = prediction_anomaly[[x for x in prediction_anomaly.columns 
#                                                          if x not in ['Date', 'Hour','Type','Name']]].values
#     print(anomaly)
#     print('------------------------')
#     print('R2: ',r2_score(label_anomaly_value, prediction_anomaly_value, multioutput='variance_weighted'))

In [7]:
def R2inWeek(hub,method):
    """Print variance-weighted R2 for all rows, weekend rows and weekday rows.

    Reads '../processedData/<hub>VehicleByHour.csv' (observed counts, pivoted
    with getTimeSeries) and '../prediction/<hub><method>.csv' (predicted
    counts) and pairs them on ('Date', 'Hour'). Rows are split by
    pandas' dayofweek of 'Date': values above 4 are reported as weekend,
    values below 5 as weekdays.

    Parameters
    ----------
    hub : str
        Hub prefix used in the file names, e.g. 'Penn'.
    method : str
        Method suffix of the prediction file, e.g. 'PCA6'.

    Returns
    -------
    None
        The three scores are printed, not returned.
    """
    keyCols = ['Date', 'Hour']
    label = pd.read_csv('../processedData/'+hub+'VehicleByHour.csv')
    label = getTimeSeries(label).reset_index()
    prediction = pd.read_csv('../prediction/'+hub+method+'.csv')

    # One shared zone list for both sides. Assumes (as the original indexing
    # did) int zone columns in the pivot and str zone columns in the
    # prediction file, so the two sets stay distinct after the merge.
    zoneIds = sorted(int(x) for x in prediction.columns if x not in ['Date', 'Hour','DOW'])
    predCols = [str(z) for z in zoneIds]

    # A single inner merge keeps observed and predicted values on the same
    # row, so every subset below is aligned regardless of file row order.
    paired = label.merge(prediction[keyCols + predCols], on=keyCols, how='inner')

    def weightedR2(rows):
        # Variance-weighted R2 of predicted vs. observed zone counts for `rows`.
        return r2_score(rows[zoneIds].values, rows[predCols].values,
                        multioutput='variance_weighted')

    print('R2 in total: ',weightedR2(paired))

    dayOfWeek = pd.to_datetime(paired['Date']).dt.dayofweek
    print('R2 in weekend: ',weightedR2(paired.loc[dayOfWeek>4]))
    print('R2 in weekdays: ',weightedR2(paired.loc[dayOfWeek<5]))

In [8]:
def R2inPeak(hub,method):
    """Print variance-weighted R2 for the peak hour and for all other hours.

    The peak hour is the 'Hour' value with the largest total count over all
    zones and all dates in '../processedData/<hub>VehicleByHour.csv'. It is
    computed before the data is restricted to the rows that also appear in
    '../prediction/<hub><method>.csv'.

    Parameters
    ----------
    hub : str
        Hub prefix used in the file names, e.g. 'Penn'.
    method : str
        Method suffix of the prediction file, e.g. 'PCA6'.

    Returns
    -------
    None
        The peak hour and both scores are printed, not returned.
    """
    keyCols = ['Date', 'Hour']
    label = pd.read_csv('../processedData/'+hub+'VehicleByHour.csv')
    label = getTimeSeries(label)
    # Sum across zones per (Date, Hour), then across dates per Hour.
    peak = label.sum(axis=1).groupby(level='Hour').sum().idxmax()
    print('peak: ',peak)
    label = label.reset_index()

    prediction = pd.read_csv('../prediction/'+hub+method+'.csv')

    # One shared zone list for both sides. Assumes (as the original indexing
    # did) int zone columns in the pivot and str zone columns in the
    # prediction file, so the two sets stay distinct after the merge.
    zoneIds = sorted(int(x) for x in prediction.columns if x not in ['Date', 'Hour','DOW'])
    predCols = [str(z) for z in zoneIds]

    # A single inner merge keeps observed and predicted values on the same
    # row, so the peak / non-peak subsets are aligned by construction.
    paired = label.merge(prediction[keyCols + predCols], on=keyCols, how='inner')

    def weightedR2(rows):
        # Variance-weighted R2 of predicted vs. observed zone counts for `rows`.
        return r2_score(rows[zoneIds].values, rows[predCols].values,
                        multioutput='variance_weighted')

    isPeak = paired['Hour']==peak
    print('R2 in peak: ',weightedR2(paired.loc[isPeak]))
    print('R2 in nonpeak: ',weightedR2(paired.loc[~isPeak]))

In [None]:
# Scratch version of the peak-hour lookup that R2inPeak performs.
# NOTE(review): `hub` is first assigned in a later cell of the visible
# notebook, so this cell fails on Restart & Run All — confirm it can be removed.
label = pd.read_csv('../processedData/'+hub+'VehicleByHour.csv')
label = getTimeSeries(label)
# label = label.reset_index()
# Total count per Hour over all zones and dates; idxmax picks the busiest hour.
peak = label.sum(axis=1).groupby(['Hour']).sum().idxmax()
print('peak: ',peak)

### Artificially selected events: R2 / MAE / RMSE per hub and method

In [23]:
hub = 'JFK'
method = 'LSTMPipeline'
r2, mae, rmse = R2inAnomaly(hub, method)
# Mode 'w' truncates Evaluation.csv, so this cell starts a fresh results file.
# Each row is "hub,metric,method,value".
with open('Evaluation.csv', 'w') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.5893650472372185
------------------------
(458, 258)
(458, 258)
Artificially selected events
------------------------
R2:  0.605358191518573
------------------------


In [24]:
hub = 'LGA'
method = 'LSTMPipeline'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.7731004334318198
------------------------
(465, 257)
(465, 257)
Artificially selected events
------------------------
R2:  0.7496395505540963
------------------------


In [25]:
hub = 'JFK'
method = 'LSTMComm24'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.5709650213048197
------------------------
(458, 258)
(458, 258)
Artificially selected events
------------------------
R2:  0.5455553897773519
------------------------


In [26]:
hub = 'LGA'
method = 'LSTMComm24'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.7234709856094016
------------------------
(458, 257)
(458, 257)
Artificially selected events
------------------------
R2:  0.7082953643446084
------------------------


In [27]:
hub = 'JFK'
method = 'Comm6'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.5617753177169981
------------------------
(468, 258)
(468, 258)
Artificially selected events
------------------------
R2:  0.50962341462421
------------------------


In [28]:
hub = 'LGA'
method = 'Comm6'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.6843443029443619
------------------------
(468, 257)
(468, 257)
Artificially selected events
------------------------
R2:  0.6672724140227766
------------------------


In [29]:
hub = 'Penn'
method = 'Comm6'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.46101699655707745
------------------------
(372, 257)
(372, 257)
Artificially selected events
------------------------
R2:  0.3694044650097103
------------------------


In [30]:
hub = 'JFK'
method = 'Comm24'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.5453822615756456
------------------------
(468, 258)
(468, 258)
Artificially selected events
------------------------
R2:  0.5141521152944809
------------------------


In [31]:
hub = 'LGA'
method = 'Comm24'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.7178714963562547
------------------------
(468, 257)
(468, 257)
Artificially selected events
------------------------
R2:  0.6976400564443659
------------------------


In [32]:
hub = 'Penn'
method = 'Comm24'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.5419649358015061
------------------------
(372, 257)
(372, 257)
Artificially selected events
------------------------
R2:  0.4433159756088794
------------------------


In [33]:
hub = 'JFK'
method = 'Boro'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.4997950041596569
------------------------
(468, 258)
(468, 258)
Artificially selected events
------------------------
R2:  0.46310697111200544
------------------------


In [34]:
hub = 'LGA'
method = 'Boro'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.627865577133875
------------------------
(468, 257)
(468, 257)
Artificially selected events
------------------------
R2:  0.6209897931812762
------------------------


In [35]:
hub = 'Penn'
method = 'Boro'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.4318355494064083
------------------------
(372, 257)
(372, 257)
Artificially selected events
------------------------
R2:  0.34165655048829874
------------------------


# NOTE(review): this cell has no execution count or output in the saved
# notebook, and the final results table has no Edgewise row — confirm
# whether it is still meant to run.
hub = 'JFK'
method = 'Edgewise'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

# NOTE(review): unlike the neighbouring cells, the returned (r2, mae, rmse)
# is not written to Evaluation.csv here, and the saved notebook shows no
# output for this cell — confirm whether that is intentional.
hub = 'LGA'
method = 'Edgewise'
R2inAnomaly(hub,method)

In [36]:
hub = 'JFK'
method = 'PCA6'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.5764443809596166
------------------------
(468, 258)
(468, 258)
Artificially selected events
------------------------
R2:  0.533139602161029
------------------------


In [37]:
# NOTE(review): the saved output of this cell (overall R2 0.5617753177169981,
# event R2 0.50962341462421) is identical to the JFK / Comm6 cell to every
# digit — verify that ../prediction/JFKPCA24.csv is not a copy of
# ../prediction/JFKComm6.csv.
hub = 'JFK'
method = 'PCA24'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.5617753177169981
------------------------
(468, 258)
(468, 258)
Artificially selected events
------------------------
R2:  0.50962341462421
------------------------


In [39]:
hub = 'LGA'
method = 'PCA6'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.7796913414374378
------------------------
(468, 257)
(468, 257)
Artificially selected events
------------------------
R2:  0.7489078454501321
------------------------


In [40]:
hub = 'LGA'
method = 'PCA24'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.7695173138947347
------------------------
(468, 257)
(468, 257)
Artificially selected events
------------------------
R2:  0.7240587111656636
------------------------


In [41]:
# Total / weekend / weekday R2 for hub 'Penn', one method at a time.
hub = 'Penn'
methods = ['PCA6','PCA24','Comm6','Comm24','Boro']
for method in methods:
    # Label the three R2 lines that R2inWeek prints for this method.
    print(method)
    R2inWeek(hub, method)

PCA6
R2 in total:  0.7067755077395412
R2 in weekend:  0.6026300096739073
R2 in weekdays:  0.725419846721056
PCA24
R2 in total:  0.7157662255507917
R2 in weekend:  0.6077416647799687
R2 in weekdays:  0.7360776282080561
Comm6
R2 in total:  0.46101699655707745
R2 in weekend:  0.3543058365983419
R2 in weekdays:  0.4684693113806167
Comm24
R2 in total:  0.5419649358015061
R2 in weekend:  0.4311973371688822
R2 in weekdays:  0.5546552431046673
Boro
R2 in total:  0.4318355494064083
R2 in weekend:  0.3386177477032331
R2 in weekdays:  0.4335911931736769


In [42]:
# Peak-hour vs. non-peak R2 for hub 'Penn', one method at a time.
# NOTE(review): the saved output shows negative peak-hour R2 for Comm6,
# Comm24 and Boro — worth a sentence of interpretation next to the result.
hub = 'Penn'
methods = ['PCA6','PCA24','Comm6','Comm24','Boro']
for method in methods:
    # Label the lines that R2inPeak prints for this method.
    print(method)
    R2inPeak(hub, method)

PCA6
peak:  19
R2 in peak:  0.19757239656785594
R2 in nonpeak:  0.709663417623836
PCA24
peak:  19
R2 in peak:  0.196070341260537
R2 in nonpeak:  0.719099469511267
Comm6
peak:  19
R2 in peak:  -0.45588800876163027
R2 in nonpeak:  0.4659352647062415
Comm24
peak:  19
R2 in peak:  -0.23652931116830653
R2 in nonpeak:  0.5461301482597625
Boro
peak:  19
R2 in peak:  -0.4798289549938089
R2 in nonpeak:  0.4358976955375159


In [43]:
hub = 'Penn'
method = 'PCA6'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.7067755077395412
------------------------
(372, 257)
(372, 257)
Artificially selected events
------------------------
R2:  0.5792372297535827
------------------------


In [45]:
hub = 'Penn'
method = 'PCA24'
r2, mae, rmse = R2inAnomaly(hub, method)
# Append this hub/method's metrics as "hub,metric,method,value" rows.
with open('Evaluation.csv', 'a') as f:
    f.writelines(
        ','.join([hub, metric, method, str(value)]) + '\n'
        for metric, value in (('r2', r2), ('mae', mae), ('rmse', rmse))
    )

overall R2
R2:  0.7157662255507917
------------------------
(372, 257)
(372, 257)
Artificially selected events
------------------------
R2:  0.5566129834836325
------------------------


In [46]:
# Evaluation.csv is written by the cells above without a header row,
# hence header=None.
eva = pd.read_csv('Evaluation.csv',header=None)

In [47]:
# Column order mirrors the order in which the evaluation cells write each
# row: hub, metric name, method, value.
eva.columns = ['Hub','Metrics','Method','Value']
eva.head()

Unnamed: 0,Hub,Metrics,Method,Value
0,JFK,r2,LSTMPipeline,0.605358
1,JFK,mae,LSTMPipeline,1.030203
2,JFK,rmse,LSTMPipeline,1.891437
3,LGA,r2,LSTMPipeline,0.74964
4,LGA,mae,LSTMPipeline,0.901782


In [49]:
# Round the metric values to two decimals for display (overwrites the column).
eva['Value'] = eva['Value'].round(decimals=2)

In [50]:
# One row per method, one column per (hub, metric). pivot_table aggregates
# with its default (mean), so duplicate (Hub, Metrics, Method) rows are averaged.
eva.pivot_table(values=['Value'], index=['Method'], columns=['Hub', 'Metrics']).reset_index()

Unnamed: 0_level_0,Method,Value,Value,Value,Value,Value,Value,Value,Value,Value
Hub,Unnamed: 1_level_1,JFK,JFK,JFK,LGA,LGA,LGA,Penn,Penn,Penn
Metrics,Unnamed: 1_level_2,mae,r2,rmse,mae,r2,rmse,mae,r2,rmse
0,Boro,1.21,0.46,2.22,1.09,0.62,2.6,1.07,0.34,3.21
1,Comm24,1.18,0.51,2.11,1.03,0.7,2.33,0.99,0.44,2.95
2,Comm6,1.2,0.51,2.12,1.05,0.67,2.44,1.04,0.37,3.14
3,LSTMComm24,1.13,0.55,2.03,0.99,0.71,2.29,,,
4,LSTMPipeline,1.03,0.61,1.89,0.9,0.75,2.12,,,
5,PCA24,1.2,0.51,2.12,1.03,0.72,2.22,0.91,0.56,2.63
6,PCA6,1.17,0.53,2.07,0.98,0.75,2.12,0.88,0.58,2.56
