In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import os
from sklearn.mixture import GaussianMixture 
from sklearn.decomposition import PCA
import datetime
import operator
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Root of the capstone data tree. Defaults to the original local checkout;
# set the CAPSTONE_ROOT environment variable to run the notebook elsewhere.
# os.path.join(root, '') guarantees the trailing separator that the plain
# string concatenations below (and in the later cells) rely on.
CapstoneRoot = os.path.join(
    os.environ.get('CAPSTONE_ROOT', '/Users/hemingyi/Documents/capstone/'), '')

# Everything produced or consumed by the post-processing stage lives here.
PostData = CapstoneRoot + 'post/'
# Directory where pipeline() appends its evaluation records.
RecordWritingPath = PostData + 'result/'
# Directory holding the per-city transportation CSV files.
TransportationDataPath = PostData + 'transportation/'
# Directory holding the per-city '<city>Events.csv' files.
EventDataPath = PostData + 'events/'
comboPath = CapstoneRoot + 'combo/'

In [23]:
# Generate per-station daily totals for the website only.
#
# For every city the edge-level file '<city>EdgeDatewiseAggregated.csv' is
# pivoted into two date x station tables of summed 'amount': one keyed by
# 'start_id' and one keyed by 'end_id'. Missing (date, station) pairs are 0.
#
# NOTE(review): `WebData` (output path prefix) is not defined in any cell
# visible in this notebook, so this cell fails with NameError on a fresh
# kernel -- define it next to the other path constants before running.
# NOTE(review): the table keyed by 'start_id' is written to '<city>inflow.csv'
# and the one keyed by 'end_id' to '<city>outflow.csv'; this looks possibly
# swapped -- confirm the naming the website expects before changing it.
for city in ['Taipei','DC','Chicago','NewYork']:
    print(city)
    data = pd.read_csv(TransportationDataPath+city+'EdgeDatewiseAggregated.csv')
    # Pivot straight from the raw rows: pivot_table already sums duplicates,
    # so a prior groupby().sum() is redundant (and would also try to "sum"
    # any non-numeric columns on recent pandas versions).
    # aggfunc is given as the string 'sum' because passing the builtin
    # callable is deprecated in pandas.
    startPivot = pd.pivot_table(data, values='amount', index=['date'],
                                columns=['start_id'], aggfunc='sum', fill_value=0)
    endPivot = pd.pivot_table(data, values='amount', index=['date'],
                              columns=['end_id'], aggfunc='sum', fill_value=0)
    startPivot.to_csv(WebData+city+'inflow.csv')
    endPivot.to_csv(WebData+city+'outflow.csv')
    print('done')
print('all done')

Taipei
done
DC
done
Chicago
done
NewYork
done
all done


In [3]:
def anomalyDetection(y,ncom,pval = 0.2,iterN=20):
    """Flag the lowest-likelihood rows of ``y`` under a Gaussian mixture model.

    The mixture is fitted repeatedly: each round fits on the rows currently
    considered regular, scores *all* rows, and re-derives the regular set as
    the rows scoring strictly above the ``pval`` quantile threshold. The loop
    stops as soon as the regular set no longer changes or after ``iterN``
    rounds.

    Parameters
    ----------
    y : 2-D numpy array, one row per observation.
    ncom : number of mixture components passed to ``GaussianMixture``.
    pval : fraction of rows used to place the threshold; the threshold is the
        ``int(n_rows * pval)``-th smallest score. Values that would put the
        index outside ``[0, n_rows - 1]`` are clamped to that range.
    iterN : maximum number of refit rounds (must be >= 1).

    Returns
    -------
    Boolean numpy array with one entry per row of ``y``; True marks rows whose
    score is strictly below the final threshold. Rows scoring exactly at the
    threshold are excluded from the fit but are not flagged.

    Raises
    ------
    ValueError : if ``iterN`` is smaller than 1.
    """
    if iterN < 1:
        raise ValueError('iterN must be at least 1')

    n_points = y.shape[0]
    if n_points == 0:
        # Nothing to score; return an empty mask instead of failing in fit().
        return np.zeros(0, dtype=bool)

    # Position of the threshold in the sorted scores, kept inside the valid
    # index range so pval >= 1 cannot raise IndexError and a negative pval
    # cannot silently wrap around to the top of the ranking.
    cut = min(max(int(n_points * pval), 0), n_points - 1)

    # Start with every row treated as regular (non-outlier).
    regular = np.ones(n_points, dtype=bool)

    # Clustering model; fixed random_state keeps repeated runs reproducible.
    gm = GaussianMixture(n_components=ncom,n_init=100,max_iter=1000,random_state=0)
    for _ in range(iterN):
        gm.fit(y[regular, :])        # fit EM clustering model excluding outliers
        l = gm.score_samples(y)      # per-row score for every row, outliers included
        Lthres = np.sort(l)[cut]     # anomaly threshold
        previous = regular
        regular = l > Lthres         # non-anomalous rows for the next round
        if np.array_equal(regular, previous):
            break                    # regular set is stable: converged
    return l < Lthres

In [4]:
# import events data
def getEvents(EventDataPath,city):
    """Load the event calendar of one city.

    Reads ``EventDataPath + city + 'Events.csv'`` (ISO-8859-1 encoded) and
    parses its 'Date' column as datetimes.

    Parameters
    ----------
    EventDataPath : directory prefix, including the trailing path separator.
    city : city name used as the file-name prefix.

    Returns
    -------
    (lis_event, df_finalEvents) where ``lis_event`` is the list of distinct
    values of the 'Type' column in order of first appearance and
    ``df_finalEvents`` is an independent DataFrame holding only the 'Date' and
    'Type' columns.
    """
    events_data = EventDataPath+city+'Events.csv'
    # infer_datetime_format is intentionally not passed: it is deprecated in
    # pandas 2.x, where consistent-format inference is already the default.
    df_events = pd.read_csv(events_data, encoding="ISO-8859-1", parse_dates=['Date'])

    # Keep only the columns used downstream; .copy() detaches the result from
    # df_events so callers can add or overwrite columns safely.
    df_finalEvents = df_events[['Date', 'Type']].copy()

    # Distinct event types.
    lis_event = list(df_finalEvents['Type'].unique())
    return (lis_event,df_finalEvents)

In [5]:
def getTimeSeries(df):
    """Pivot edge records into one time series per (start_id, end_id) pair.

    Parameters
    ----------
    df : DataFrame with 'date', 'start_id', 'end_id' and 'amount' columns.

    Returns
    -------
    DataFrame indexed by 'date' (sorted ascending) with one column per
    (start_id, end_id) pair holding the summed 'amount'; pairs with no record
    on a date are filled with 0.
    """
    # The string 'sum' is used instead of np.sum: pandas deprecates passing
    # the NumPy callable and recommends the string alias.
    table = pd.pivot_table(
        df,
        values='amount',
        index=['date'],
        columns=['start_id','end_id'],
        aggfunc='sum',
        fill_value=0,
    )
    return table.sort_index()

In [99]:
def pipeline(city):
    """Evaluate PCA + GMM anomaly detection for one city against its event calendar.

    Steps:
      1. Load the city's events (getEvents) and '<city>InOutFlow.csv'.
      2. Log-transform the flow columns, min-max scale each column and reduce
         them with PCA (up to 16 components).
      3. Label each date as an event day when it appears in the event file.
      4. For 1..5 mixture components and thresholds 0.1..0.9 plus the observed
         event-day rate, run anomalyDetection and append one record per
         configuration to RecordWritingPath + 'InOutFlowPCA.csv'.

    Each record (no header is written) has the comma-terminated fields:
    city, method label, n components, threshold, TP, FP, TN, FN, followed by
    the detection rate for 'National Holiday', 'Culture Event' and
    'Extreme Weather' days. A rate is written as 'nan' when the city has no
    day of that type.

    Parameters
    ----------
    city : city name used as the prefix of the input file names.

    Returns
    -------
    None; results are appended to the record file.
    """
    print('Initialize')
    _, df_finalEvents = getEvents(EventDataPath,city)
    data = pd.read_csv(TransportationDataPath+city+'InOutFlow.csv', parse_dates=['date'])
    data = data.sort_values(['date'])

    # --- features: log(1 + x), per-column min-max scaling, PCA -------------
    matrix = np.log(data.drop(['date'], axis=1).values + 1)
    col_min = matrix.min(axis=0)
    col_range = matrix.max(axis=0) - col_min
    # A constant column has zero range; divide by 1 so it scales to 0 instead
    # of producing NaN, which PCA cannot handle.
    matrix = (matrix - col_min) / np.where(col_range > 0, col_range, 1)
    # PCA cannot extract more components than min(n_rows, n_columns).
    pca = PCA(n_components=min(16, *matrix.shape))
    matrix = pca.fit_transform(matrix)

    # --- labels: one row per date, flagged when the date has an event ------
    EventsDF = df_finalEvents.drop_duplicates(subset='Date', keep='first').copy()
    EventsDF['Anomaly'] = True
    EventsDF['Date'] = EventsDF['Date'].astype('str')
    date = data.date.to_frame().rename(columns={'date':'Date'})
    date['Date'] = date['Date'].astype('str')
    # Left merge keeps the (sorted) row order of `data`, so rows of df line
    # up positionally with rows of `matrix`.
    df = date.merge(EventsDF,on='Date',how='left')
    # Unmatched dates come back as NaN after the merge; matched ones are True.
    df['Anomaly'] = df['Anomaly'].notna()

    is_event = df['Anomaly'].to_numpy()
    event_types = ['National Holiday', 'Culture Event', 'Extreme Weather']
    type_masks = {event: (df['Type'] == event).to_numpy() for event in event_types}

    # Thresholds are expressed in tenths: 1..9 plus the observed share of
    # event days (scaled by 10 so the common `thres/10` below applies).
    thresholds = list(range(1,10,1)) + [10*int(is_event.sum())/len(df)]

    # The context manager guarantees the record file is closed even if a
    # model fit raises part-way through.
    with open(RecordWritingPath+'InOutFlowPCA.csv', 'a+') as f:
        for comp in [1,2,3,4,5]:
            print('n_component',comp)
            for thres in thresholds:
                th = thres/10
                outliers = np.asarray(anomalyDetection(matrix,comp,pval = th), dtype=bool)

                # Confusion matrix of detected outliers vs. event days.
                TP = int((outliers & is_event).sum())
                FP = int((outliers & ~is_event).sum())
                TN = int((~outliers & ~is_event).sum())
                FN = int((~outliers & is_event).sum())
                f.write(city+',Comm + PCA + GMM,'+str(comp)+','+str(th)+','+str(TP)+','+str(FP)+','+str(TN)+','+str(FN)+',')

                # Share of days of each event type that were flagged.
                for event in event_types:
                    of_type = type_masks[event]
                    T = int(of_type.sum())
                    hits = int((of_type & outliers).sum())
                    # Guard against cities with no day of this type.
                    TPR = hits/T if T else float('nan')
                    f.write(str(TPR)+',')
                f.write('\n')

In [110]:
# Run the full evaluation once per city, announcing each city before it starts.
for city in ('Chicago','NewYork','DC','Taipei'):
    print(city)
    pipeline(city)

NewYork
Initialize
saved pca matrix
n_component 1
n_component 2
n_component 3
n_component 4
n_component 5
