In [54]:
#add necessary libraries
import networkx as nx
import pandas as pd
import numpy as np
import os
from sklearn.mixture import GaussianMixture 
from sklearn.decomposition import PCA
import datetime
import operator
import warnings
warnings.filterwarnings('ignore')

In [55]:
# Root directories used by the notebook, given relative to the working directory.
# Each value ends with '/' because the code below builds file names by plain
# string concatenation (e.g. TransportationDataPath+'Comm/'+city+...).
RecordWritingPath = '../result/'  # not referenced in the cells visible here
TransportationDataPath = '../transportation/'  # standardize() reads its inputs here and writes under output/
EventDataPath = '../events/'  # not referenced in the cells visible here
comboPath = '../combo/'  # not referenced in the cells visible here

In [56]:
# Make sure the whole output tree exists:
#   <TransportationDataPath>/output/<method>/<size>/<stand>/
# os.makedirs creates any missing intermediate folders, and exist_ok=True makes
# the cell idempotent: it also repairs a partially created tree instead of
# skipping everything as soon as the top-level 'output' folder is present.
for method in ['Comm','IO']:
    for size in ['PCA','OriginSize']:
        for stand in ['Standardize','Whiten','Both']:
            os.makedirs(
                os.path.join(TransportationDataPath, 'output', method, size, stand),
                exist_ok=True,
            )


In [57]:
def getTimeSeries(df):
    """Pivot a long flow table into a date-indexed wide table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'date', 'start_id', 'end_id' and 'amount'.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'date' (sorted ascending), one column per
        ('start_id', 'end_id') pair. Each cell is the sum of 'amount' for that
        date and pair; combinations absent from ``df`` are filled with 0.
    """
    # The string alias 'sum' is used instead of the np.sum callable: pandas 2.1+
    # deprecates passing NumPy reducers to aggfunc, and both give the same result.
    table = pd.pivot_table(
        df,
        values='amount',
        index=['date'],
        columns=['start_id', 'end_id'],
        aggfunc='sum',
        fill_value=0,
    )
    return table.sort_index()

In [58]:
def _min_max_scale(values, axis=None):
    """Min-max scale ``values`` to [0, 1] along ``axis`` (None = whole array).

    Slices whose range is zero (max == min) are mapped to 0 rather than to the
    NaN that a 0/0 division would produce.
    """
    low = values.min(axis=axis, keepdims=True)
    span = values.max(axis=axis, keepdims=True) - low
    span = np.where(span == 0, 1, span)
    return (values - low) / span


def _write_stage(matrix, dates, city, raw, stage, DimensionReduction, n_components):
    """Write one normalisation stage to CSV, PCA-projected first when requested.

    ``matrix`` itself is never modified, so the caller can keep normalising the
    full-size data after this call.
    """
    if DimensionReduction:
        # random_state pins the result when sklearn picks a randomized solver.
        values = PCA(n_components=n_components, random_state=0).fit_transform(matrix)
        size = 'PCA'
    else:
        values = matrix
        size = 'OriginSize'
    out_dir = TransportationDataPath+'output/'+raw+'/'+size+'/'+stage+'/'
    # Create the folder on demand so this does not rely on an earlier cell.
    os.makedirs(out_dir, exist_ok=True)
    df = pd.DataFrame(values)
    df['date'] = dates
    df.to_csv(out_dir+city+raw+stage+'.csv', index=False)


def standardize(city, raw, DimensionReduction, n_components=16):
    """Normalise one city's flow data in three cumulative stages and save each.

    The input is loaded, sorted by date, and transformed with log(x + 1). Then
    three stages are applied, each on top of the previous one:

    1. 'Standardize' - min-max scaling of every column.
    2. 'Whiten'      - min-max scaling of every row of the stage-1 matrix.
    3. 'Both'        - min-max scaling of the whole stage-2 matrix.

    After each stage a CSV (features plus a trailing 'date' column) is written to
    ``TransportationDataPath/output/<raw>/<PCA|OriginSize>/<stage>/<city><raw><stage>.csv``.

    Parameters
    ----------
    city : str
        City prefix of the input file name, e.g. 'NewYork'.
    raw : str
        'Comm' reads ``Comm/<city>DateWiseComm.csv`` and pivots it with
        getTimeSeries; 'IO' reads ``<city>InOutFlow.csv`` and uses every column
        except 'date' as a feature.
    DimensionReduction : bool
        When true, each stage is projected with PCA before it is written. The
        projection is used only for the output file; later stages still work on
        the full-size normalised matrix, so the PCA files are reduced versions of
        the OriginSize files.
    n_components : int, default 16
        Number of PCA components kept when ``DimensionReduction`` is true.

    Raises
    ------
    ValueError
        If ``raw`` is neither 'Comm' nor 'IO'.
    """
    if raw == 'Comm':
        data = pd.read_csv(TransportationDataPath+'Comm/'+city+'DateWiseComm.csv')
        data = getTimeSeries(data).sort_index()
    elif raw == 'IO':
        # 'date' is left as read from the file (not parsed), so the ordering is
        # that of the raw values - assumes a sortable format such as ISO dates;
        # TODO confirm against the input files.
        data = pd.read_csv(TransportationDataPath+city+'InOutFlow.csv')
        data = data.sort_values(['date']).set_index('date')
    else:
        raise ValueError("raw must be 'Comm' or 'IO', got %r" % (raw,))

    dates = data.index
    matrix = np.log(data.values+1)

    # Stage 1: scale each column to [0, 1].
    matrix = _min_max_scale(matrix, axis=0)
    _write_stage(matrix, dates, city, raw, 'Standardize', DimensionReduction, n_components)

    # Stage 2: scale each row of the column-scaled matrix to [0, 1].
    matrix = _min_max_scale(matrix, axis=1)
    _write_stage(matrix, dates, city, raw, 'Whiten', DimensionReduction, n_components)

    # Stage 3: scale the whole matrix to [0, 1].
    # NOTE(review): after stage 2 every non-constant row already spans [0, 1],
    # so this step leaves the matrix unchanged unless all rows are constant.
    matrix = _min_max_scale(matrix)
    _write_stage(matrix, dates, city, raw, 'Both', DimensionReduction, n_components)

    print(city,raw,'PCA',DimensionReduction,'done')

In [59]:
# Run every (input kind, PCA on/off) combination for each city, in the order
# Comm/False, Comm/True, IO/False, IO/True.
for city in ['NewYork','Taipei']:
    print(city)
    for flow_kind in ('Comm', 'IO'):
        for reduce_dims in (False, True):
            standardize(city, flow_kind, DimensionReduction=reduce_dims)

NewYork
NewYork Comm PCA False done
NewYork Comm PCA True done
NewYork IO PCA False done
NewYork IO PCA True done
Taipei
Taipei Comm PCA False done
Taipei Comm PCA True done
Taipei IO PCA False done
Taipei IO PCA True done
