In [46]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score,mean_absolute_error,mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import geopandas as gpd
from geopy.distance import distance,geodesic
import swifter
from joblib import Parallel, delayed


In [47]:
data = pd.read_csv('date-hour-soo-dest-2022.csv.gz',compression='gzip',header=None)
data.columns = ['Date', 'Hour', 'Origin', 'Destination', 'Number']

In [48]:
data['Month'] = data['Date'].apply(lambda x: x.split('-')[1])
data = data.loc[data['Month'].isin(['09','10'])]
data['OD'] = data['Origin'] + ' - ' + data['Destination']

In [49]:
data = pd.pivot_table(data,index=['Date','Hour'],columns=['OD'],fill_value=0).reset_index()
data.columns = [i[1] if i[0]=='Number' else i[0] for i in data.columns]

In [51]:

data = data.merge(pd.DataFrame({'Date':sorted(data['Date'].unique().tolist()*24),
                         'Hour':list(range(0,24))*len(data['Date'].unique())}),
                  on=['Date','Hour'],how='outer').\
fillna(0).sort_values(by=['Date','Hour'])

data[data.columns[2:]] = data[data.columns[2:]].astype('float16')

In [52]:
od_flow = data.melt(id_vars=['Date','Hour']).fillna(0)
od_flow['o'] = od_flow['variable'].apply(lambda x:x.split(' - ')[0])
od_flow['d'] = od_flow['variable'].apply(lambda x:x.split(' - ')[1])


outgoing_flow = od_flow.groupby(['Date','Hour','o']).agg({'value':sum}).reset_index()
outgoing_flow.rename(columns={'o':'station','value':'outgoing_flow'},inplace=True)
incoming_flow = od_flow.groupby(['Date','Hour','d']).agg({'value':sum}).reset_index()
incoming_flow.rename(columns={'d':'station','value':'incoming_flow'},inplace=True)

flow = incoming_flow.merge(outgoing_flow,on=['Date','Hour','station'])
flow.pivot_table(values=['incoming_flow','outgoing_flow'],index=['Date','Hour'],
                columns = 'station').\
to_csv('/home/mingyi/Desktop/Dropbox/2022Fall/StemGNN/dataset/inout.csv',header=False,
      index=False)

# different methods, all tested on 10-02

In [53]:
### same as 1 hours before
print(r2_score(data.iloc[-25:-1,2:],data.iloc[-24:,2:],multioutput='variance_weighted'))
print(r2_score(data.iloc[-25:-1,2:],data.iloc[-24:,2:]))

0.2788768235992482
-0.4586509667001619


In [54]:
### same as 24 hour ago
print(r2_score(data.iloc[-48:-24,2:],data.iloc[-24:,2:],multioutput='variance_weighted'))
print(r2_score(data.iloc[-48:-24,2:],data.iloc[-24:,2:]))

0.39728020854757473
-0.5277207904863108


In [37]:
### lag linear regression, many to one
data_melt = data.melt(id_vars=['Date','Hour']).fillna(0)
 
no_lag = 24
for lag in range(1,no_lag+1):
    temp = data_melt[['variable','value']].shift(lag)
    temp.columns = ['variable'+'-'+str(lag),'value'+'-'+str(lag)]
    data_melt = pd.concat([data_melt,temp],axis=1)

data_melt = data_melt.dropna()
data_melt = data_melt.loc[data_melt['variable']==data_melt['variable'+'-'+str(lag)]]

x = data_melt[[col for col in data_melt.columns if 'value-' in col]]
y = data_melt['value']

x_train = x.iloc[1:-24*2500,:].values
y_train = y.iloc[1:-24*2500,].values.reshape(-1, 1)

scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)

model = LinearRegression(fit_intercept=False).fit(x_train, y_train)

print('out of sample R2')
x_test = x.iloc[-24*2500:,:].values
y_test = y.iloc[-24*2500:].values.reshape(-1, 1)

x_test = scaler.transform(x_test)
y_pred = model.predict(x_test)

print(r2_score(y_pred,y_test))
print(r2_score(y_pred.reshape(24,2500),y_test.reshape(24,2500),multioutput='variance_weighted'))


out of sample R2
0.6834917189488803
0.6724154199774338


In [38]:
### lag linear regression, many to many

no_lag = 24

x = pd.DataFrame()
for lag in range(1,no_lag+1):
    x = pd.concat([x,data.drop(columns=['Date','Hour']).shift(lag)],axis=1)

x_train = x.iloc[no_lag:-24,:].values
y_train = data.drop(columns=['Date','Hour']).iloc[no_lag:-24,:].values

scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)

model = LinearRegression().fit(x_train, y_train)

print('out of sample R2')
x_test = x.iloc[-24:,:].values
y_test = data.drop(columns=['Date','Hour']).iloc[-24:,:].values

x_test = scaler.transform(x_test)
y_pred = model.predict(x_test)
print(r2_score(y_pred,y_test))
print(r2_score(y_pred,y_test,multioutput='variance_weighted'))

out of sample R2
-5.640509084550648
0.23011635281401716


## adding D-O, and lag

In [83]:

od_flow = data.melt(id_vars=['Date','Hour']).fillna(0)
od_flow['o'] = od_flow['variable'].apply(lambda x:x.split(' - ')[0])
od_flow['d'] = od_flow['variable'].apply(lambda x:x.split(' - ')[1])
od_flow = od_flow.merge(od_flow[['Date','Hour','o','d','value']],
                        left_on=['Date','Hour','o','d'],
                        right_on=['Date','Hour','d','o'])
od_flow = od_flow[['Date', 'Hour', 'variable', 'value_x', 'value_y']]
od_flow.columns = ['Date','Hour','variable','value','r_value']
# normalization
od_flow['r_value'] = (od_flow['r_value']+0.01)/(od_flow['value']+0.01)

In [84]:
### lag linear regression,
 
no_lag = 24
for lag in range(1,no_lag+1):
    temp = od_flow[['variable','value','r_value']].shift(lag)
    temp.columns = ['variable'+'-'+str(lag),'value'+'-'+str(lag),'r_value'+'-'+str(lag)]
    od_flow = pd.concat([od_flow,temp],axis=1)

od_flow = od_flow.dropna()
od_flow = od_flow.loc[od_flow['variable']==od_flow['variable'+'-'+str(lag)]]

x = od_flow[[col for col in od_flow.columns if '-' in col and 'variable' not in col]]
y = od_flow['value']

# train on 09-01 to 10-01
x_train = x.iloc[1:-24*2500,:].values
y_train = y.iloc[1:-24*2500,].values.reshape(-1, 1)

# scaler = StandardScaler()
# scaler.fit(x_train)
# x_train = scaler.transform(x_train)

model = LinearRegression(fit_intercept=False).fit(x_train, y_train)

print('out of sample R2')

#test on 10-02
x_test = x.iloc[-24*2500:,:].values
y_test = y.iloc[-24*2500:].values.reshape(-1, 1)

# x_test = scaler.transform(x_test)
y_pred = model.predict(x_test)

print(r2_score(y_pred,y_test))
print(r2_score(y_pred.reshape(24,2500),y_test.reshape(24,2500),multioutput='variance_weighted'))


out of sample R2
0.7867947619965633
0.7793217418795463


In [85]:
mean_absolute_error(y_pred.reshape(24,2500),y_test.reshape(24,2500))

0.96410334

In [86]:
mean_absolute_percentage_error(y_pred.reshape(24,2500),y_test.reshape(24,2500))

8.0011425

## add inflow to O, DO flows, and lag

In [112]:

od_flow = data.melt(id_vars=['Date','Hour']).fillna(0)
od_flow['o'] = od_flow['variable'].apply(lambda x:x.split(' - ')[0])
od_flow['d'] = od_flow['variable'].apply(lambda x:x.split(' - ')[1])


outgoing_flow = od_flow.groupby(['Date','Hour','o']).agg({'value':sum}).reset_index()
outgoing_flow.rename(columns={'o':'station','value':'outgoing_flow'},inplace=True)
incoming_flow = od_flow.groupby(['Date','Hour','d']).agg({'value':sum}).reset_index()
incoming_flow.rename(columns={'d':'station','value':'incoming_flow'},inplace=True)

od_flow = od_flow.merge(incoming_flow,left_on=['Date','Hour','o'],right_on=['Date','Hour','station'])

od_flow['incoming_flow'] = (od_flow['incoming_flow']+0.1)/(od_flow['value']+0.1)


od_flow = od_flow.merge(od_flow[['Date','Hour','o','d','value']],
                        left_on=['Date','Hour','o','d'],
                        right_on=['Date','Hour','d','o'])
od_flow = od_flow[['Date', 'Hour', 'variable', 'incoming_flow','value_x', 'value_y']]
od_flow.columns = ['Date','Hour','variable','incoming_flow','value','r_value']

od_flow['r_value'] = (od_flow['r_value']+0.01)/(od_flow['value']+0.01)



In [115]:
### lag linear regression,
od_flow = od_flow.sort_values(by=['variable','Date','Hour'])
no_lag = 24
for lag in range(1,no_lag+1):
    temp = od_flow[['variable','value','incoming_flow']].shift(lag)
    temp.columns = ['variable'+'-'+str(lag),'value'+'-'+str(lag),'incoming_flow'+'-'+str(lag)]
    od_flow = pd.concat([od_flow,temp],axis=1)

od_flow = od_flow.dropna()
od_flow = od_flow.loc[od_flow['variable']==od_flow['variable'+'-'+str(lag)]]

x = od_flow[[col for col in od_flow.columns if '-' in col and 'variable' not in col]]
y = od_flow['value']

x_train = x.iloc[1:-24*2500,:].values
y_train = y.iloc[1:-24*2500,].values.reshape(-1, 1)

model = LinearRegression(fit_intercept=False).fit(x_train, y_train)

print('out of sample R2')
x_test = x.iloc[-24*2500:,:].values
y_test = y.iloc[-24*2500:].values.reshape(-1, 1)

y_pred = model.predict(x_test)

print(r2_score(y_pred,y_test))
print(r2_score(y_pred.reshape(24,2500),y_test.reshape(24,2500),multioutput='variance_weighted'))


out of sample R2
0.7875516294597433
0.780110770769808


In [116]:
mean_absolute_error(y_pred.reshape(24,2500),y_test.reshape(24,2500))

0.98998946

In [117]:
mean_absolute_percentage_error(y_pred.reshape(24,2500),y_test.reshape(24,2500))

4.8599854

## add Incoming/Outgoing flow of OD, and lag


In [118]:
od_flow = data.melt(id_vars=['Date','Hour']).fillna(0)
od_flow['o'] = od_flow['variable'].apply(lambda x:x.split(' - ')[0])
od_flow['d'] = od_flow['variable'].apply(lambda x:x.split(' - ')[1])
outgoing_flow = od_flow.groupby(['Date','Hour','o']).agg({'value':sum}).reset_index()
outgoing_flow.rename(columns={'o':'station','value':'outgoing_flow'},inplace=True)
incoming_flow = od_flow.groupby(['Date','Hour','d']).agg({'value':sum}).reset_index()
incoming_flow.rename(columns={'d':'station','value':'incoming_flow'},inplace=True)

In [175]:
od_flow = od_flow.merge(incoming_flow,left_on=['Date','Hour','o'],right_on=['Date','Hour','station'])
od_flow.rename(columns={'incoming_flow':'o_incoming_flow'},inplace=True)
del od_flow['station']
od_flow = od_flow.merge(incoming_flow,left_on=['Date','Hour','d'],right_on=['Date','Hour','station'])
od_flow.rename(columns={'incoming_flow':'d_incoming_flow'},inplace=True)
del od_flow['station']
od_flow = od_flow.merge(outgoing_flow,left_on=['Date','Hour','o'],right_on=['Date','Hour','station'])
od_flow.rename(columns={'outgoing_flow':'o_outgoing_flow'},inplace=True)
del od_flow['station']
od_flow = od_flow.merge(outgoing_flow,left_on=['Date','Hour','d'],right_on=['Date','Hour','station'])
od_flow.rename(columns={'outgoing_flow':'d_outgoing_flow'},inplace=True)
del od_flow['station']

od_flow['o_incoming_flow'] = (od_flow['o_incoming_flow']+0.1)/(od_flow['value']+0.1)
od_flow['d_incoming_flow'] = (od_flow['d_incoming_flow']+0.1)/(od_flow['value']+0.1)
od_flow['o_outgoing_flow'] = (od_flow['o_outgoing_flow']+0.1)/(od_flow['value']+0.1)
od_flow['d_outgoing_flow'] = (od_flow['d_outgoing_flow']+0.1)/(od_flow['value']+0.1)


od_flow = od_flow.sort_values(by=['variable','Date','Hour'])

In [176]:
### lag linear regression,
 
no_lag = 24
selected_variables = ['variable','value',
                    'o_incoming_flow','d_incoming_flow',
                   'o_outgoing_flow','d_outgoing_flow']
for lag in range(1,no_lag+1):
    temp = od_flow[selected_variables].shift(lag)
    temp.columns = [i+'-'+str(lag) for i in selected_variables]
    od_flow = pd.concat([od_flow,temp],axis=1)

od_flow = od_flow.dropna()
od_flow = od_flow.loc[od_flow['variable']==od_flow['variable'+'-'+str(lag)]]

x = od_flow[[col for col in od_flow.columns if '-' in col and 'variable' not in col]]
y = od_flow['value']

x_train = x.iloc[1:-24*2500,:].values
y_train = y.iloc[1:-24*2500,].values.reshape(-1, 1)

model = LinearRegression(fit_intercept=False).fit(x_train, y_train)

print('out of sample R2')
x_test = x.iloc[-24*2500:,:].values
y_test = y.iloc[-24*2500:].values.reshape(-1, 1)

y_pred = model.predict(x_test)

print(r2_score(y_pred,y_test))
print(r2_score(y_pred.reshape(24,2500),y_test.reshape(24,2500),multioutput='variance_weighted'))


out of sample R2
0.7681850454681229
0.760741038231203


## add spatial and temporal lag

In [37]:
import fiona

gpd.io.file.fiona.drvsupport.supported_drivers['KML'] = 'rw'
gdf = gpd.read_file('doc.kml', driver='KML')

station_name = pd.read_excel('Station_Names.xls')

In [38]:
station_latlon = dict(zip(gdf['Name'],zip(gdf.geometry.y,gdf.geometry.x)))
station_latlon

{'Milpitas': (37.41028235996109, -121.8910940840804),
 'Berryessa/North San Jose': (37.36847828910375, -121.8746938431299),
 'Civic Center/UN Plaza': (37.77939386076292, -122.4139364152155),
 'Concord': (37.97382498587121, -122.0290620224699),
 'Colma': (37.68462160749755, -122.4662231559497),
 'El Cerrito Plaza': (37.90263522650326, -122.2989484071957),
 'Daly City': (37.70633321337765, -122.4689119071663),
 'North Concord/Martinez': (38.00332451679782, -122.0245523142489),
 'Fremont': (37.55747442735056, -121.9766263951319),
 'Glen Park': (37.73311239451958, -122.4337900883828),
 'Hayward': (37.66969738355485, -122.0869599532775),
 'El Cerrito del Norte': (37.92519241672761, -122.3169151214355),
 'Coliseum': (37.75368874735485, -122.1968592134024),
 'Downtown Berkeley': (37.870316112485, -122.2681588195891),
 'Bay Fair': (37.69691620335457, -122.1264596293894),
 'Balboa Park': (37.72135058802746, -122.4475873574918),
 'San Bruno': (37.63770374268261, -122.416266663064),
 'South Haywa

In [39]:
code_name = {'12TH':'12th St/Oakland City Center',
 '16TH':'16th St/Mission',
 '19TH':'19th St/Oakland',
 '24TH':'24th St/Mission',
 'ANTC':'Antioch',
 'ASHB':'Ashby',
 'BALB':'Balboa Park',
 'BAYF':'Bay Fair',
 'BERY':'Berryessa/North San Jose',
 'CAST':'Castro Valley',
 'CIVC':'Civic Center/UN Plaza',
 'COLM':'Colma',
 'COLS':'Coliseum/Airport Connector',
 'CONC':'Concord',
 'DALY':'Daly City',
 'DBRK':'Downtown Berkeley',
 'DELN':'El Cerrito del Norte',
 'DUBL':'Dublin/Pleasanton',
 'EMBR':'Embarcadero',
 'FRMT':'Fremont',
 'FTVL':'Fruitvale',
 'GLEN':'Glen Park',
 'HAYW':'Hayward',
 'LAFY':'Lafayette',
 'LAKE':'Lake Merritt',
 'MCAR':'MacArthur',
 'MLBR':'Millbrae',
 'MLPT':'Milpitas',
 'MONT':'Montgomery St',
 'NBRK':'North Berkeley',
 'NCON':'North Concord/Martinez',
 'OAKL':'Oakland International Airport',
 'ORIN':'Orinda',
 'PCTR':'Pittsburg Center',
 'PHIL':'Pleasant Hill/Contra Costa Centre',
 'PITT':'Pittsburg/Bay Point',
 'PLZA':'El Cerrito Plaza',
 'POWL':'Powell St',
 'RICH':'Richmond',
 'ROCK':'Rockridge',
 'SANL':'San Leandro',
 'SBRN':'San Bruno',
 'SFIA':'San Francisco International Airport',
 'SHAY':'South Hayward',
 'SSAN':'South San Francisco',
 'UCTY':'Union City',
 'WARM':'Warm Springs/South Fremont',
 'WCRK':'Walnut Creek',
 'WDUB':'West Dublin/Pleasanton',
 'WOAK':'West Oakland'}

In [40]:
station_distance = pd.DataFrame()
pair_list = []
o_list = []
d_list = []
distance_list = []
for o in code_name.keys():
    for d in code_name.keys():
        o_name = code_name[o]
        d_name = code_name[d]
        pair = o + ' - ' + d
        pair_list += [pair]
        o_list += [o]
        d_list += [d]
        dist = geodesic(station_latlon[o_name],station_latlon[d_name]).miles
        distance_list += [dist]
station_distance['o'] = o_list
station_distance['d'] = d_list
station_distance['pair'] = pair_list
station_distance['dist'] = distance_list

## O/D incoming/outgoing flow within 1 mi

In [41]:
station_distance_1mi = station_distance.loc[(station_distance['dist']<1)&(station_distance['dist']>0)]
station_distance_1mi.head()

Unnamed: 0,o,d,pair,dist
2,12TH,19TH,12TH - 19TH,0.376426
24,12TH,LAKE,12TH - LAKE,0.559623
53,16TH,24TH,16TH - 24TH,0.886955
100,19TH,12TH,19TH - 12TH,0.376426
124,19TH,LAKE,19TH - LAKE,0.794901


In [89]:
od_flow = data.melt(id_vars=['Date','Hour']).fillna(0)
od_flow['o'] = od_flow['variable'].apply(lambda x:x.split(' - ')[0])
od_flow['d'] = od_flow['variable'].apply(lambda x:x.split(' - ')[1])
outgoing_flow = od_flow.groupby(['Date','Hour','o']).agg({'value':sum}).reset_index()
outgoing_flow.rename(columns={'o':'station','value':'outgoing_flow'},inplace=True)
incoming_flow = od_flow.groupby(['Date','Hour','d']).agg({'value':sum}).reset_index()
incoming_flow.rename(columns={'d':'station','value':'incoming_flow'},inplace=True)

od_flow = od_flow.merge(incoming_flow,left_on=['Date','Hour','o'],right_on=['Date','Hour','station'])
od_flow.rename(columns={'incoming_flow':'o_incoming_flow'},inplace=True)
del od_flow['station']
od_flow = od_flow.merge(incoming_flow,left_on=['Date','Hour','d'],right_on=['Date','Hour','station'])
od_flow.rename(columns={'incoming_flow':'d_incoming_flow'},inplace=True)
del od_flow['station']
od_flow = od_flow.merge(outgoing_flow,left_on=['Date','Hour','o'],right_on=['Date','Hour','station'])
od_flow.rename(columns={'outgoing_flow':'o_outgoing_flow'},inplace=True)
del od_flow['station']
od_flow = od_flow.merge(outgoing_flow,left_on=['Date','Hour','d'],right_on=['Date','Hour','station'])
od_flow.rename(columns={'outgoing_flow':'d_outgoing_flow'},inplace=True)
del od_flow['station']

In [96]:
inoutflow = incoming_flow.merge(outgoing_flow,on=['Date','Hour','station'])

In [42]:
nearby_stations = {}
for station in code_name.keys():
    nearby = station_distance_1mi.loc[station_distance_1mi['o']==station]['d'].values.tolist()
    nearby_stations[station] = nearby


In [97]:
inoutflow.head()

Unnamed: 0,Date,Hour,station,incoming_flow,outgoing_flow
0,2022-09-01,0,12TH,21,7
1,2022-09-01,0,16TH,38,85
2,2022-09-01,0,19TH,17,16
3,2022-09-01,0,24TH,48,24
4,2022-09-01,0,ANTC,29,5


In [107]:
def get_nearby_flow(x,inoutflow,nearby_stations):
    date = x['Date']
    hour = x['Hour']
    o = x['o']
    d = x['d']
    temp = inoutflow.loc[(inoutflow['Date']==date)&\
                                    (inoutflow['Hour']==hour)]
    o_nearby_flows = temp.loc[(temp['station'].isin(nearby_stations[o]))]\
                                    [['incoming_flow','outgoing_flow']].sum().values.tolist()
    d_nearby_flows = temp.loc[(temp['station'].isin(nearby_stations[d]))]\
                                    [['incoming_flow','outgoing_flow']].sum().values.tolist()
    
    return o_nearby_flows+d_nearby_flows
    

In [109]:
nearby_flow_list = Parallel(n_jobs=8)(delayed(get_nearby_flow)(od_flow.iloc[i],inoutflow,nearby_stations) for i in range(len(od_flow)))

In [111]:
od_flow[['o_nearby_incoming_list_1mi','o_nearby_outgoing_list_1mi',
         'd_nearby_incoming_list_1mi','d_nearby_outgoing_list_1mi']] = nearby_flow_list

In [114]:
od_flow.to_csv('processed_time_spatial_lag.csv',index=False)

In [177]:
od_flow = pd.read_csv('processed_time_spatial_lag.csv') 

for col in ['o_incoming_flow',
       'd_incoming_flow', 'o_outgoing_flow', 'd_outgoing_flow',
       'o_nearby_incoming_list_1mi', 'o_nearby_outgoing_list_1mi',
       'd_nearby_incoming_list_1mi', 'd_nearby_outgoing_list_1mi']:
    od_flow[col] = (od_flow[col]+0.1)/(od_flow['value']+0.1)
od_flow = od_flow.sort_values(by=['variable','Date','Hour'])

In [178]:
target = ['value', 'o_incoming_flow',
       'd_incoming_flow', 'o_outgoing_flow', 'd_outgoing_flow',
       'o_nearby_incoming_list_1mi', 'o_nearby_outgoing_list_1mi',
       'd_nearby_incoming_list_1mi', 'd_nearby_outgoing_list_1mi']

In [179]:
### time lag linear regression, many to one
# lag_target = ['variable', 'value', 'o_incoming_flow',
#        'd_incoming_flow', 'o_outgoing_flow', 'd_outgoing_flow',
#        'o_nearby_incoming_list_1mi', 'o_nearby_outgoing_list_1mi',
#        'd_nearby_incoming_list_1mi', 'd_nearby_outgoing_list_1mi']

lag_target = ['variable', 'value', 'o_incoming_flow',      
       'o_nearby_incoming_list_1mi']
no_lag = 24
for lag in range(1,no_lag+1):
    temp = od_flow[lag_target].shift(lag)
    temp.columns = [i+'-lag-'+str(lag) for i in lag_target]
    temp[['Date','Hour']] = od_flow[['Date','Hour']]
    od_flow = temp.merge(od_flow,left_on=['Date','Hour','variable-lag-'+str(lag)],
                         right_on=['Date','Hour','variable'])

od_flow = od_flow.dropna()

In [180]:
x_target = [col for col in od_flow.columns if '-' in col and 'variable' not in col]
x = od_flow[x_target]
y = od_flow['value']

x_train = x.iloc[:-24*2500,:].values
y_train = y.iloc[:-24*2500,].values.reshape(-1, 1)

# scaler = StandardScaler()
# scaler.fit(x_train)
# x_train = scaler.transform(x_train)

model = LinearRegression(fit_intercept=False).fit(x_train, y_train)

print('out of sample R2')
x_test = x.iloc[-24*2500:,:].values
y_test = y.iloc[-24*2500:].values.reshape(-1, 1)

# x_test = scaler.transform(x_test)
y_pred = model.predict(x_test)

print(r2_score(y_pred,y_test))
print(r2_score(y_pred.reshape(24,2500),y_test.reshape(24,2500),multioutput='variance_weighted'))


out of sample R2
0.7420928473018202
0.729924210378433


#### incoming/outgoing of O/D and nearby OD
lag 24, variance weighted R2 = 0.63

lag 12, variance weighted R2 = 0.67

lag 6, variance weighted R2 = 0.669

lag 3, variance weighted R2 = 0.700

## O/D flow start and end within 1 mi

In [43]:
od_flow = data.melt(id_vars=['Date','Hour']).fillna(0)
od_flow['o'] = od_flow['variable'].apply(lambda x:x.split(' - ')[0])
od_flow['d'] = od_flow['variable'].apply(lambda x:x.split(' - ')[1])
outgoing_flow = od_flow.groupby(['Date','Hour','o']).agg({'value':sum}).reset_index()
outgoing_flow.rename(columns={'o':'station','value':'outgoing_flow'},inplace=True)
incoming_flow = od_flow.groupby(['Date','Hour','d']).agg({'value':sum}).reset_index()
incoming_flow.rename(columns={'d':'station','value':'incoming_flow'},inplace=True)

od_flow = od_flow.merge(incoming_flow,left_on=['Date','Hour','o'],right_on=['Date','Hour','station'])
od_flow.rename(columns={'incoming_flow':'o_incoming_flow'},inplace=True)
del od_flow['station']
od_flow = od_flow.merge(incoming_flow,left_on=['Date','Hour','d'],right_on=['Date','Hour','station'])
od_flow.rename(columns={'incoming_flow':'d_incoming_flow'},inplace=True)
del od_flow['station']
od_flow = od_flow.merge(outgoing_flow,left_on=['Date','Hour','o'],right_on=['Date','Hour','station'])
od_flow.rename(columns={'outgoing_flow':'o_outgoing_flow'},inplace=True)
del od_flow['station']
od_flow = od_flow.merge(outgoing_flow,left_on=['Date','Hour','d'],right_on=['Date','Hour','station'])
od_flow.rename(columns={'outgoing_flow':'d_outgoing_flow'},inplace=True)
del od_flow['station']

In [44]:
def get_nearby_odflow(x,od_flow,nearby_stations):
    date = x['Date']
    hour = x['Hour']
    o = x['o']
    d = x['d']
    temp = od_flow.loc[(od_flow['Date']==date)&\
                                    (od_flow['Hour']==hour)]
    flows = temp.loc[(temp['o'].isin(nearby_stations[o]))&(temp['d'].isin(nearby_stations[d]))]\
                                [['value']].sum().values[0]

    
    return flows

In [None]:
# this function have run more than 7 days on my PC
# I just give it up
nearby_odflow_list = Parallel(n_jobs=4)(delayed(get_nearby_odflow)(od_flow.iloc[i],od_flow,nearby_stations) for i in range(len(od_flow)))

In [None]:
od_flow['nearby_odflow'] = nearby_odflow_list
od_flow['nearby_odflow'] = (od_flow['nearby_odflow']+0.0001)/(od_flow['value']+0.0001)