In [1]:
import numpy as np
import pandas as pd
from joblib import * 
from sklearn.metrics import r2_score,mean_absolute_error,mean_absolute_percentage_error,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import geopandas as gpd
# from geopy.distance import distance,geodesic
from joblib import Parallel, delayed
import warnings
import matplotlib.pyplot as plt
import scipy
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf, pacf,adfuller, kpss,range_unit_root_test
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

In [250]:
# please download data at https://www.dropbox.com/s/uquijy335rg0kjn/date-hour-soo-dest-2022.csv.gz?dl=0
data = pd.read_csv('date-hour-soo-dest-2022.csv.gz',compression='gzip',header=None)
data.columns = ['Date', 'Hour', 'Origin', 'Destination', 'Number']

In [251]:
data['Month'] = data['Date'].apply(lambda x: x.split('-')[1])
# data = data.loc[data['Month'].isin(['09','10'])]
data['OD'] = data['Origin'] + ' - ' + data['Destination']

In [252]:
data = pd.pivot_table(data,index=['Date','Hour'],columns=['OD'],fill_value=0).reset_index()
data.columns = [i[1] if i[0]=='Number' else i[0] for i in data.columns]

In [253]:

data = data.merge(pd.DataFrame({'Date':sorted(data['Date'].unique().tolist()*24),
                         'Hour':list(range(0,24))*len(data['Date'].unique())}),
                  on=['Date','Hour'],how='outer').\
fillna(0).sort_values(by=['Date','Hour'])

data[data.columns[2:]] = data[data.columns[2:]].astype('float16')

In [None]:
fig,ax = plt.subplots()
for i in data.columns[2:]:
    plt.plot(data['Date'],data[i])

In [None]:
describe = data[data.columns[2:]].describe()
plt.plot(describe.iloc[1])

In [264]:
od_flow = data.melt(id_vars=['Date','Hour']).fillna(0)
od_flow['o'] = od_flow['variable'].apply(lambda x:x.split(' - ')[0])
od_flow['d'] = od_flow['variable'].apply(lambda x:x.split(' - ')[1])


outgoing_flow = od_flow.groupby(['Date','Hour','o']).agg({'value':sum}).reset_index()
outgoing_flow.rename(columns={'o':'station','value':'outgoing_flow'},inplace=True)
incoming_flow = od_flow.groupby(['Date','Hour','d']).agg({'value':sum}).reset_index()
incoming_flow.rename(columns={'d':'station','value':'incoming_flow'},inplace=True)

od = incoming_flow.merge(outgoing_flow,on=['Date','Hour','station'])
od = od.sort_values(by=['Date','Hour','station'])
od['DOW'] = pd.to_datetime(od['Date'])
od['DOW'] = od.DOW.dt.dayofweek
od = pd.concat([od.drop(['DOW'],axis=1),
                     pd.get_dummies(od['DOW'],prefix='dow',prefix_sep='-')],
                   axis=1)
od = pd.concat([od,
                     pd.get_dummies(od['Hour'],prefix='hour',prefix_sep='-')],
                   axis=1)
od = od.sort_values(by=['Date','Hour','station'])
od.to_csv('inoutlong.csv',index=False)

In [186]:
flow = incoming_flow.merge(outgoing_flow,on=['Date','Hour','station'])
od = flow.pivot_table(values=['incoming_flow','outgoing_flow'],index=['Date','Hour'],
                columns = 'station')
col = od.columns
od.columns = [i[0]+'-'+i[1] for i in col]

od.to_csv('inoutwide.csv')

# different methods, all tested on 09-26 to 10-02

In [2]:
def masked_MAPE(v, v_, axis=None):
    '''
    Mean absolute percentage error.
    :param v: np.ndarray or int, ground truth.
    :param v_: np.ndarray or int, prediction.
    :param axis: axis to do calculation.
    :return: int, MAPE averages on all elements of input.
    '''
    mask = (v == 0)
    percentage = np.abs(v_ - v) / np.abs(v)
    if np.any(mask):
        masked_array = np.ma.masked_array(percentage, mask=mask)  # mask the dividing-zero as invalid
        result = masked_array.mean(axis=axis)
        if isinstance(result, np.ma.MaskedArray):
            return result.filled(np.nan)
        else:
            return result
    return np.mean(percentage, axis).astype(np.float64)

In [36]:
### same as a week ago
od = pd.read_csv('inoutlong.csv')
y_pred = od.iloc[-24*7*2*50:-24*7*50,3:5]
y_test = od.iloc[-24*7*50:,3:5]


In [37]:
print(r2_score(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))
print(mean_squared_error(y_test,y_pred,squared=False))


print(np.mean(mean_squared_error(y_test,y_pred,multioutput='raw_values',squared=False)/np.std(y_test)))
y_pred = np.array(y_pred['incoming_flow'].values.tolist()+y_pred['outgoing_flow'].values.tolist())
y_test = np.array(y_test['incoming_flow'].values.tolist()+y_test['outgoing_flow'].values.tolist())
print(masked_MAPE(y_test,y_pred))


0.9489373219533752
19.21297619047619
50.05978277945785
0.22577725021459194
0.31858051751140515


# station-level incoming/outgoing

In [38]:
## lag linear regression, many to one
od = pd.read_csv('inoutlong.csv')
od = od.sort_values(by=['station','Date','Hour'])
no_lag = 24*7

for lag in range(1,no_lag+1):
    temp = od[['station','incoming_flow','outgoing_flow']].shift(lag)
    temp.columns = ['station'+'-'+str(lag),'incoming_flow'+'-'+str(lag),'outgoing_flow'+'-'+str(lag)]
    od = pd.concat([od,temp],axis=1)
    
od = od.sort_values(by=['Date','Hour','station'])
od = od.dropna()
od = od.loc[od['station']==od['station'+'-'+str(lag)]]

x = od[[col for col in od.columns if '_flow-' in col]]
y = od[['incoming_flow','outgoing_flow']]

x_train = x.iloc[:-24*50*7,:].values
y_train = y.iloc[:-24*50*7,].values

# scaler = StandardScaler()
# scaler.fit(x_train)

model = LinearRegression(fit_intercept=False).fit(x_train, y_train)

print('out of sample R2')
x_test = x.iloc[-24*50*7:,:].values
# x_test = scaler.transform(x_test)
y_test = y.iloc[-24*50*7:].values
y_pred = model.predict(x_test)

print(r2_score(y_pred,y_test))
print(mean_absolute_error(y_test,y_pred))
print(mean_squared_error(y_test,y_pred,squared=False))
print(np.mean(mean_squared_error(y_test,y_pred,multioutput='raw_values',squared=False)/np.std(y_test)))
print(masked_MAPE(y_test,y_pred))



out of sample R2
0.9698645501199167
20.590320932259978
38.16478599960493
0.17222992996315434
0.6901845375521896


In [39]:
## lag linear regression, many to one
od = pd.read_csv('inoutlong.csv')
od = od.sort_values(by=['station','Date','Hour'])
no_lag = 24*7
# 48 continuous hourly lag, 3-7 days ago same hour-of-day lag, 7 weeks ago lag
for lag in list(range(1,48))+list(np.array(list(range(3,8)))*24)+list(np.array(list(range(2,8)))*24*7):
    temp = od[['station','incoming_flow','outgoing_flow']].shift(lag)
    temp.columns = ['station'+'-'+str(lag),'incoming_flow'+'-'+str(lag),'outgoing_flow'+'-'+str(lag)]
    od = pd.concat([od,temp],axis=1)
    
od = od.sort_values(by=['Date','Hour','station'])
od = od.dropna()
od = od.loc[od['station']==od['station'+'-'+str(lag)]]

x = od[[col for col in od.columns if '_flow-' in col]]
y = od[['incoming_flow','outgoing_flow']]

x_train = x.iloc[:-24*50*7,:].values
y_train = y.iloc[:-24*50*7,].values

model = LinearRegression(fit_intercept=False).fit(x_train, y_train)

print('out of sample R2')
x_test = x.iloc[-24*50*7:,:].values
y_test = y.iloc[-24*50*7:].values
y_pred = model.predict(x_test)

print(r2_score(y_pred,y_test))
print(mean_absolute_error(y_test,y_pred))
print(mean_squared_error(y_test,y_pred,squared=False))
print(np.mean(mean_squared_error(y_test,y_pred,multioutput='raw_values',squared=False)/np.std(y_test)))
print(masked_MAPE(y_test,y_pred))


out of sample R2
0.9717966032934099
18.254343657666197
36.40979942499047
0.16431003189703514
0.548281513983181


## adding  surrounding flows

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
torch.__version__

'1.13.0'

In [406]:
# import torch_geometric
# from torch_geometric.utils import to_dense_adj

od = pd.read_csv('inoutlong.csv')

In [410]:
od.iloc[:,5:] = od.iloc[:,5:].astype('double')

In [208]:
# create adj matrix
stations = od['station'].unique()
stations_index = dict(zip(stations,range(50)))

In [209]:
yellow = ['ANTC','PCTR','PITT','NCON','CONC','PHIL','WCRK','LAFY',
        'ORIN','ROCK','MCAR','19TH','12TH','WOAK','EMBR',
        'MONT','POWL','CIVC','16TH','24TH','GLEN','BALB',
        'DALY','COLM','SSAN','SBRN','MLBR','SFIA']
orange = ['RICH','DELN','PLZA','NBRK','DBRK','ASHB','MCAR',
          '19TH','12TH','LAKE','FTVL','COLS','SANL','BAYF',
          'HAYW','SHAY','UCTY','FRMT','WARM','MLPT','BERY']
red = ['RICH','DELN','PLZA','NBRK','DBRK','ASHB','MCAR',
          '19TH','12TH','WOAK','EMBR',
        'MONT','POWL','CIVC','16TH','24TH','GLEN','BALB',
        'DALY','COLM','SSAN','SBRN','MLBR','SFIA']
blue = ['DUBL','WDUB','CAST','BAYF','SANL','COLS','FTVL',
        'LAKE','WOAK','EMBR','MONT','POWL','CIVC','16TH',
        '24TH','GLEN','BALB','DALY']
green = ['BERY','MLPT','WARM','FRMT','UCTY','SHAY','HAYW',
         'BAYF','SANL','COLS','FTVL',
        'LAKE','WOAK','EMBR','MONT','POWL','CIVC','16TH',
        '24TH','GLEN','BALB','DALY']
grey = ['COLS','OAKL']

In [210]:
adj = np.zeros([50,50])
for route in [yellow,orange,red,blue,green,grey]:
    i = 0
    for station in route:
        if i+1 < len(route):
            pair1 = stations_index[route[i]]
            pair2 = stations_index[route[i+1]]
            adj[pair1,pair2] = adj[pair1,pair2]+1
            adj[pair2,pair1] = adj[pair2,pair1]+1
            i += 1

In [211]:
connection = {}
for route in [yellow,orange,red,blue,green,grey]:
    i = 0
    for station in route:
        if i+1 < len(route):
            key = route[i]
            value = route[i+1]
            connection[key] = connection.get(key,[])+[value]
            connection[value] = connection.get(value,[])+[key]
            i += 1
for key in connection.keys():
    connection[key] = list(set(connection[key]))

In [44]:
def get_nearby_flow(x,od,connection):
    date = x['Date']
    hour = x['Hour']
    source = x['station']
    
    temp = od.loc[(od['Date']==date)&\
                                    (od['Hour']==hour)]
    nearby_flows = temp.loc[(temp['station'].isin(connection[source]))]\
                                    [['incoming_flow','outgoing_flow']].sum().values.tolist()

    
    return nearby_flows
    

In [78]:
nearby_flow_list = Parallel(n_jobs=8)(delayed(get_nearby_flow)(od.iloc[i],od,connection) for i in range(len(od)))
od[['nearby_incoming','nearby_outgoing']] = nearby_flow_list
# od['nearby_incoming'] = (od['nearby_incoming']+1e-6)/(od['incoming_flow']+1e-6)
# od['nearby_outgoing'] = (od['nearby_outgoing']+1e-6)/(od['incoming_flow']+1e-6)
od.to_csv('inoutlongNear.csv',index=False)

In [4]:
od = pd.read_csv('inoutlongNear.csv')
od['month'] = od['Date'].apply(lambda x: x.split('-')[1])
od = od.loc[od['month'].isin(['09','10'])]
del od['month']

In [417]:
od = pd.read_csv('inoutlongNear.csv')
od['month'] = od['Date'].apply(lambda x: x.split('-')[1])
od = od.loc[od['month'].isin(['09','10'])]
del od['month']
od = od.sort_values(by=['station','Date','Hour'])
no_lag = 24*7
for lag in list(range(1,168)):#+list(np.array(list(range(3,8)))*24)+list(np.array(list(range(2,8)))*24*7):
    temp = od[['station','incoming_flow','outgoing_flow','nearby_incoming','nearby_outgoing']].shift(lag)
    temp.columns = ['station'+'-'+str(lag),
                    'incoming_flow'+'-'+str(lag),'outgoing_flow'+'-'+str(lag),
                    'nearby_incoming'+'-'+str(lag),'nearby_outgoing'+'-'+str(lag)]
    od = pd.concat([od,temp],axis=1)
#     od['incoming_flow'+'-'+str(lag)] = (od['incoming_flow'+'-'+str(lag)]+1e-5)/(od['incoming_flow']+1e-5)
#     od['outgoing_flow'+'-'+str(lag)] = (od['outgoing_flow'+'-'+str(lag)]+1e-5)/(od['incoming_flow']+1e-5)
#     od['nearby_incoming'+'-'+str(lag)] = (od['nearby_incoming'+'-'+str(lag)]+1e-5)/(od['incoming_flow']+1e-5)
#     od['nearby_outgoing'+'-'+str(lag)] = (od['nearby_outgoing'+'-'+str(lag)]+1e-5)/(od['incoming_flow']+1e-5)
    
od = od.sort_values(by=['Date','Hour','station'])
od = od.dropna()
od = od.loc[od['station']==od['station'+'-'+str(lag)]]
od = od.drop(columns=[i for i in od.columns if 'station-' in i])

In [418]:
x = od[od.columns[5:]]
y = od[['incoming_flow','outgoing_flow']]

x_train = x.iloc[:-24*50*7,:].values
y_train = y.iloc[:-24*50*7,].values

model = LinearRegression(fit_intercept=False).fit(x_train, y_train)

print('out of sample R2')
x_test = x.iloc[-24*50*7:,:].values
y_test = y.iloc[-24*50*7:].values
y_pred = model.predict(x_test)

print(r2_score(y_pred,y_test))
print(mean_absolute_error(y_test,y_pred))
print(mean_squared_error(y_test,y_pred,squared=False))
print(mean_squared_error(y_test,y_pred,squared=False)/np.mean(np.var(y_pred)))
print(masked_MAPE(y_test,y_pred))



out of sample R2
0.9699310583412695
21.210490879505542
38.25263828654556
0.0007832197277821417
0.79857519427667


In [85]:
# regularization, LASSO
from sklearn.linear_model import MultiTaskLasso
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

parameters = {'alpha':np.arange(0.01,1,0.1), 'fit_intercept':[True]}
lasso = MultiTaskLasso(max_iter=1000)
lassocv = GridSearchCV(lasso, parameters,n_jobs=-1,scoring='neg_mean_absolute_error',verbose=3)
lassocv.fit(x_train, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(


[CV 1/5] END ..alpha=0.11, fit_intercept=True;, score=-12.811 total time= 4.9min
[CV 5/5] END ..alpha=0.11, fit_intercept=True;, score=-17.818 total time= 4.2min
[CV 4/5] END alpha=0.31000000000000005, fit_intercept=True;, score=-15.446 total time= 4.3min
[CV 2/5] END ..alpha=0.51, fit_intercept=True;, score=-14.722 total time=20.2min
[CV 4/5] END alpha=0.6100000000000001, fit_intercept=True;, score=-15.437 total time=24.5min
[CV 3/5] END ..alpha=0.81, fit_intercept=True;, score=-16.305 total time=24.6min
[CV 4/5] END ..alpha=0.01, fit_intercept=True;, score=-15.460 total time= 5.2min
[CV 1/5] END alpha=0.31000000000000005, fit_intercept=True;, score=-12.813 total time= 4.4min
[CV 4/5] END alpha=0.41000000000000003, fit_intercept=True;, score=-15.443 total time= 4.3min
[CV 1/5] END alpha=0.6100000000000001, fit_intercept=True;, score=-12.805 total time=21.2min
[CV 5/5] END alpha=0.7100000000000001, fit_intercept=True;, score=-17.802 total time=23.4min
[CV 4/5] END ..alpha=0.81, fit_int

  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(


[CV 5/5] END ..alpha=0.01, fit_intercept=True;, score=-17.838 total time= 5.1min
[CV 4/5] END alpha=0.21000000000000002, fit_intercept=True;, score=-15.449 total time= 4.2min
[CV 5/5] END alpha=0.31000000000000005, fit_intercept=True;, score=-17.818 total time= 4.3min
[CV 3/5] END ..alpha=0.51, fit_intercept=True;, score=-16.316 total time=20.7min
[CV 3/5] END alpha=0.7100000000000001, fit_intercept=True;, score=-16.309 total time=25.2min
[CV 2/5] END ..alpha=0.91, fit_intercept=True;, score=-14.708 total time=37.3min


  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(


[CV 1/5] END ..alpha=0.01, fit_intercept=True;, score=-12.861 total time= 5.1min
[CV 5/5] END alpha=0.21000000000000002, fit_intercept=True;, score=-17.823 total time= 4.4min
[CV 2/5] END alpha=0.41000000000000003, fit_intercept=True;, score=-14.726 total time= 4.3min
[CV 4/5] END ..alpha=0.51, fit_intercept=True;, score=-15.440 total time=20.6min
[CV 4/5] END alpha=0.7100000000000001, fit_intercept=True;, score=-15.434 total time=24.9min
[CV 1/5] END ..alpha=0.91, fit_intercept=True;, score=-12.797 total time=37.8min


  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(


[CV 2/5] END ..alpha=0.01, fit_intercept=True;, score=-14.746 total time= 5.0min
[CV 2/5] END alpha=0.21000000000000002, fit_intercept=True;, score=-14.733 total time= 4.5min
[CV 3/5] END alpha=0.41000000000000003, fit_intercept=True;, score=-16.319 total time= 4.4min
[CV 2/5] END alpha=0.6100000000000001, fit_intercept=True;, score=-14.719 total time=20.3min
[CV 1/5] END alpha=0.7100000000000001, fit_intercept=True;, score=-12.802 total time=25.4min
[CV 3/5] END ..alpha=0.91, fit_intercept=True;, score=-16.302 total time=37.6min


  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(
  ) = cd_fast.enet_coordinate_descent_multi_task(


In [400]:
y_pred = lassocv.best_estimator_.predict(x_test)
print(r2_score(y_pred,y_test))
print(mean_absolute_error(y_test,y_pred))
print(mean_squared_error(y_test,y_pred,squared=False))
print(mean_squared_error(y_test,y_pred,squared=False)/np.mean(np.var(y_pred)))
print(masked_MAPE(y_test,y_pred))


0.977831859107926
16.67924115477344
32.58685761143001
0.0006799193178248865
0.49581757448573666


In [88]:
# PCA
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()

# set the tolerance to a large value to make the example faster
regressor = LinearRegression()
pipe = Pipeline(steps=[("pca", pca), ("regressor", regressor)])

# Parameters of pipelines can be set using '__' separated parameter names:
param_grid = {
    "pca__n_components": np.arange(2,705,10),
}
search = GridSearchCV(pipe, param_grid,n_jobs=-1,scoring='neg_mean_absolute_error',verbose=3)
search.fit(x_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Fitting 5 folds for each of 71 candidates, totalling 355 fits
Best parameter (CV score=-15.442):
{'pca__n_components': 242}
[CV 1/5] END .............pca__n_components=2;, score=-54.794 total time=   8.8s
[CV 5/5] END ............pca__n_components=12;, score=-45.330 total time=   7.2s
[CV 3/5] END ............pca__n_components=32;, score=-25.062 total time=  10.1s
[CV 1/5] END ............pca__n_components=52;, score=-16.309 total time=  12.6s
[CV 4/5] END ............pca__n_components=62;, score=-19.636 total time=   9.9s
[CV 2/5] END ............pca__n_components=82;, score=-16.998 total time=  14.1s
[CV 4/5] END ............pca__n_components=92;, score=-17.516 total time=  17.2s
[CV 1/5] END ...........pca__n_components=112;, score=-13.178 total time=  15.7s
[CV 5/5] END ...........pca__n_components=122;, score=-18.614 total time=  22.5s
[CV 2/5] END ...........pca__n_components=142;, score=-15.029 total time=  23.2s
[CV 1/5] END ...........pca__n_components=162;, score=-13.023 tota

[CV 3/5] END .............pca__n_components=2;, score=-66.836 total time=   9.9s
[CV 2/5] END ............pca__n_components=22;, score=-28.641 total time=   9.5s
[CV 4/5] END ............pca__n_components=32;, score=-24.957 total time=   9.4s
[CV 2/5] END ............pca__n_components=52;, score=-18.079 total time=  11.1s
[CV 5/5] END ............pca__n_components=62;, score=-22.045 total time=  11.5s
[CV 3/5] END ............pca__n_components=82;, score=-19.743 total time=  14.1s
[CV 1/5] END ...........pca__n_components=102;, score=-13.786 total time=  16.8s
[CV 4/5] END ...........pca__n_components=112;, score=-16.341 total time=  16.7s
[CV 2/5] END ...........pca__n_components=132;, score=-15.026 total time=  24.4s
[CV 5/5] END ...........pca__n_components=142;, score=-18.561 total time=  22.2s
[CV 3/5] END ...........pca__n_components=162;, score=-16.836 total time=  26.4s
[CV 1/5] END ...........pca__n_components=182;, score=-13.029 total time=  25.8s
[CV 4/5] END ...........pca_

[CV 5/5] END .............pca__n_components=2;, score=-72.516 total time=  10.0s
[CV 3/5] END ............pca__n_components=22;, score=-32.091 total time=   9.4s
[CV 5/5] END ............pca__n_components=32;, score=-27.231 total time=  10.1s
[CV 3/5] END ............pca__n_components=52;, score=-21.005 total time=  10.5s
[CV 1/5] END ............pca__n_components=72;, score=-15.121 total time=  12.0s
[CV 4/5] END ............pca__n_components=82;, score=-18.357 total time=  14.6s
[CV 3/5] END ...........pca__n_components=102;, score=-18.405 total time=  18.7s
[CV 1/5] END ...........pca__n_components=122;, score=-13.075 total time=  17.9s
[CV 4/5] END ...........pca__n_components=132;, score=-16.135 total time=  23.6s
[CV 2/5] END ...........pca__n_components=152;, score=-15.176 total time=  28.4s
[CV 5/5] END ...........pca__n_components=162;, score=-18.375 total time=  27.3s
[CV 3/5] END ...........pca__n_components=182;, score=-16.748 total time=  28.0s
[CV 1/5] END ...........pca_

In [402]:
y_pred = search.best_estimator_.predict(x_test)
print(r2_score(y_pred,y_test))
print(mean_absolute_error(y_test,y_pred))
print(mean_squared_error(y_test,y_pred,squared=False))
print(mean_squared_error(y_test,y_pred,squared=False)/np.mean(np.var(y_pred)))
print(masked_MAPE(y_test,y_pred))


0.9778222058876295
16.706370517917016
32.59769746538075
0.0006799861722539143
0.49722418116913797


In [3]:
class VNN(nn.Module):
    def __init__(self, input_size, n_feature, output_size):
        super(VNN, self).__init__()
        self.layer1 = nn.Sequential(
        nn.Linear(input_size, n_feature),
        nn.Sigmoid(),
        nn.Linear(n_feature,n_feature),
        nn.Sigmoid(),
        nn.Linear(n_feature,n_feature),
        nn.Sigmoid(),
        nn.Linear(n_feature,n_feature),
        nn.Sigmoid(),
        nn.Linear(n_feature,n_feature),
        nn.Sigmoid(),
        nn.Linear(n_feature,output_size),
        )
 
    def forward(self, x, verbose=False):
        x = self.layer1(x)        
        return x

def get_loss_and_metrics(model, batch, criterion, device):
  # Implement forward pass and loss calculation for one batch.
  # Remember to move the batch to device.
  # 
  # Return a tuple:
  # - loss for the batch (Tensor)
  # - number of correctly classified examples in the batch (Tensor)
    data, target = batch[0], batch[1]
    data = torch.tensor(data, dtype=torch.float32)
    target = torch.tensor(target, dtype=torch.float32)
    
    data, target = data.to(device), target.to(device)
    optimizer.zero_grad()
    pred = model(data)
    loss = criterion(pred, target)
    
    
    return (pred,target,loss)
    
def step(loss, optimizer):
  # Implement backward pass and update.

  # TODO
    loss = loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


In [5]:
fts = od.iloc[:-24*50*7,5:].values
scaler = StandardScaler()
scaler.fit(fts)
fts = torch.tensor(scaler.transform(fts))
target = torch.tensor(od.iloc[:-24*50*7,3:5].values)                      
train_dataset = torch.utils.data.TensorDataset(fts,target)
                      
fts_val = torch.tensor(scaler.transform(od.iloc[-24*50*7:,5:].values))                   
target_val = torch.tensor(od.iloc[-24*50*7:,3:5].values)                      
validation_dataset = torch.utils.data.TensorDataset(fts_val, target_val)

In [6]:
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

device = torch.device("mps")

N_EPOCHS = 5000
BATCH_SIZE = 64

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=len(validation_dataset),
                                                    num_workers=0)
model = VNN(input_size=od.shape[1]-5,n_feature=64,output_size=2)
model = model.to(device)
criterion = nn.SmoothL1Loss()


In [7]:

optimizer = torch.optim.SGD(model.parameters(), lr=0.001) 

device = torch.device("mps")

train_losses = []

pbar = tqdm(range(N_EPOCHS))

validation_mae = 99999999
mean_train_loss = 999999
for i in pbar: 
    while validation_mae > 10 and mean_train_loss > 1:
        total_train_loss = 0.0
        total_train_mae = 0.0
        total_train_r2 = 0.0
        model.train()

        for batch in train_dataloader:
            y_pred,y_true,loss = get_loss_and_metrics(model, batch, criterion, device)
            step(loss.float(), optimizer)
            total_train_loss += loss.item()
#             size = y_pred.shape[0]
#             mae = mean_absolute_error(y_pred.detach().cpu().numpy(),y_true.detach().cpu().numpy())
#             r2 = r2_score(y_pred.view(size,2).detach().cpu().numpy(),y_true.view(size,2).detach().cpu().numpy())
#             total_train_mae += mae
#             total_train_r2 += r2
        for batch in validation_dataloader:
            with torch.no_grad(): 
#                 print(batch[0].shape)
                y_pred,y_true,loss = get_loss_and_metrics(model, batch, criterion, device)
#                 print(y_pred.shape)
                total_validation_loss = loss.item()
                size = y_pred.shape[0]
                validation_mae = mean_absolute_error(y_pred.view(size,2).detach().cpu().numpy()
                                                     ,y_true.view(size,2).detach().cpu().numpy())
                validation_r2 = r2_score(y_pred.view(size,2).detach().cpu().numpy(),y_true.view(size,2).detach().cpu().numpy())


        mean_train_loss = total_train_loss / len(train_dataloader)
        train_mae = total_train_mae / len(train_dataloader)
        train_r2 = total_train_r2 / len(train_dataloader)

        mean_validation_loss = total_validation_loss / len(validation_dataloader)

        pbar.set_description('train_loss:'+ str(round(mean_train_loss,4))+ \
                             ' validation_mae:'+ str(round(validation_mae,4)) +\
                            ' validation R2: '+ str(round(validation_r2,4)))

train_loss:104.2552 validation_mae:111.2601 validation R2: -110161888655.0063:   0%|                     | 0/5000 [00:06<?, ?it/s]


KeyboardInterrupt: 

In [446]:
for batch in validation_dataloader:
    with torch.no_grad(): 
        y_pred,y_true,loss = get_loss_and_metrics(model, batch, criterion, device)

In [452]:
y_true.numpy()

array([[33., 11.],
       [35., 33.],
       [57.,  4.],
       ...,
       [22., 16.],
       [ 7.,  5.],
       [71., 31.]], dtype=float32)

In [453]:
y_true,y_pred = y_true.cpu().numpy(),y_pred.cpu().numpy()
print(r2_score(y_true,y_pred))
print(mean_absolute_error(y_true,y_pred))
print(mean_squared_error(y_true,y_pred,squared=False))
print(masked_MAPE(y_true,y_pred))


0.9097482689371424
21.502193
66.24005
0.30688763313285594
