In [2]:
# Move the working directory one level up so that the `utils` and `train` modules
# imported in the next cell can be resolved from the current directory.
# NOTE(review): the path is relative, so every additional run of this cell climbs one
# more directory; restart the kernel before re-running the notebook.
%cd ..

/Users/haowu/Desktop/Transportation Research/Traffic-Prediction/model_v2


In [3]:
import numpy as np
import pickle
import os
import xgboost
from xgboost import XGBRegressor, plot_importance

from sklearn.linear_model import MultiTaskLasso, MultiTaskLassoCV, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, PredefinedSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

from tqdm import tqdm
from joblib import parallel_backend

from utils import seed_torch
from train import create_parser

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Number of logical CPUs reported by the OS (None if it cannot be determined);
# useful context for the n_jobs value passed to the grid search later on.
os.cpu_count()

10

In [5]:
# Reuse the argument parser defined in the project's `train` module.
parser = create_parser()
# Pass an explicit empty list to parse_args. A bare parser.parse_args() reads sys.argv,
# which inside a Jupyter kernel holds the kernel's own launch arguments, so argparse
# would exit with an error. With [] every option keeps its default value.
# See https://stackoverflow.com/questions/50360012/python-argparse-error-error-argument-count-invalid-int-value for more details
args = parser.parse_args([])

In [6]:
# For reproducibility.
# NOTE(review): seed_torch comes from the project's `utils` module; presumably it seeds
# the Python / NumPy / torch random number generators — confirm against its definition.
seed_torch(args.seed)  # seed 912

In [7]:
# Load the six pre-split 5-minute arrays from args.data_dir.
# The file names are listed in the same order as the target names on the left,
# so each array is bound to the variable that shares its file stem.
(
    np_train_in_5min,
    np_train_out_5min,
    np_val_in_5min,
    np_val_out_5min,
    np_test_in_5min,
    np_test_out_5min,
) = (
    np.load(f"{args.data_dir}/{file_name}")
    for file_name in (
        "np_train_in_5min.npy",   # (14760, 7, 1470)
        "np_train_out_5min.npy",  # (14760, 7, 207, 4)
        "np_val_in_5min.npy",     # (4320, 7, 1470)
        "np_val_out_5min.npy",    # (4320, 7, 207, 4)
        "np_test_in_5min.npy",    # (1980, 7, 1470)
        "np_test_out_5min.npy",   # (1980, 7, 207, 4)
    )
)

In [8]:
# For every output array, drop index 0 along axis 1 and keep only index 0 of the last
# axis, turning (N, 7, 207, 4) into (N, 6, 207).
# Resulting shapes: train (14760, 6, 207), val (4320, 6, 207), test (1980, 6, 207).
np_train_out_5min, np_val_out_5min, np_test_out_5min = (
    out_5min[:, 1:, :, 0]
    for out_5min in (np_train_out_5min, np_val_out_5min, np_test_out_5min)
)

In [9]:
# Merge the training and validation arrays into a single array used for fitting.
# The train/validation boundary is not lost: it is handed to GridSearchCV separately
# through a PredefinedSplit, so the merged array is all the estimator needs.
# NOTE(review): the previous comment described multitask-LASSO with built-in
# cross-validation, but the model fitted in this notebook is an XGBRegressor tuned
# with GridSearchCV.
# NOTE(review): this cell rebinds its own inputs, so running it a second time appends
# the validation rows again; restart the kernel before re-running.
np_train_in_5min = np.concatenate([np_train_in_5min, np_val_in_5min], axis=0)  # (19080, 7, 1470)
np_train_out_5min = np.concatenate([np_train_out_5min, np_val_out_5min], axis=0) # (19080, 6, 207)

In [10]:
# Build the fold labels that PredefinedSplit expects for the merged arrays:
#   -1 -> row is never placed in a validation fold (always used for training)
#    0 -> row belongs to the single validation fold
# The fold sizes are derived from the arrays instead of being hard-coded
# (previously 14760 and 4320), so the split stays correct if the dataset changes.
# Assumes the validation rows were appended exactly once, i.e. they are the trailing
# np_val_in_5min.shape[0] rows of np_train_in_5min.
n_val = np_val_in_5min.shape[0]
n_train = np_train_in_5min.shape[0] - n_val
assert n_train > 0 and n_val > 0, "expected non-empty training and validation folds"
split_index = [-1] * n_train + [0] * n_val
pds = PredefinedSplit(test_fold=split_index)

In [11]:
# Flatten every sample into one row so the regressor receives 2-D arrays:
# inputs become (n_samples, seq_len_in * dim_in) and targets become
# (n_samples, seq_len_out * dim_out).
temp_train_in = np.reshape(np_train_in_5min, (len(np_train_in_5min), -1))
temp_test_in = np.reshape(np_test_in_5min, (len(np_test_in_5min), -1))
temp_train_out = np.reshape(np_train_out_5min, (len(np_train_out_5min), -1))  # (19080, 1242)
temp_test_out = np.reshape(np_test_out_5min, (len(np_test_out_5min), -1))  # (1980, 1242)

In [15]:
# Tune XGBoost's `gamma` over three candidate values. `pds` supplies a single
# predefined train/validation split, so each candidate is fitted exactly once and
# scored on the validation fold with negative MSE (higher is better).
param_grid = {"gamma": [4, 6, 8]}
model = GridSearchCV(
    XGBRegressor(verbosity=1),
    param_grid,
    scoring="neg_mean_squared_error",
    n_jobs=6,
    cv=pds,
    verbose=10,  # values of 2 or more print more detail than verbose=True
)
model.fit(temp_train_in, temp_train_out)

Fitting 1 folds for each of 3 candidates, totalling 3 fits
[CV 1/1; 3/3] START gamma=8.....................................................
[CV 1/1; 1/3] START gamma=4.....................................................
[CV 1/1; 2/3] START gamma=6.....................................................


In [15]:
# Persist the fitted grid-search object to disk.
# The context manager guarantees the file is flushed and closed even if pickling
# raises; the previous bare open() call never closed its handle.
# NOTE: only unpickle this file from a trusted source, since pickle can execute
# arbitrary code while loading.
with open("./xgboost.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [19]:
# Display the XGBRegressor that the grid search refitted with the best-scoring `gamma`.
model.best_estimator_

In [16]:
# Predict with the grid search's best estimator on both feature matrices.
# Note that temp_train_in is the merged train+validation set at this point.
pred_train, pred_test = map(model.predict, (temp_train_in, temp_test_in))

In [32]:
# Per-target errors: multioutput="raw_values" returns one score per output column
# instead of a single average. MAPE is returned as a fraction, not a percentage.

# Merged train+validation set
mse_train = mean_squared_error(temp_train_out, pred_train, multioutput="raw_values")  # (1242,)
mape_train = mean_absolute_percentage_error(temp_train_out, pred_train, multioutput="raw_values")  # (1242,)

# Held-out test set
mse_test = mean_squared_error(temp_test_out, pred_test, multioutput="raw_values")  # (1242,)
mape_test = mean_absolute_percentage_error(temp_test_out, pred_test, multioutput="raw_values")  # (1242,)

In [33]:
# Pool the two per-target scores into one score over every sample (train+val+test):
# a sample-count-weighted mean of the "train" and "test" values.
mse_all = (
    len(temp_train_out) * mse_train + len(temp_test_out) * mse_test
) / (len(temp_train_out) + len(temp_test_out))  # (1242,)
mape_all = (
    len(temp_train_out) * mape_train + len(temp_test_out) * mape_test
) / (len(temp_train_out) + len(temp_test_out))  # (1242,)

In [34]:
# Undo the target flattening: the 1242 per-target scores are laid out as 6 output steps
# of 207 columns each, so reshaping to (6, 207) and averaging over axis 1 gives one
# score per output step.

# Pooled over all samples
mse_all_by_out_step = np.mean(mse_all.reshape((6, 207)), axis=1)
mape_all_by_out_step = np.mean(mape_all.reshape((6, 207)), axis=1)

# Test set only
mse_test_by_out_step = np.mean(mse_test.reshape((6, 207)), axis=1)
mape_test_by_out_step = np.mean(mape_test.reshape((6, 207)), axis=1)

In [35]:
# Per-output-step MSE and MAPE, pooled over the train+validation and test samples.
mse_all_by_out_step, mape_all_by_out_step

(array([35.147923, 37.451332, 38.79209 , 39.784584, 40.466938, 41.008625],
       dtype=float32),
 array([0.14301327, 0.14877287, 0.15203924, 0.15443698, 0.15598765,
        0.1570284 ], dtype=float32))

In [36]:
# Per-output-step MSE and MAPE on the held-out test set only.
mse_test_by_out_step, mape_test_by_out_step

(array([41.646008, 44.06411 , 45.660816, 47.121468, 47.826637, 48.608685],
       dtype=float32),
 array([0.15540159, 0.16013004, 0.16331194, 0.16754444, 0.16880098,
        0.17060609], dtype=float32))

In [38]:
# Per-output-step RMSE (pooled, then test only). This is the square root of the
# column-averaged MSE, which is not the same as averaging per-column RMSEs.
np.sqrt(mse_all_by_out_step), np.sqrt(mse_test_by_out_step)

(array([5.9285684, 6.1197495, 6.2283297, 6.3075023, 6.361363 , 6.4037976],
       dtype=float32),
 array([6.453372 , 6.63808  , 6.757279 , 6.8645077, 6.9156804, 6.971993 ],
       dtype=float32))