In [1]:
# Step up one directory; the recorded output shows the working directory becomes
# the model_v2 folder. Presumably required so that the project-local `utils` and
# `train` modules imported in the next cell can be found -- confirm.
%cd ..

c:\Users\haowu\Desktop\Traffic Research\Traffic-Prediction\model_v2


In [2]:
import numpy as np
import pickle
import os
import xgboost
from xgboost import XGBRegressor, plot_importance

from sklearn.linear_model import MultiTaskLasso, MultiTaskLassoCV, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, PredefinedSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

from tqdm import tqdm
from joblib import parallel_backend

from utils import seed_torch
from train import create_parser

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Logical CPU count reported by the OS (16 in the recorded run); useful when
# choosing an n_jobs value for a parallel hyperparameter search.
os.cpu_count()

16

In [4]:
# Build the argument parser imported from the project's `train` module so the
# notebook can reuse its default settings.
parser = create_parser()
# Pass an explicit empty list: a bare parser.parse_args() reads sys.argv, which inside
# Jupyter holds the kernel's own launch arguments and makes argparse exit with an error.
# See https://stackoverflow.com/questions/50360012/python-argparse-error-error-argument-count-invalid-int-value for more details
args = parser.parse_args([])

In [5]:
# For reproducibility: seed with the value stored in args.seed.
# NOTE(review): seed_torch is a project helper from `utils`; which random number
# generators it actually seeds is not visible from this notebook -- confirm.
seed_torch(args.seed)  # seed 912

In [6]:
# Load the pre-split 5-minute arrays from args.data_dir: one input array and one
# output array for each of the train / val / test splits.
# Shapes noted for the original run:
#   train: in (14760, 7, 1470), out (14760, 7, 207, 4)
#   val:   in (4320, 7, 1470),  out (4320, 7, 207, 4)
#   test:  in (1980, 7, 1470),  out (1980, 7, 207, 4)
(
    np_train_in_5min,
    np_train_out_5min,
    np_val_in_5min,
    np_val_out_5min,
    np_test_in_5min,
    np_test_out_5min,
) = (
    np.load(f"{args.data_dir}/np_{split}_{role}_5min.npy")
    for split in ("train", "val", "test")
    for role in ("in", "out")
)

In [7]:
# Reduce each 4-D output array to the regression target: drop position 0 of
# axis 1 and keep only index 0 of the last axis.
# The arrays are overwritten in place, so this cell is meant to run once per
# kernel session (a second run fails because the arrays are then 3-D).
keep_target = np.s_[:, 1:, :, 0]
np_train_out_5min = np_train_out_5min[keep_target]  # (14760, 6, 207)
np_val_out_5min = np_val_out_5min[keep_target]  # (4320, 6, 207)
np_test_out_5min = np_test_out_5min[keep_target]  # (1980, 6, 207)

In [8]:
# Stack the validation rows underneath the training rows so a single pair of
# arrays can be handed to the estimator. Which rows are validation is recorded
# separately through the PredefinedSplit built afterwards.
# (An earlier note here referred to multitask-LASSO with cross-validation; the
# estimator fitted in this notebook is an XGBRegressor.)
# The inputs are overwritten, so re-running this cell appends the validation
# rows a second time -- run it once per kernel session.
# Resulting shapes in the original run: inputs (19080, 7, 1470), targets (19080, 6, 207).
np_train_in_5min, np_train_out_5min = (
    np.concatenate(train_then_val, axis=0)
    for train_then_val in (
        (np_train_in_5min, np_val_in_5min),
        (np_train_out_5min, np_val_out_5min),
    )
)

In [9]:
# Label every row of the merged training arrays for PredefinedSplit:
#   -1 -> the row is never placed in a validation fold (original training rows)
#    0 -> the row belongs to validation fold 0 (original validation rows)
# The row counts are derived from the arrays instead of being hard-coded
# (14760 / 4320 in the original run), so the split stays aligned if the data
# files change. At this point np_train_in_5min already holds the training rows
# followed by the validation rows, hence the subtraction.
n_val_rows = np_val_in_5min.shape[0]
n_train_rows = np_train_in_5min.shape[0] - n_val_rows
split_index = [-1] * n_train_rows + [0] * n_val_rows
pds = PredefinedSplit(test_fold=split_index)

In [10]:
# Flatten every sample to a single row so the regressor receives 2-D arrays:
# inputs become (n_samples, n_input_features) and targets become
# (n_samples, n_out_steps * n_columns).
# Target shapes in the original run: train (19080, 1242), test (1980, 1242).
temp_train_in = np_train_in_5min.reshape(len(np_train_in_5min), -1)
temp_train_out = np_train_out_5min.reshape(len(np_train_out_5min), -1)
temp_test_in = np_test_in_5min.reshape(len(np_test_in_5min), -1)
temp_test_out = np_test_out_5min.reshape(len(np_test_out_5min), -1)

In [11]:
# Disabled alternative: grid search over `gamma`, validated on the predefined
# train/validation split (pds). Left commented out; when enabled, `model` would be
# a GridSearchCV object rather than a plain XGBRegressor.
# param_grid = {"gamma":[4, 6, 8]}
# model = GridSearchCV(estimator=XGBRegressor(verbosity=1, tree_method='gpu_hist', gpu_id=0), 
#                         cv = pds,
#                         param_grid=param_grid, 
#                         scoring="neg_mean_squared_error",
#                         n_jobs=3,
#                         verbose=10) # setting verbose = 2 or more gives more details than verbose = True
# Active path: fit a single XGBRegressor (no hyperparameter tuning) on the merged
# train+validation rows, predicting all flattened target columns at once.
# tree_method='gpu_hist' with gpu_id=0 requests training on GPU 0.
# NOTE(review): newer XGBoost releases reportedly replace these two arguments with
# device="cuda" -- confirm against the installed version.
model=XGBRegressor(verbosity = 1,  tree_method='gpu_hist', gpu_id=0)
model.fit(temp_train_in, temp_train_out)

In [12]:
# Persist the fitted model next to the notebook's working directory.
# The context manager guarantees the file handle is flushed and closed even if
# pickling fails (the previous bare open() left the handle to the garbage collector).
# Only unpickle this file from a trusted location: loading a pickle executes code.
with open("./xgboost.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [19]:
# Show the estimator that was actually fitted. `best_estimator_` only exists when
# `model` is a fitted search object (the GridSearchCV variant, currently commented
# out); with the plain XGBRegressor the bare attribute access raised AttributeError,
# so fall back to displaying the model itself.
getattr(model, "best_estimator_", model)

In [13]:
# Predict on the rows the model was fitted on (train+validation) and on the
# held-out test rows; each result has one column per flattened target.
pred_train, pred_test = map(model.predict, (temp_train_in, temp_test_in))

In [14]:
# Per-target-column errors (multioutput="raw_values" returns one value per column
# instead of a single average); each array has shape (1242,) in the original run.
# MAPE is returned as a fraction, not as a percentage.

# Fitted rows (train + validation)
mse_train = mean_squared_error(temp_train_out, pred_train, multioutput="raw_values")
mape_train = mean_absolute_percentage_error(temp_train_out, pred_train, multioutput="raw_values")

# Held-out test rows
mse_test = mean_squared_error(temp_test_out, pred_test, multioutput="raw_values")
mape_test = mean_absolute_percentage_error(temp_test_out, pred_test, multioutput="raw_values")

In [15]:
# Pool the per-column errors over every row (fitted + test) by weighting each
# split's mean with its number of rows; results keep shape (1242,).
# Note that the "all" figures therefore mix rows the model was trained on with
# held-out rows, so they are not a generalization estimate.
n_fit_rows, n_test_rows = temp_train_out.shape[0], temp_test_out.shape[0]
mse_all = (n_fit_rows * mse_train + n_test_rows * mse_test) / (n_fit_rows + n_test_rows)
mape_all = (n_fit_rows * mape_train + n_test_rows * mape_test) / (n_fit_rows + n_test_rows)

In [16]:
# Undo the row-major flattening of the targets and average over the last axis,
# leaving one error value per output step.
# The (steps, columns) layout is read from the 3-D test targets rather than the
# literal (6, 207), so the cell keeps working if the horizon or column count
# changes; the unpacking also fails loudly if the targets are not 3-D.
n_out_steps, n_columns = np_test_out_5min.shape[1:]
step_by_column = (n_out_steps, n_columns)

mse_all_by_out_step = mse_all.reshape(step_by_column).mean(axis=1)
mape_all_by_out_step = mape_all.reshape(step_by_column).mean(axis=1)

mse_test_by_out_step = mse_test.reshape(step_by_column).mean(axis=1)
mape_test_by_out_step = mape_test.reshape(step_by_column).mean(axis=1)

In [17]:
# Per-output-step MSE and MAPE pooled over fitted (train+validation) and test rows.
mse_all_by_out_step, mape_all_by_out_step

(array([10.823448, 11.68604 , 12.089598, 12.336078, 12.414796, 12.550133],
       dtype=float32),
 array([0.07220785, 0.07577683, 0.07759254, 0.07850988, 0.07881037,
        0.0792876 ], dtype=float32))

In [18]:
# Per-output-step MSE and MAPE on the held-out test rows only.
mse_test_by_out_step, mape_test_by_out_step

(array([41.867584, 44.416412, 45.546696, 46.28028 , 46.626812, 47.071705],
       dtype=float32),
 array([0.15061851, 0.15631141, 0.15885015, 0.16136487, 0.16192892,
        0.16307214], dtype=float32))

In [19]:
# Square root of the per-step mean MSE (pooled rows, then test rows). This is the
# root of the column-averaged MSE, not an average of per-column RMSE values.
np.sqrt(mse_all_by_out_step), np.sqrt(mse_test_by_out_step)

(array([3.289901 , 3.418485 , 3.4770098, 3.5122752, 3.5234637, 3.5426166],
       dtype=float32),
 array([6.4705167, 6.664564 , 6.7488294, 6.8029613, 6.828383 , 6.8608823],
       dtype=float32))

In [20]:
# Per-output-step MSE on the fitted (train+validation) rows, for comparison with
# the test figures above; (6, 207) mirrors the target layout before flattening.
mse_train.reshape(6, 207).mean(axis=1)

array([7.6018867, 8.289491 , 8.617635 , 8.813565 , 8.864493 , 8.967704 ],
      dtype=float32)