In [1]:
%cd ..

c:\Users\haowu\Desktop\Traffic Research\Traffic-Prediction\model_v2


In [2]:
import numpy as np
import pickle
import os

from sklearn.linear_model import MultiTaskLasso, MultiTaskLassoCV, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from tqdm import tqdm
from joblib import parallel_backend

from utils import seed_torch
from train import create_parser

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
os.cpu_count()

16

In [4]:
parser = create_parser()
# Be careful to use parser.parse_args([]) instead of parser.parse_args(). Otherwise it will prompt error. 
# The issue lies in JupyterNotebook
# See answer in https://stackoverflow.com/questions/50360012/python-argparse-error-error-argument-count-invalid-int-value for more details
args = parser.parse_args([])

In [5]:
# For reproducibility
seed_torch(args.seed)  # seed 912

In [6]:
np_train_in_5min = np.load(f"{args.data_dir}/np_train_in_5min.npy")  # (14760, 7, 1470)
np_train_out_5min = np.load(f"{args.data_dir}/np_train_out_5min.npy")  # (14760, 7, 207, 4)
np_val_in_5min = np.load(f"{args.data_dir}/np_val_in_5min.npy")  # (4320, 7, 1470)
np_val_out_5min = np.load(f"{args.data_dir}/np_val_out_5min.npy")  # (4320, 7, 207, 4)
np_test_in_5min = np.load(f"{args.data_dir}/np_test_in_5min.npy") # (1980, 7, 1470)
np_test_out_5min = np.load(f"{args.data_dir}/np_test_out_5min.npy") # (1980, 7, 207, 4)

In [7]:
np_train_out_5min = np_train_out_5min[:, 1:, :, 0]  # (14760, 6, 207)
np_val_out_5min = np_val_out_5min[:, 1:, :, 0]  # (4320, 6, 207)
np_test_out_5min = np_test_out_5min[:, 1:, :, 0] # (1980, 6, 207)

In [8]:
# merge training and validation dataset together to fit model
# We are using multitask-LASSO with cross-validation by scikit-learn. Therefore, there is no need for us to explicitly supply a training dataset and validation dataset. 
np_train_in_5min = np.concatenate([np_train_in_5min, np_val_in_5min], axis=0)  # (19080, 7, 1470)
np_train_out_5min = np.concatenate([np_train_out_5min, np_val_out_5min], axis=0) # (19080, 6, 207)

In [9]:
mse_mat = np.zeros(args.seq_len_out)
mape_mat = np.zeros(args.seq_len_out)

In [10]:
temp_train_in = np_train_in_5min.reshape(np_train_in_5min.shape[0], -1)
temp_test_in = np_test_in_5min.reshape(np_test_in_5min.shape[0], -1)
temp_train_out = np_train_out_5min.reshape(np_train_out_5min.shape[0], -1)
temp_test_out = np_test_out_5min.reshape(np_test_out_5min.shape[0], -1)

In [11]:
model = MultiTaskLassoCV(cv=4, max_iter=500, n_alphas=10, random_state=args.seed, selection="random", tol=0.001, n_jobs=12, verbose=10) # setting verbose = 2 or more gives more details than verbose = True
model.fit(temp_train_in, temp_train_out)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.


(array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0., -0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32), 1026.411865234375, 1118426.875, 1)
(array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0., -0.,  0., ...,  0.,  0.,  0.],
       [-0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32), 11185.810546875, 1120871.5, 1)
(array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0., -0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32), 5888.0, 1119577.375, 1)
(array([[ 0.,  0.,  0., ...,

In [None]:
pickle.dump(model, open("./multitask_lasso.pkl", "wb"))

In [None]:
pred_train = model.predict(temp_train_in)
pred_test = model.predict(temp_test_in)

In [None]:
mse_train = mean_squared_error(y_true = temp_train_out, y_pred=pred_train)
mse_test = mean_squared_error(y_true = temp_test_out, y_pred=pred_test)
mape_train = mean_absolute_percentage_error(y_true = temp_train_out, y_pred=pred_train)
mape_test = mean_absolute_percentage_error(y_true = temp_test_out, y_pred=pred_test)

In [None]:
print(mse_train, mse_test, mape_train, mape_test)
print(f"mse over entire dataset = {(mse_train*temp_train_out.shape[0] + mse_test*temp_test_out.shape[0])/(temp_train_out.shape[0]+temp_test_out.shape[0])}")
print(f"mape over entire dataset = {(mape_train*temp_train_out.shape[0] + mape_test*temp_test_out.shape[0])/(temp_train_out.shape[0]+temp_test_out.shape[0])}")