In [17]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.preprocessing import StandardScaler

In [18]:
def read_train_and_test(train: str, test: str) -> tuple[DataFrame, DataFrame]:
  """Load the train and test CSV files and strip their ``time`` column.

  Args:
    train: Location of the training CSV, handed to ``pd.read_csv``.
    test: Location of the test CSV, handed to ``pd.read_csv``.

  Returns:
    A ``(train_frame, test_frame)`` pair; neither frame contains ``time``.

  Raises:
    KeyError: If either file has no ``time`` column.
  """
  # The training file is read (and validated) before the test file.
  train_frame = pd.read_csv(train).drop(columns="time")
  test_frame = pd.read_csv(test).drop(columns="time")
  return (train_frame, test_frame)

In [19]:
def add_lag_features(df: DataFrame, cols: list[str], lag: int) -> list[str]:
  """Append shifted copies of ``cols`` to ``df`` and return the new column names.

  For every step ``i`` in ``1..lag`` and every column ``c`` in ``cols`` a
  column named ``"{c}_lag_{i}"`` holding ``df[c].shift(i)`` is added.

  Note:
    ``df`` is modified in place: the lag columns are added and then every
    row that contains a missing value in *any* column is dropped.

  Args:
    df: Frame to extend (mutated).
    cols: Names of the columns to lag.
    lag: Largest shift to generate; shifts ``1..lag`` are all produced.

  Returns:
    The generated column names, ordered by lag first and by column second.
  """
  # Lag-major ordering: every column at lag 1, then every column at lag 2, ...
  plan = [(step, source) for step in range(1, lag + 1) for source in cols]

  created: list[str] = []
  for step, source in plan:
    name = f"{source}_lag_{step}"
    df[name] = df[source].shift(step)
    created.append(name)

  # Shifting leaves NaN in the leading rows; remove them on the caller's frame.
  df.dropna(inplace=True)
  return created

In [20]:
def preprocess(df_train: DataFrame, df_test: DataFrame, lag: int) -> tuple[DataFrame, DataFrame, DataFrame, DataFrame]:
  """Turn the train/test frames into lagged feature matrices and target frames.

  Every original column is treated as a target; the features are the lagged
  versions of those columns produced by ``add_lag_features``. The inputs are
  copied first, so the caller's frames are left untouched.

  Args:
    df_train: Training frame.
    df_test: Test frame; must have the same set of column names as ``df_train``.
    lag: Number of lag steps forwarded to ``add_lag_features``.

  Returns:
    ``(X_train, y_train, X_test, y_test)``.

  Raises:
    ValueError: If the two frames do not share the same set of column names.
  """
  # Only the *set* of names is compared; column order may differ between frames.
  if set(df_train.columns).symmetric_difference(df_test.columns):
    raise ValueError("columns set in two DataFrame is not equivalent")

  target_cols = df_train.columns
  train_lagged = df_train.copy()
  test_lagged = df_test.copy()

  # Both calls mutate the copies in place; the generated names are the same
  # for train and test, so only the first return value is kept.
  lag_cols = add_lag_features(train_lagged, target_cols, lag)
  add_lag_features(test_lagged, target_cols, lag)

  X_train, y_train = train_lagged[lag_cols], train_lagged[target_cols]
  X_test, y_test = test_lagged[lag_cols], test_lagged[target_cols]
  return (X_train, y_train, X_test, y_test)

In [21]:
from sklearn.metrics import mean_absolute_percentage_error, root_mean_squared_error, r2_score

def print_result(y_train, y_train_pred, y_test, y_test_pred):
  """Print R2, MAPE and RMSE for the train and test predictions.

  Each metric is printed for the training split first and the test split
  second, using the scikit-learn metric functions with their default
  arguments.

  Args:
    y_train: Ground-truth targets of the training split.
    y_train_pred: Predictions for the training split.
    y_test: Ground-truth targets of the test split.
    y_test_pred: Predictions for the test split.
  """
  # (train label, test label, metric function) in print order.
  report = (
    ("Train R2-squared:", "Test R2-square:", r2_score),
    ("Train MAPE:", "Test MAPE:", mean_absolute_percentage_error),
    ("Train RMSE:", "Test RMSE:", root_mean_squared_error),
  )
  for train_label, test_label, metric in report:
    print(train_label, metric(y_train, y_train_pred))
    print(test_label, metric(y_test, y_test_pred))

In [22]:
def regression(*, train_path: str, test_path: str, lag: int, reg):
  """Fit ``reg`` on lagged features and print train/test metrics.

  Steps: read both CSV files, build lag features and targets, standardise
  the features, fit the estimator, predict on both splits and report the
  scores through ``print_result``. Nothing is returned.

  Args:
    train_path: Path of the training CSV.
    test_path: Path of the test CSV.
    lag: Number of lag steps used to build the features.
    reg: Estimator object providing ``fit(X, y)`` and ``predict(X)``.
  """
  raw_train, raw_test = read_train_and_test(train=train_path, test=test_path)
  X_train, y_train, X_test, y_test = preprocess(raw_train, raw_test, lag)

  # The scaler is fitted on the training features only and then reused for
  # the test features, so no test statistics enter the transformation.
  scaler = StandardScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)

  reg.fit(X_train_scaled, y_train)
  train_pred = reg.predict(X_train_scaled)
  test_pred = reg.predict(X_test_scaled)

  print_result(y_train, train_pred, y_test, test_pred)

In [23]:
from sklearn.linear_model import RidgeCV

# Template for the per-ticker CSV files; "{}" is filled with "<name>_<split>".
path = "../data/vn30/{}.csv"
train_path, test_path = path.format("ACB_train"), path.format("ACB_test")

In [24]:
# Ridge regression; the penalty is chosen by RidgeCV among 50 log-spaced
# candidates between 1e-10 and 1e10.
reg = RidgeCV(alphas=np.logspace(start=-10, stop=10, num=50))

regression(
  train_path=train_path,
  test_path=test_path,
  lag=3,
  reg=reg,
)

Train R2-squared: 0.8743932483869108
Test R2-square: 0.8035795367879157
Train MAPE: 0.09680869558355906
Test MAPE: 0.10155832686548867
Train RMSE: 695297.9713615039
Test RMSE: 1234535.1280469391


In [25]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Lasso regression; alpha is grid-searched over 10 log-spaced values
# between 1e-4 and 1e1.
params = {"alpha": np.logspace(start=-4, stop=1, num=10)}

# NOTE(review): no `cv` is passed, so GridSearchCV uses its default splitter.
# The rows appear to be time-ordered (features are built with shift), in which
# case sklearn.model_selection.TimeSeriesSplit may be more appropriate -- TODO confirm.
reg = GridSearchCV(estimator=Lasso(max_iter=200000), param_grid=params)
regression(
  train_path=train_path,
  test_path=test_path,
  lag=3,
  reg=reg,
)

Train R2-squared: 0.8751950219861444
Test R2-square: 0.8081804393287431
Train MAPE: 0.0963741552958511
Test MAPE: 0.10253191227936298
Train RMSE: 693594.6356148357
Test RMSE: 1225304.6993951239
