In [80]:
import os, sys
import warnings

warnings.filterwarnings("ignore")
from pathlib import Path

sys.path.append(str(Path.cwd().resolve().parent))

from preprocess import VN30, preprocess_v1
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import MultiTaskLassoCV, LassoCV, Lasso
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_percentage_error, root_mean_squared_error
from pandas import DataFrame

# Bước 1: Tiền xử lý dữ liệu

In [81]:
# If LassoCV is used, set val = 0.0, because the CV itself splits the data
# into train and validation folds.
data = preprocess_v1('ACB', verbose=True)
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
  data['train'],
  data['val'],
  data['test'],
)
# Scalers returned by the preprocessing step for the features and the target.
feature_sc, target_sc = data['scaler']['feature'], data['scaler']['target']

=== Preprocessing ACB ===
Train shape: (1213, 201), Val shape: (0, 201), Test shape: (296, 201)


In [82]:
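# Disabled draft: fit one LassoCV per target column with time-series CV.
# NOTE(review): `Y_train` is not defined in the visible cells (they use `y_train`).
# for i in range(Y_train.shape[1]):
# 	tscv = TimeSeriesSplit(n_splits=5)
# 	model = LassoCV(cv=tscv)
# 	model.fit(X_train, Y_train[:, i])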

# Bước 2: Huấn luyện mô hình
* Lasso (lag = 30, 7 flags)
* Multi-task Lasso (lag = 30, 7 flags)
* Tổng cộng: 14 lần, trên 30 mã, trong mỗi mã chỉ lấy R-squared, in ra lasso.csv và multi-task-lasso.csv

In [83]:
def train_lasso(X, y, alphas=None, n_splits=5):
  """Grid-search the Lasso ``alpha`` using time-ordered cross-validation.

  Parameters
  ----------
  X : array-like
    Training features. Rows are assumed to be in chronological order,
    since TimeSeriesSplit builds folds from row position.
  y : array-like
    Training target(s) aligned row-by-row with ``X``.
  alphas : array-like, optional
    Candidate regularisation strengths. When omitted, the grid
    ``np.logspace(-10, 1, 5)`` is used.
  n_splits : int, default 5
    Number of TimeSeriesSplit folds (5 is also scikit-learn's default).

  Returns
  -------
  GridSearchCV
    The fitted search object; ``best_params_`` holds the chosen alpha and
    ``predict`` delegates to the refitted best estimator.
  """
  if alphas is None:
    alphas = np.logspace(-10, 1, 5)
  search = GridSearchCV(
    Lasso(),
    param_grid={"alpha": alphas},
    cv=TimeSeriesSplit(n_splits=n_splits),
  )
  search.fit(X, y)
  return search

In [84]:
def train_multi_task_lasso(X, y, n_splits=5):
  """Fit a MultiTaskLassoCV model using time-ordered cross-validation.

  Parameters
  ----------
  X : array-like
    Training features. Rows are assumed to be in chronological order,
    since TimeSeriesSplit builds folds from row position.
  y : array-like
    Training targets aligned row-by-row with ``X``; MultiTaskLassoCV
    expects a 2-D array with one column per task.
  n_splits : int, default 5
    Number of TimeSeriesSplit folds (5 is also scikit-learn's default).

  Returns
  -------
  MultiTaskLassoCV
    The fitted estimator; alpha is selected internally by the CV.
  """
  estimator = MultiTaskLassoCV(cv=TimeSeriesSplit(n_splits=n_splits))
  estimator.fit(X, y)
  return estimator

# Bước 3: Đánh giá mô hình

In [85]:
# Fit the grid-searched Lasso on the training split currently held in
# `X_train` / `y_train`.
model = train_lasso(X_train, y_train)
# Bare expression: show the fitted search object as the cell output.
model

In [86]:
# Print the hyper-parameters chosen by the grid search, then the R^2 score of
# the fitted model on the test split.
print(model.best_params_)
print(r2_score(y_test, model.predict(X_test)))

{'alpha': np.float64(3.1622776601683795e-05)}
0.7779602624456816


# Đánh giá toàn bộ dữ liệu

In [87]:
# Names of the experiment columns. "none" is the baseline and is not a real
# keyword switch; every other entry is used as a boolean keyword flag.
flags_list = [
  "none",
  "calendar_feature",
  "rolling_feature",
  "technical_feature",
  "nonlinear_feature",
  "autocorr_feature",
  "trend_feature",
]
# Every real flag starts disabled; "none" is deliberately left out.
flags_dict = dict.fromkeys(
  (name for name in flags_list if name != "none"),
  False,
)
# Sanity check: only "none" should be missing from the switch dictionary.
for flag in flags_list:
  print(flag in flags_dict)

False
True
True
True
True
True
True


In [88]:
# Evaluate the grid-searched Lasso on every VN30 ticker and collect the test
# R^2 per flag column; the table is written to "lasso.csv".
# This cell rebinds the notebook-level names `data`, `X_train`, `y_train`,
# `X_val`, `y_val`, `X_test`, `y_test`, `feature_sc`, `target_sc` and `model`.
result = {name: [] for name in flags_list}
for code in VN30:
  # Each ticker starts with every feature flag switched off.
  flags_dict = dict.fromkeys(
    (name for name in flags_list if name != "none"),
    False,
  )
  for flag in flags_list:
    # "none" is the baseline run: nothing is switched on for it.
    # NOTE(review): flags are not reset inside this loop, so each column is
    # measured with its own flag plus all earlier ones enabled. Confirm that
    # this cumulative design is intended.
    if flag != "none":
      flags_dict[flag] = True
    data = preprocess_v1(code, **flags_dict)
    (X_train, y_train), (X_val, y_val), (X_test, y_test) = (
      data['train'],
      data['val'],
      data['test'],
    )
    feature_sc, target_sc = data['scaler']['feature'], data['scaler']['target']
    model = train_lasso(X_train, y_train)
    y_test_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_test_pred)
    result[flag].append(r2)
    print(code, flags_dict, r2)

# One row per ticker, one column per entry of flags_list.
df = DataFrame(result, index=VN30)
df.to_csv("lasso.csv")

ACB {'calendar_feature': False, 'rolling_feature': False, 'technical_feature': False, 'nonlinear_feature': False, 'autocorr_feature': False, 'trend_feature': False} 0.7783650807582051
ACB {'calendar_feature': True, 'rolling_feature': False, 'technical_feature': False, 'nonlinear_feature': False, 'autocorr_feature': False, 'trend_feature': False} 0.7749642222753863
ACB {'calendar_feature': True, 'rolling_feature': True, 'technical_feature': False, 'nonlinear_feature': False, 'autocorr_feature': False, 'trend_feature': False} 0.7792199419544416
ACB {'calendar_feature': True, 'rolling_feature': True, 'technical_feature': True, 'nonlinear_feature': False, 'autocorr_feature': False, 'trend_feature': False} 0.7809269996372729
ACB {'calendar_feature': True, 'rolling_feature': True, 'technical_feature': True, 'nonlinear_feature': True, 'autocorr_feature': False, 'trend_feature': False} 0.7763989825789637
ACB {'calendar_feature': True, 'rolling_feature': True, 'technical_feature': True, 'nonlin

In [89]:
# Evaluate the multi-task Lasso on every VN30 ticker and collect the test R^2
# per flag column; the table is written to "multi-task-lasso.csv".
# This cell rebinds the notebook-level names `data`, `X_train`, `y_train`,
# `X_val`, `y_val`, `X_test`, `y_test`, `feature_sc`, `target_sc`, `model`,
# `result` and `df`.
result = {name: [] for name in flags_list}
for code in VN30:
  # Each ticker starts with every feature flag switched off.
  flags_dict = dict.fromkeys(
    (name for name in flags_list if name != "none"),
    False,
  )
  for flag in flags_list:
    # "none" is the baseline run: nothing is switched on for it.
    # NOTE(review): flags are not reset inside this loop, so each column is
    # measured with its own flag plus all earlier ones enabled. Confirm that
    # this cumulative design is intended.
    if flag != "none":
      flags_dict[flag] = True
    data = preprocess_v1(code, **flags_dict)
    (X_train, y_train), (X_val, y_val), (X_test, y_test) = (
      data['train'],
      data['val'],
      data['test'],
    )
    feature_sc, target_sc = data['scaler']['feature'], data['scaler']['target']
    model = train_multi_task_lasso(X_train, y_train)
    y_test_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_test_pred)
    result[flag].append(r2)
    print(code, flags_dict, r2)

# One row per ticker, one column per entry of flags_list.
df = DataFrame(result, index=VN30)
df.to_csv("multi-task-lasso.csv")

ACB {'calendar_feature': False, 'rolling_feature': False, 'technical_feature': False, 'nonlinear_feature': False, 'autocorr_feature': False, 'trend_feature': False} 0.7789541691669848
ACB {'calendar_feature': True, 'rolling_feature': False, 'technical_feature': False, 'nonlinear_feature': False, 'autocorr_feature': False, 'trend_feature': False} 0.777107187304665
ACB {'calendar_feature': True, 'rolling_feature': True, 'technical_feature': False, 'nonlinear_feature': False, 'autocorr_feature': False, 'trend_feature': False} 0.777429271358472
ACB {'calendar_feature': True, 'rolling_feature': True, 'technical_feature': True, 'nonlinear_feature': False, 'autocorr_feature': False, 'trend_feature': False} 0.777426606865409
ACB {'calendar_feature': True, 'rolling_feature': True, 'technical_feature': True, 'nonlinear_feature': True, 'autocorr_feature': False, 'trend_feature': False} 0.7824689397860058
ACB {'calendar_feature': True, 'rolling_feature': True, 'technical_feature': True, 'nonlinear