In [151]:

import os, sys
from pathlib import Path

sys.path.append(str(Path.cwd().resolve().parent))
from preprocess import VN30, preprocess_v1
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score, mean_absolute_percentage_error, root_mean_squared_error
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt

# Bước 1: Tiền xử lý dữ liệu

In [152]:
# When using RidgeCV, set val = 0.0 because its cross-validation splits the
# training data into train/validation folds on its own.
# Baseline run for ticker ACB: every optional feature group is switched off.
data = preprocess_v1(
  'ACB',
  lag=30,
  val=0.0,
  calendar_feature=False,
  rolling_feature=False,
  technical_feature=False,
  nonlinear_feature=False,
  autocorr_feature=False,
  trend_feature=False,
  verbose=True)

=== Preprocessing ACB ===
Train shape: (1215, 150), Val shape: (0, 150), Test shape: (298, 150)


In [153]:
# Unpack the (features, targets) pairs and the fitted scalers returned by preprocess_v1.
X_train, y_train = data['train']
# Empty in this run because val=0.0 was requested (the log reports "Val shape: (0, 150)").
X_val, y_val = data['val']
X_test, y_test = data['test']
feature_sc = data['scaler']['feature']
# print_result uses this scaler's inverse_transform before computing MAPE.
target_sc = data['scaler']['target']

In [154]:
# All stock tickers used in this notebook are contained in VN30
for code in VN30:
	print(code)

ACB
BCM
BID
BVH
CTG
FPT
GAS
GVR
HDB
HPG
LPB
MBB
MSN
MWG
PLX
SAB
SHB
SSB
SSI
STB
TCB
TPB
VCB
VHM
VIB
VIC
VJC
VNM
VPB
VRE


# Bước 2: Huấn luyện mô hình

* RidgeCV (lag = 30, 7 flags)


In [155]:
def train(X, y):
  """Fit and return a RidgeCV regressor tuned with time-ordered cross-validation.

  Args:
    X: training feature matrix.
    y: training targets aligned row-by-row with ``X``.

  Returns:
    The fitted ``RidgeCV`` estimator.
  """
  # TimeSeriesSplit (library defaults) is used as the CV splitter so that the
  # folds respect the sample order instead of shuffling it.
  # RidgeCV.fit returns the estimator itself, so the chain yields the fitted model.
  return RidgeCV(cv=TimeSeriesSplit()).fit(X, y)

# Bước 3: Đánh giá mô hình

In [156]:
def print_result(y_train, y_train_pred, y_test, y_test_pred, target_scaler=None):
  """Print R2, MAPE and RMSE for the train and test splits.

  R2 and RMSE are computed on the arrays exactly as passed in. MAPE is computed
  after both the targets and the predictions have been mapped through the
  scaler's ``inverse_transform``.

  Args:
    y_train: training targets.
    y_train_pred: model predictions for the training split.
    y_test: test targets.
    y_test_pred: model predictions for the test split.
    target_scaler: object exposing ``inverse_transform`` for the targets.
      When omitted, the notebook-level ``target_sc`` is used, which keeps
      existing four-argument calls working. Pass it explicitly whenever
      ``target_sc`` may have been rebound by another cell (for example a
      loop over several tickers), otherwise MAPE is computed with the
      wrong scaler.
  """
  # Resolve the scaler once so every MAPE line uses the same object.
  scaler = target_sc if target_scaler is None else target_scaler

  print("Train R2-squared:", r2_score(y_train, y_train_pred))
  print("Test R2-square:", r2_score(y_test, y_test_pred))
  print("Train MAPE:", mean_absolute_percentage_error(scaler.inverse_transform(y_train), scaler.inverse_transform(y_train_pred)))
  print("Test MAPE:", mean_absolute_percentage_error(scaler.inverse_transform(y_test), scaler.inverse_transform(y_test_pred)))
  print("Train RMSE:", root_mean_squared_error(y_train, y_train_pred))
  print("Test RMSE:", root_mean_squared_error(y_test, y_test_pred))

In [157]:
# Fit on the training split, then predict both splits so they can be scored.
model = train(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [158]:
# Spot check: the first training target next to its prediction
# (both are in the space that target_sc.inverse_transform maps back from).
y_train[0], y_train_pred[0]

(array([-1.28930409, -1.30488906, -1.27205656, -1.28478442, -0.82244553]),
 array([-1.28166938, -1.28598609, -1.26211533, -1.26150403, -0.69633272]))

In [159]:
# Report R2 / MAPE / RMSE for the model fitted above.
print_result(y_train, y_train_pred, y_test, y_test_pred)

Train R2-squared: 0.907711303638966
Test R2-square: 0.7815098230398043
Train MAPE: 0.09540486281618658
Test MAPE: 0.1041862913058685
Train RMSE: 0.16779161986063465
Test RMSE: 0.33362675038133466


In [163]:
# Configurations to evaluate. "none" is the baseline label; every other entry
# is the name of a boolean keyword argument of preprocess_v1.
flags_list = [
  "none",
  "calendar_feature", "rolling_feature", "technical_feature",
  "nonlinear_feature", "autocorr_feature", "trend_feature",
]

# Keyword arguments for preprocess_v1 with every feature group disabled.
# "none" is left out because it is only a label, not a real argument.
flags_dict = dict.fromkeys(
  (name for name in flags_list if name != "none"),
  False,
)

# Sanity check: only "none" should be missing from flags_dict.
for flag in flags_list:
  print(flag in flags_dict)

False
True
True
True
True
True
True


# Đánh giá toàn bộ dữ liệu

In [165]:
def evaluate_feature_flags(codes, flags, lag=30, val=0.0):
  """Score RidgeCV on each ticker for every feature-flag configuration.

  For each ticker all flags start disabled; walking through ``flags`` in
  order, each flag is switched on and stays on for the following entries,
  so the flags accumulate. The value stored under a flag name is therefore
  the test R2 with that flag and all earlier ones enabled. The "none" entry
  is the baseline with nothing enabled.

  NOTE(review): the result columns are labelled with a single flag name
  although the flags accumulate. Confirm that cumulative (not one-at-a-time)
  evaluation is what is intended.

  Everything is kept in local variables so the notebook-level ``X_train``,
  ``y_train``, ``target_sc``, ``model`` etc. from the single-ticker cells are
  not overwritten by the last ticker processed here.

  Args:
    codes: iterable of ticker symbols passed to ``preprocess_v1``.
    flags: ordered flag names; "none" is only a label, every other entry is
      forwarded to ``preprocess_v1`` as a boolean keyword argument.
    lag: forwarded to ``preprocess_v1``; same value as the single-ticker run.
    val: forwarded to ``preprocess_v1``; 0.0 because RidgeCV does its own
      train/validation splitting.

  Returns:
    dict mapping each flag name to a list of test R2 scores, one per ticker,
    in the order of ``codes``.
  """
  scores = {flag: [] for flag in flags}
  for code in codes:
    # Reset for every ticker; entries are switched on cumulatively below.
    enabled = {flag: False for flag in flags if flag != "none"}
    for flag in flags:
      if flag in enabled:
        enabled[flag] = True
      split = preprocess_v1(code, lag=lag, val=val, **enabled)
      X_fit, y_fit = split['train']
      X_eval, y_eval = split['test']
      fitted = train(X_fit, y_fit)
      r2 = r2_score(y_eval, fitted.predict(X_eval))
      scores[flag].append(r2)
      print(code, enabled, r2)
  return scores


result = evaluate_feature_flags(VN30, flags_list)

# One row per ticker, one column per flag configuration.
df = DataFrame(result, index=VN30)
df.to_csv("ridge.csv")

ACB {'calendar_feature': False, 'rolling_feature': False, 'technical_feature': False, 'nonlinear_feature': False, 'autocorr_feature': False, 'trend_feature': False} 0.7815098230398043
ACB {'calendar_feature': True, 'rolling_feature': False, 'technical_feature': False, 'nonlinear_feature': False, 'autocorr_feature': False, 'trend_feature': False} 0.7778807169363746
ACB {'calendar_feature': True, 'rolling_feature': True, 'technical_feature': False, 'nonlinear_feature': False, 'autocorr_feature': False, 'trend_feature': False} 0.7810307219059842
ACB {'calendar_feature': True, 'rolling_feature': True, 'technical_feature': True, 'nonlinear_feature': False, 'autocorr_feature': False, 'trend_feature': False} 0.7638314171692717
ACB {'calendar_feature': True, 'rolling_feature': True, 'technical_feature': True, 'nonlinear_feature': True, 'autocorr_feature': False, 'trend_feature': False} 0.7804223903216215
ACB {'calendar_feature': True, 'rolling_feature': True, 'technical_feature': True, 'nonlin