In [1]:
#%pip install pandas scikit-learn

### 1. Load data

In [2]:
import pandas as pd

train_df = pd.read_csv("../data/vn30/ACB_train.csv")
test_df = pd.read_csv("../data/vn30/ACB_test.csv")
print(train_df.shape)
print(train_df.head(10))
print(test_df.shape)
print(test_df.head(10))

(1245, 6)
         time  open  high   low  close   volume
0  2019-01-02  7.86  7.86  7.68   7.76  1103106
1  2019-01-03  6.99  7.73  6.99   7.47  1956382
2  2019-01-04  7.44  7.57  7.33   7.52  2771983
3  2019-01-07  7.68  7.68  7.57   7.63  1305819
4  2019-01-08  8.34  8.34  7.57   7.57   846226
5  2019-01-09  7.60  7.73  7.57   7.71  2196358
6  2019-01-10  7.71  7.73  7.68   7.68  1378687
7  2019-01-11  7.71  7.73  7.65   7.68  1250077
8  2019-01-14  7.65  7.68  7.63   7.63   937155
9  2019-01-15  7.49  7.73  7.04   7.73   901146
(328, 6)
         time   open   high    low  close    volume
0  2024-01-02  20.12  20.79  20.12  20.54  13896933
1  2024-01-03  20.58  21.00  20.37  21.00   9817807
2  2024-01-04  21.17  21.55  21.08  21.25  23605373
3  2024-01-05  21.25  21.38  21.04  21.38   9282598
4  2024-01-08  21.59  21.63  21.17  21.29  12398885
5  2024-01-09  21.21  21.29  20.96  21.00  15455964
6  2024-01-10  21.17  21.42  21.00  21.21  17610661
7  2024-01-11  21.34  21.63  21.13  2

### 2. Proces Data

In [3]:
def lag_transform_df(df: pd.DataFrame, lag: int):
    df_lag = df.copy()
    for col in df_lag.columns:
        for i in range(1, lag + 1):
            df_lag[f'{col}_lag_{i}'] = df_lag[col].shift(i)
    return df_lag.dropna()

df_train_lag = lag_transform_df(train_df.drop(["time"], axis=1), 5)
df_test_lag = lag_transform_df(test_df.drop(["time"], axis=1), 5)
print("train shape", df_train_lag.shape)
print(df_train_lag.head())
print("test shape", df_test_lag.shape)
print(df_test_lag.head())


train shape (1240, 30)
   open  high   low  close   volume  open_lag_1  open_lag_2  open_lag_3  \
5  7.60  7.73  7.57   7.71  2196358        8.34        7.68        7.44   
6  7.71  7.73  7.68   7.68  1378687        7.60        8.34        7.68   
7  7.71  7.73  7.65   7.68  1250077        7.71        7.60        8.34   
8  7.65  7.68  7.63   7.63   937155        7.71        7.71        7.60   
9  7.49  7.73  7.04   7.73   901146        7.65        7.71        7.71   

   open_lag_4  open_lag_5  ...  close_lag_1  close_lag_2  close_lag_3  \
5        6.99        7.86  ...         7.57         7.63         7.52   
6        7.44        6.99  ...         7.71         7.57         7.63   
7        7.68        7.44  ...         7.68         7.71         7.57   
8        8.34        7.68  ...         7.68         7.68         7.71   
9        7.60        8.34  ...         7.63         7.68         7.68   

   close_lag_4  close_lag_5  volume_lag_1  volume_lag_2  volume_lag_3  \
5         7.47

### 3. Train test split

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train, Y_train = df_train_lag.drop(["close"], axis=1), df_train_lag["close"]
X_test, Y_test = df_test_lag.drop(["close"], axis=1), df_test_lag["close"]
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)


### 4. Train

In [17]:
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

l1_ratio = [0.1, 0.3, 0.5, 0.7, 0.9, 1]
alphas = [0.01, 0.1, 1, 10, 100]
model = ElasticNetCV(cv=5, random_state=42, l1_ratio=l1_ratio, alphas=alphas, max_iter=10000)
model.fit(X_train_scale, Y_train)

print("Evaluate on Train Set")
X_train_predicted = model.predict(X_train_scale)
train_mse = mean_squared_error(Y_train, X_train_predicted)
print(f"Train Mean Squared Error: {train_mse}")

train_mape = mean_absolute_percentage_error(Y_train, X_train_predicted)
print(f"Train Mean Absolute Percentage Error: {train_mape}")

train_r2 = r2_score(Y_train, X_train_predicted)
print(f"Train R2 Score: {train_r2}")

print("--------------------------------")
print("Evaluate on Test Set")

X_test_predicted = model.predict(X_test_scale)
mse = mean_squared_error(Y_test, X_test_predicted)
print(f"Test Mean Squared Error: {mse}")

mape = mean_absolute_percentage_error(Y_test, X_test_predicted)
print(f"Test Mean Absolute Percentage Error: {mape}")

r2 = r2_score(Y_test, X_test_predicted)
print(f"Test R2 Score: {r2}")

Evaluate on Train Set
Train Mean Squared Error: 0.021700068714036794
Train Mean Absolute Percentage Error: 0.007523170472917357
Train R2 Score: 0.9990027553758026
--------------------------------
Evaluate on Test Set
Test Mean Squared Error: 0.028756893163902477
Test Mean Absolute Percentage Error: 0.005364546281526971
Test R2 Score: 0.9812220117576084


In [18]:
print("best alpha", model.alpha_)
print("best l1_ratio", model.l1_ratio_)


best alpha 0.01
best l1_ratio 1.0
