In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
# Columns that are scaled, lagged into features, and predicted in later cells.
target_columns = ['open', 'high', 'low', 'close', 'volume']

# Read the GAS train/test CSVs; the *_raw frames are kept exactly as loaded.
df_train_full_raw = pd.read_csv('../data/vn30/GAS_train.csv')
df_test_raw = pd.read_csv('../data/vn30/GAS_test.csv')

# Working copies, so later cells can modify them without touching the raw frames.
df_train_full, df_test = df_train_full_raw.copy(), df_test_raw.copy()


In [2]:
# Positional (unshuffled) split of the full training frame:
# the first 70% of rows become the training split, the remainder validation.
n_total_train = df_train_full.shape[0]
n_train_split = int(0.7 * n_total_train)

# Independent copies, so scaling them later does not modify df_train_full.
df_train = df_train_full.head(n_train_split).copy()
df_valid = df_train_full.iloc[n_train_split:].copy()


In [3]:
# Fit the scaler on the training split only, then apply those same statistics
# to the training, validation and test frames (overwriting target_columns).
scaler = StandardScaler().fit(df_train[target_columns])

for split_frame in (df_train, df_valid, df_test):
    split_frame[target_columns] = scaler.transform(split_frame[target_columns])

In [4]:
# Second scaling pass for the final models: the scaler is fitted on the whole
# training frame and then applied to a fresh copy of the raw test frame.
df_train_full_scaled = df_train_full.copy()
df_test_for_final_eval = df_test_raw.copy()

scaler_for_full_train = StandardScaler().fit(df_train_full_scaled[target_columns])
for frame_to_scale in (df_train_full_scaled, df_test_for_final_eval):
    frame_to_scale[target_columns] = scaler_for_full_train.transform(frame_to_scale[target_columns])


In [5]:
def create_lagged_features(df, target_cols, n_lags=4):
    """Return a copy of ``df`` extended with lagged versions of ``target_cols``.

    For each column in ``target_cols`` and each lag ``k`` in ``1..n_lags`` a
    column named ``'<col>_lag_<k>'`` is added, holding ``col`` shifted down by
    ``k`` rows (so its first ``k`` entries are missing). The input frame itself
    is not modified.

    Parameters
    ----------
    df : DataFrame containing every column listed in ``target_cols``.
    target_cols : iterable of column names to lag.
    n_lags : number of lags to create per column (default 4).

    Returns
    -------
    (DataFrame, list[str])
        The extended copy and the names of the added lag columns, ordered
        column by column and, within a column, by increasing lag.
    """
    lagged_frame = df.copy()
    lag_names = []
    # Lazy (column, lag) pairs: same iteration order as two nested loops.
    pairs = ((col, lag) for col in target_cols for lag in range(1, n_lags + 1))
    for col, lag in pairs:
        name = f'{col}_lag_{lag}'
        lagged_frame[name] = lagged_frame[col].shift(lag)
        lag_names.append(name)
    return lagged_frame, lag_names

n_lags = 4

# Lag features are built separately for each frame, so the first n_lags rows of
# every frame have missing lag values; dropna() removes any row with a missing
# value. features_columns (the lag column names) is taken from the first call.

# Training / validation splits (scaled with `scaler`).
df_train_featured, features_columns = create_lagged_features(df_train, target_columns, n_lags=n_lags)
df_train_featured = df_train_featured.dropna()

df_valid_featured, _ = create_lagged_features(df_valid, target_columns, n_lags=n_lags)
df_valid_featured = df_valid_featured.dropna()

# Test frame and full training frame (scaled with `scaler_for_full_train`).
df_test_featured, _ = create_lagged_features(df_test_for_final_eval, target_columns, n_lags=n_lags)
df_test_featured = df_test_featured.dropna()

df_train_full_featured, _ = create_lagged_features(df_train_full_scaled, target_columns, n_lags=n_lags)
df_train_full_featured = df_train_full_featured.dropna()

In [6]:
# Inputs are the lag columns; targets are the same-row values of target_columns.

# Splits used for depth selection.
X_train, Y_train = df_train_featured[features_columns], df_train_featured[target_columns]
X_valid, Y_valid = df_valid_featured[features_columns], df_valid_featured[target_columns]

# Frames used for the final fit and the test evaluation.
X_train_full, Y_train_full = df_train_full_featured[features_columns], df_train_full_featured[target_columns]
X_test, Y_test = df_test_featured[features_columns], df_test_featured[target_columns]


In [7]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_percentage_error

# Decision tree: pick max_depth by R2 on the validation split.
list_depth_dt = list(range(5, 31, 5))  # 5, 10, ..., 30
best_depth_dt = None
best_model_dt_val = None
best_r2_score_dt = float('-inf')

for depth in list_depth_dt:
    model_dt = DecisionTreeRegressor(max_depth=depth, random_state=42).fit(X_train, Y_train)
    Y_pred_valid_dt = model_dt.predict(X_valid)
    current_r2_score_dt = r2_score(Y_valid, Y_pred_valid_dt)
    print(f"DT Depth: {depth}, Validation R2 Score: {current_r2_score_dt}")
    # Strict comparison: on a tie the earlier (shallower) depth is kept.
    if current_r2_score_dt > best_r2_score_dt:
        best_r2_score_dt, best_depth_dt, best_model_dt_val = current_r2_score_dt, depth, model_dt

print(f"Best DT Depth: {best_depth_dt}, Best Validation R2 Score: {best_r2_score_dt}")

DT Depth: 5, Validation R2 Score: 0.26538253618095753
DT Depth: 10, Validation R2 Score: 0.08295584882213883
DT Depth: 15, Validation R2 Score: 0.11154339410096252
DT Depth: 20, Validation R2 Score: 0.1346774941359266
DT Depth: 25, Validation R2 Score: 0.1346774941359266
DT Depth: 30, Validation R2 Score: 0.1346774941359266
Best DT Depth: 5, Best Validation R2 Score: 0.26538253618095753


In [8]:
import numpy as np
# Final decision tree: refit with the depth selected on the validation split,
# this time on the full training data (scaled with scaler_for_full_train).
final_model_dt = DecisionTreeRegressor(max_depth=best_depth_dt, random_state=42)
final_model_dt.fit(X_train_full, Y_train_full)

Y_pred_test_dt_scaled = final_model_dt.predict(X_test)

# R2 is computed in scaled space (Y_test is scaled with scaler_for_full_train).
r2_test_dt = r2_score(Y_test, Y_pred_test_dt_scaled)

# MAPE is computed on the original scale, so undo the scaling on both sides.
Y_pred_test_dt_orig = scaler_for_full_train.inverse_transform(Y_pred_test_dt_scaled)
Y_test_orig = scaler_for_full_train.inverse_transform(Y_test)

# No `except ValueError` fallback: sklearn's MAPE already guards zero
# denominators with a small epsilon, so zeros in the targets do not raise.
# A ValueError here signals invalid inputs (e.g. NaN/inf or mismatched shapes)
# and should surface instead of being replaced by a silently computed number.
mape_test_dt = mean_absolute_percentage_error(Y_test_orig, Y_pred_test_dt_orig)

print(f"R2 Score: {r2_test_dt}")
print(f"MAPE: {mape_test_dt}")

R2 Score: 0.5992072764228455
MAPE: 0.13441535066950847


In [9]:
from sklearn.ensemble import RandomForestRegressor
# Random forest: pick max_depth by R2 on the validation split.
list_depth_rf = list(range(5, 31, 5))  # 5, 10, ..., 30
best_depth_rf = None
best_model_rf_val = None
best_r2_score_rf = float('-inf')

for depth in list_depth_rf:
    model_rf = RandomForestRegressor(
        max_depth=depth, n_estimators=100, random_state=42, n_jobs=-1
    ).fit(X_train, Y_train)
    Y_pred_valid_rf = model_rf.predict(X_valid)
    current_r2_score_rf = r2_score(Y_valid, Y_pred_valid_rf)
    print(f"RF Depth: {depth}, Validation R2 Score: {current_r2_score_rf}")
    # Strict comparison: on a tie the earlier (shallower) depth is kept.
    if current_r2_score_rf > best_r2_score_rf:
        best_r2_score_rf, best_depth_rf, best_model_rf_val = current_r2_score_rf, depth, model_rf

print(f"Best RF Depth: {best_depth_rf}, Best Validation R2 Score: {best_r2_score_rf}")

RF Depth: 5, Validation R2 Score: 0.3990696665479205
RF Depth: 10, Validation R2 Score: 0.3421667262430993
RF Depth: 15, Validation R2 Score: 0.3203966636913713
RF Depth: 20, Validation R2 Score: 0.33860775526566
RF Depth: 25, Validation R2 Score: 0.33860775526566
RF Depth: 30, Validation R2 Score: 0.33860775526566006
Best RF Depth: 5, Best Validation R2 Score: 0.3990696665479205


In [10]:
# Final random forest: refit with the depth selected on the validation split,
# this time on the full training data (scaled with scaler_for_full_train).
final_model_rf = RandomForestRegressor(max_depth=best_depth_rf, n_estimators=100, random_state=42, n_jobs=-1)
final_model_rf.fit(X_train_full, Y_train_full)

Y_pred_test_rf_scaled = final_model_rf.predict(X_test)

# R2 is computed in scaled space (Y_test is scaled with scaler_for_full_train).
r2_test_rf = r2_score(Y_test, Y_pred_test_rf_scaled)

# MAPE is computed on the original scale, so undo the scaling on both sides.
# Y_test_orig is recomputed here so this cell does not rely on the
# decision-tree evaluation cell having been run first.
Y_pred_test_rf_orig = scaler_for_full_train.inverse_transform(Y_pred_test_rf_scaled)
Y_test_orig = scaler_for_full_train.inverse_transform(Y_test)

# No `except ValueError` fallback: sklearn's MAPE already guards zero
# denominators with a small epsilon, so zeros in the targets do not raise.
# A ValueError here signals invalid inputs (e.g. NaN/inf or mismatched shapes)
# and should surface instead of being replaced by a silently computed number.
mape_test_rf = mean_absolute_percentage_error(Y_test_orig, Y_pred_test_rf_orig)

print(f"R2 Score: {r2_test_rf}")
print(f"MAPE: {mape_test_rf}")

R2 Score: 0.7494479782452395
MAPE: 0.10685065752598308
