# 04 Model Training

1. A baseline forecast (persistence)

2. A Ridge regression model (strong first ML model)

3. Evaluation metrics (MAE, RMSE)

4. Saved predictions for Notebook 05 backtesting/evaluation

In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [2]:
train = pd.read_csv("../data_processed/train_regression.csv", index_col=0, parse_dates=True)
test  = pd.read_csv("../data_processed/test_regression.csv", index_col=0, parse_dates=True)

train.shape, test.shape


((3443, 17), (497, 17))

### 1. Define the features

In [4]:
target = "y_log"
exclude = ["y", "y_log"]
feature_cols = [col for col in train.columns if col not in exclude]

X_train = train[feature_cols]
y_train = train[target]

X_test = test[feature_cols]
y_test = test[target]

len(feature_cols), feature_cols


(15,
 ['realized_vol',
  'rv_lag1',
  'rv_lag5',
  'rv_lag20',
  'rv_change1',
  'rv_change5',
  'ret_lag1',
  'ret_lag5',
  'abs_ret',
  'abs_ret_mean5',
  'abs_ret_mean20',
  'vol_of_vol_20',
  'vol_of_vol_60',
  'drawdown_60',
  'ma_ratio_20_60'])

### 2. Baseline model
Since we assume volatility is persistent in base line so
- predict future volatility equals current realized volatility.

In [5]:
baseline_pred_log = np.log(test["realized_vol"])
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

baseline_mae = mean_absolute_error(y_test, baseline_pred_log)
baseline_rmse = rmse(y_test, baseline_pred_log)

baseline_mae, baseline_rmse

(0.14834675033226374, np.float64(0.21069399115704218))

### 3. Train Ridge regression
- we use ridge regression to prevent overfitting. 
- it shrink the coefficient towards 0 but add more bias

In [6]:
model = Pipeline([
    ("scaler", StandardScaler()),
    ("ridge", Ridge(alpha=1.0))
])

model.fit(X_train, y_train)

ml_pred_log = model.predict(X_test)

ml_mae = mean_absolute_error(y_test, ml_pred_log)
ml_rmse = rmse(y_test, ml_pred_log)

ml_mae, ml_rmse

(0.17768145917190162, np.float64(0.24073461202758656))

### 4. Comparing the baseline vs the model
- model should outperform the baseline

In [7]:
results = pd.DataFrame({
    "model": ["baseline_persistence", "ridge_regression"],
    "MAE(log-vol)": [baseline_mae, ml_mae],
    "RMSE(log-vol)": [baseline_rmse, ml_rmse]
})

results

Unnamed: 0,model,MAE(log-vol),RMSE(log-vol)
0,baseline_persistence,0.148347,0.210694
1,ridge_regression,0.177681,0.240735


### 5. Tune Ridge alpha
- now the baseline outperforms the model so we must tune the model

In [8]:
alphas = [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]
rows = []

for a in alphas:
    m = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=a))
    ])
    m.fit(X_train, y_train)
    pred = m.predict(X_test)
    rows.append({
        "alpha": a,
        "MAE(log-vol)": mean_absolute_error(y_test, pred),
        "RMSE(log-vol)": rmse(y_test, pred)
    })

pd.DataFrame(rows).sort_values("MAE(log-vol)")

Unnamed: 0,alpha,MAE(log-vol),RMSE(log-vol)
6,1000.0,0.166821,0.21339
5,100.0,0.173333,0.22603
4,10.0,0.175765,0.236306
3,1.0,0.177681,0.240735
2,0.1,0.177936,0.241334
1,0.01,0.177962,0.241396
0,0.001,0.177965,0.241402
