In [1]:
%pip install pandas scikit-learn pyarrow lightgbm

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb


In [3]:
df = pd.read_feather('Wide_S_P_100_format.feather')
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date')

print(df.shape)
print(df.columns[:5])
df.head()


(15928, 101)
Index(['AAPL', 'ABBV', 'ABT', 'ACN', 'ADBE'], dtype='object', name='Ticker')


Ticker,AAPL,ABBV,ABT,ACN,ADBE,AIG,AMD,AMGN,AMT,AMZN,...,TXN,UNH,UNP,UPS,USB,V,VZ,WFC,WMT,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1962-01-02,,,,,,,,,,,...,,,,,,,,,,
1962-01-03,,,,,,,,,,,...,,,,,,,,,,0.014852
1962-01-04,,,,,,,,,,,...,,,,,,,,,,0.002439
1962-01-05,,,,,,,,,,,...,,,,,,,,,,-0.021899
1962-01-08,,,,,,,,,,,...,,,,,,,,,,-0.002488


In [4]:
df.tail()

Ticker,AAPL,ABBV,ABT,ACN,ADBE,AIG,AMD,AMGN,AMT,AMZN,...,TXN,UNH,UNP,UPS,USB,V,VZ,WFC,WMT,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-04-08,-0.049818,-0.058322,-0.007288,-0.011696,-0.002055,0.001418,-0.064921,-0.032552,-0.040911,-0.026247,...,-0.051883,0.054088,-0.008096,-0.038091,-0.01087,-0.012968,-0.009629,-0.001448,-0.024335,-0.02108
2025-04-09,0.153289,0.023738,0.024929,0.065923,0.072471,0.066306,0.238205,0.038643,0.013779,0.11977,...,0.160879,0.045961,0.07327,0.084204,0.07967,0.078373,0.017074,0.06846,0.095488,0.049916
2025-04-10,-0.042394,-0.031361,-0.019994,-0.049076,-0.040012,-0.017749,-0.084056,-0.033254,0.004049,-0.051701,...,-0.076106,0.027485,-0.018834,-0.031106,-0.052163,-0.023524,0.016772,-0.048545,0.011272,-0.055482
2025-04-11,0.040594,0.004879,0.019116,-0.003085,0.006913,0.007621,0.052988,0.01624,0.031943,0.020141,...,-0.057471,0.00853,-0.003329,0.005385,0.016376,0.027079,0.018872,-0.009507,0.02417,0.032122
2025-04-14,0.022054,0.023136,0.008512,0.019167,-0.004426,-0.005734,0.011777,0.027764,0.021433,-0.014875,...,0.0229,-0.020702,0.016835,0.016378,0.00898,0.005339,0.013035,0.009438,0.020797,0.002424


In [5]:
n_lags = 60
# Step 1: split original series first
df_train = df[(df.index.year >= 2000) & (df.index.year <= 2012)]
df_calib = df[(df.index.year >= 2013) & (df.index.year <= 2019)]
df_test = df[(df.index.year >= 2020) & (df.index.year <= 2025)]

# Step 2: for each set, generate lagged data individually
def make_lagged_data(df_part, n_lags):
    full_rows = []
    for ticker in df_part.columns:
        series = df_part[ticker]
        lagged = pd.concat({f'lag_{i}': series.shift(i) for i in range(1, n_lags+1)}, axis=1)
        lagged['target'] = series
        lagged['ticker'] = ticker
        lagged['Date'] = series.index
        full_rows.append(lagged)
    return pd.concat(full_rows).dropna().reset_index(drop=True)

train_data = make_lagged_data(df_train, n_lags)
calib_data = make_lagged_data(df_calib, n_lags)
test_data = make_lagged_data(df_test, n_lags)

feature_cols = [f'lag_{i}' for i in range(1, n_lags+1)]

X_train = train_data[feature_cols]
y_train = train_data['target']

X_calib = calib_data[feature_cols]
y_calib = calib_data['target']

X_test = test_data[feature_cols]
y_test = test_data['target']

print (X_train.shape, X_calib.shape, X_test.shape)
print(X_train.head())
print(X_train.tail())
print(X_calib.head())
print(X_calib.tail())
print(X_test.head())
print(X_test.tail())

(285365, 60) (169568, 60) (127879, 60)
      lag_1     lag_2     lag_3     lag_4     lag_5     lag_6     lag_7  \
0 -0.003135  0.006310 -0.018577 -0.019938  0.068550  0.097053 -0.016000   
1 -0.022912 -0.003135  0.006310 -0.018577 -0.019938  0.068550  0.097053   
2 -0.074943 -0.022912 -0.003135  0.006310 -0.018577 -0.019938  0.068550   
3  0.080020 -0.074943 -0.022912 -0.003135  0.006310 -0.018577 -0.019938   
4 -0.018408  0.080020 -0.074943 -0.022912 -0.003135  0.006310 -0.018577   

      lag_8     lag_9    lag_10  ...    lag_51    lag_52    lag_53    lag_54  \
0  0.028278  0.045699  0.017505  ...  0.038114  0.109677 -0.059973 -0.051151   
1 -0.016000  0.028278  0.045699  ...  0.034848  0.038114  0.109677 -0.059973   
2  0.097053 -0.016000  0.028278  ...  0.025256  0.034848  0.038114  0.109677   
3  0.068550  0.097053 -0.016000  ...  0.065103  0.025256  0.034848  0.038114   
4 -0.019938  0.068550  0.097053  ... -0.019273  0.065103  0.025256  0.034848   

     lag_55    lag_56    lag_

In [6]:

param_grid = {
    "n_estimators":       [300, 400, 500],     # headroom for early-stop
    "learning_rate":      [0.01, 0.02, 0.03],
    "num_leaves":         [31, 63, 127, 255],  # primary depth control
    "max_depth":          [-1],                # unlimited (let num_leaves rule)
    "min_child_samples":  [20, 50, 100],
    "feature_fraction":   [0.7, 0.85, 1.0],    # ≈ colsample_bytree
    "bagging_fraction":   [0.7, 0.85, 1.0],    # ≈ subsample
    "bagging_freq":       [1],                 # activates bagging_fraction
    "reg_alpha":          [0.0, 0.1],
    "reg_lambda":         [1, 5, 10],
}

base_lgb = lgb.LGBMRegressor(
    objective="regression",
    device_type="gpu",
    gpu_use_dp=False,
    random_state=42,
    verbose=-1,
    n_jobs=1,
)

random_search = RandomizedSearchCV(
    estimator=base_lgb,
    param_distributions=param_grid,
    n_iter=200,
    scoring="neg_mean_squared_error",
    cv=5,
    verbose=3,
    random_state=42,
    n_jobs=1,                 
)

random_search.fit(X_train, y_train)

best_params = random_search.best_params_
print("Best hyperparameters found:")
print(random_search.best_params_)





Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV 1/5] END bagging_fraction=0.85, bagging_freq=1, feature_fraction=0.7, learning_rate=0.03, max_depth=-1, min_child_samples=100, n_estimators=400, num_leaves=31, reg_alpha=0.1, reg_lambda=10;, score=-0.001 total time=   9.2s
[CV 2/5] END bagging_fraction=0.85, bagging_freq=1, feature_fraction=0.7, learning_rate=0.03, max_depth=-1, min_child_samples=100, n_estimators=400, num_leaves=31, reg_alpha=0.1, reg_lambda=10;, score=-0.000 total time=   8.5s
[CV 3/5] END bagging_fraction=0.85, bagging_freq=1, feature_fraction=0.7, learning_rate=0.03, max_depth=-1, min_child_samples=100, n_estimators=400, num_leaves=31, reg_alpha=0.1, reg_lambda=10;, score=-0.001 total time=   8.5s
[CV 4/5] END bagging_fraction=0.85, bagging_freq=1, feature_fraction=0.7, learning_rate=0.03, max_depth=-1, min_child_samples=100, n_estimators=400, num_leaves=31, reg_alpha=0.1, reg_lambda=10;, score=-0.001 total time=   8.6s
[CV 5/5] END bagging_fractio

In [7]:
# Normal point prediction model (MSE)
gbr = lgb.LGBMRegressor(
    objective='regression',
    random_state=42,
    **best_params
)
gbr.fit(X_train, y_train)

# Lower quantile model (5%)
gbr_lower = lgb.LGBMRegressor(
    objective='quantile',
    alpha=0.05,
    random_state=42,
    **best_params
)
gbr_lower.fit(X_train, y_train)

# Upper quantile model (95%)
gbr_upper = lgb.LGBMRegressor(
    objective='quantile',
    alpha=0.95,
    random_state=42,
    **best_params
)
gbr_upper.fit(X_train, y_train)



In [None]:
pred_calib = gbr.predict(X_calib)
pred_test = gbr.predict(X_test)

pred_lower_calib = gbr_lower.predict(X_calib)
pred_upper_calib = gbr_upper.predict(X_calib)

pred_lower_test = gbr_lower.predict(X_test)
pred_upper_test = gbr_upper.predict(X_test)


In [15]:
# Extract aligned metadata
dates_calib = calib_data["Date"].reset_index(drop=True)
dates_test = test_data["Date"].reset_index(drop=True)
tickers_calib = calib_data["ticker"].reset_index(drop=True)
tickers_test = test_data["ticker"].reset_index(drop=True)

# Save point predictions (calibration)
df_pred_calib = pd.DataFrame({
    'Date': dates_calib,
    'Ticker': tickers_calib,
    'pred_calib': pred_calib,
    'y_calib': y_calib.reset_index(drop=True)
})
df_pred_calib.to_csv('pred_calib.csv', index=False)

# Save point predictions (test)
df_pred_test = pd.DataFrame({
    'Date': dates_test,
    'Ticker': tickers_test,
    'pred_test': pred_test,
    'y_test': y_test.reset_index(drop=True)
})
df_pred_test.to_csv('pred_test.csv', index=False)

# Save quantile predictions (calibration)
df_quantile_calib = pd.DataFrame({
    'Date': dates_calib,
    'Ticker': tickers_calib,
    'pred_lower_calib': pred_lower_calib,
    'pred_upper_calib': pred_upper_calib,
    'y_calib': y_calib.reset_index(drop=True)
})
df_quantile_calib.to_csv('quantile_calib.csv', index=False)

# Save quantile predictions (test)
df_quantile_test = pd.DataFrame({
    'Date': dates_test,
    'Ticker': tickers_test,
    'pred_lower_test': pred_lower_test,
    'pred_upper_test': pred_upper_test,
    'y_test': y_test.reset_index(drop=True)
})
df_quantile_test.to_csv('quantile_test.csv', index=False)
