In [1]:
# Install the third-party packages this notebook relies on. %pip (rather than !pip)
# installs into the environment of the running kernel.
%pip install pandas scikit-learn pyarrow lightgbm

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb


In [None]:
# Load the wide-format table (one column per ticker) and index it by date.
# The path uses forward slashes: in a normal (non-raw) string literal '\F' and '\W'
# are invalid escape sequences (a warning today, an error in future Python versions),
# and a backslash-separated path only resolves on Windows. Forward slashes work on
# every platform, including Windows.
df = pd.read_feather('data/Financial Timeseries/Wide_S_P_100_format.feather')
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date')

# Quick sanity check: dimensions, first few ticker names, first rows.
print(df.shape)
print(df.columns[:5])
df.head()


(15928, 101)
Index(['AAPL', 'ABBV', 'ABT', 'ACN', 'ADBE'], dtype='object', name='Ticker')


Ticker,AAPL,ABBV,ABT,ACN,ADBE,AIG,AMD,AMGN,AMT,AMZN,...,TXN,UNH,UNP,UPS,USB,V,VZ,WFC,WMT,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1962-01-02,,,,,,,,,,,...,,,,,,,,,,
1962-01-03,,,,,,,,,,,...,,,,,,,,,,0.014852
1962-01-04,,,,,,,,,,,...,,,,,,,,,,0.002439
1962-01-05,,,,,,,,,,,...,,,,,,,,,,-0.021899
1962-01-08,,,,,,,,,,,...,,,,,,,,,,-0.002488


In [None]:
# Number of past observations used as features for every row.
n_lags = 240

# Column labels for the lagged features, in the order lag_1 .. lag_<n_lags>.
lag_names = [f'lag_{i}' for i in range(1, n_lags+1)]

full_rows = []

for ticker in df.columns:
    series = df[ticker]
    # Column lag_k holds the series shifted down by k rows, i.e. the value k rows earlier.
    shifted = [series.shift(k) for k in range(1, n_lags+1)]
    per_ticker = pd.concat(shifted, axis=1, keys=lag_names).assign(
        target=series,       # unshifted value of the same row
        ticker=ticker,
        Date=series.index,
    )
    full_rows.append(per_ticker)

# Stack all tickers, drop every row with a missing value (this includes the first
# n_lags rows of each ticker, whose lags are undefined), and renumber the rows.
full_data = pd.concat(full_rows).dropna().reset_index(drop=True)

print(full_data.shape)
full_data.head()


(1029966, 63)


Unnamed: 0,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,lag_10,...,lag_54,lag_55,lag_56,lag_57,lag_58,lag_59,lag_60,target,ticker,Date
0,-0.038886,-0.047626,-0.078044,-0.009661,-0.004807,-0.009532,-0.014083,0.004716,0.034153,0.014851,...,0.042199,0.04867,0.061029,0.028992,0.024751,-0.073398,-0.05217,0.04046,AAPL,1981-03-12
1,0.04046,-0.038886,-0.047626,-0.078044,-0.009661,-0.004807,-0.009532,-0.014083,0.004716,0.034153,...,0.052628,0.042199,0.04867,0.061029,0.028992,0.024751,-0.073398,-0.01111,AAPL,1981-03-13
2,-0.01111,0.04046,-0.038886,-0.047626,-0.078044,-0.009661,-0.004807,-0.009532,-0.014083,0.004716,...,0.092309,0.052628,0.042199,0.04867,0.061029,0.028992,0.024751,0.039334,AAPL,1981-03-16
3,0.039334,-0.01111,0.04046,-0.038886,-0.047626,-0.078044,-0.009661,-0.004807,-0.009532,-0.014083,...,0.014083,0.092309,0.052628,0.042199,0.04867,0.061029,0.028992,0.048646,AAPL,1981-03-17
4,0.048646,0.039334,-0.01111,0.04046,-0.038886,-0.047626,-0.078044,-0.009661,-0.004807,-0.009532,...,-0.024303,0.014083,0.092309,0.052628,0.042199,0.04867,0.061029,0.061852,AAPL,1981-03-18


In [5]:
# Split by calendar year of 'Date' (both bounds inclusive):
#   train 2000-2012, calibration 2013-2019, test 2020-2025.
train_data = full_data[full_data['Date'].dt.year.between(2000, 2012)]
calib_data = full_data[full_data['Date'].dt.year.between(2013, 2019)]
test_data = full_data[full_data['Date'].dt.year.between(2020, 2025)]

# Report the size of each split.
for split_label, split_frame in (
    ('Train:', train_data),
    ('Calibration:', calib_data),
    ('Test:', test_data),
):
    print(split_label, split_frame.shape)


Train: (290140, 63)
Calibration: (175448, 63)
Test: (133879, 63)


In [6]:
# Model inputs are the lag columns only; 'ticker' and 'Date' are not used as features.
feature_cols = [f'lag_{i}' for i in range(1, n_lags+1)]

# Feature matrix / target vector for each of the three splits.
X_train, y_train = train_data.loc[:, feature_cols], train_data.loc[:, 'target']
X_calib, y_calib = calib_data.loc[:, feature_cols], calib_data.loc[:, 'target']
X_test, y_test = test_data.loc[:, feature_cols], test_data.loc[:, 'target']


In [None]:
# Candidate values for the randomized hyperparameter search. RandomizedSearchCV samples
# n_iter combinations from these lists rather than enumerating the full grid.
param_grid = {
    'n_estimators': [300, 400, 500],          # number of boosting rounds (trees)
    'learning_rate': [0.02, 0.03, 0.05],
    'max_depth': [5, 6, 7, 8, 10, 12, 14, 15, 20],
    # NOTE(review): with the default min_child_samples this constraint may rarely bind
    # for the plain regression objective -- confirm it is worth searching over.
    'min_child_weight': [1, 3, 5],
    'subsample': [0.7, 0.8, 1.0],
    # LightGBM only performs row subsampling (bagging) when subsample_freq > 0. With the
    # default of 0 the 'subsample' values above are silently ignored. It is placed in the
    # search space (single value) so that it is also carried into best_params.
    'subsample_freq': [1],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'reg_alpha': [0.0, 0.1],
    'reg_lambda': [1, 5],
}


# Each fit is single-threaded (n_jobs=1); parallelism comes from the search (n_jobs=-1).
base_lgb = lgb.LGBMRegressor(objective='regression', random_state=42, verbose = -1, n_jobs = 1)

# NOTE(review): cv=5 gives an unshuffled K-fold over row positions, which does not
# respect time order -- consider a date-based splitter for this time-series data.
random_search = RandomizedSearchCV(
    estimator=base_lgb,
    param_distributions=param_grid,
    n_iter=200,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=3,
    random_state=42,
    n_jobs=-1
)

# 200 candidates x 5 folds = 1000 model fits; this is the expensive step of the notebook.
random_search.fit(X_train, y_train)

# Later cells read best_params, so this cell must complete before they are run.
best_params = random_search.best_params_
print("Best hyperparameters found:")
print(best_params)



Fitting 5 folds for each of 200 candidates, totalling 1000 fits


KeyboardInterrupt: 

In [None]:
# Settings shared by all three models: a fixed seed plus the tuned hyperparameters.
shared_kwargs = dict(random_state=42, **best_params)

# Point-prediction model (MSE objective)
gbr = lgb.LGBMRegressor(objective='regression', **shared_kwargs)
gbr.fit(X_train, y_train)

# 5% quantile model: lower bound of the prediction band
gbr_lower = lgb.LGBMRegressor(objective='quantile', alpha=0.05, **shared_kwargs)
gbr_lower.fit(X_train, y_train)

# 95% quantile model: upper bound of the prediction band
gbr_upper = lgb.LGBMRegressor(objective='quantile', alpha=0.95, **shared_kwargs)
gbr_upper.fit(X_train, y_train)



In [None]:
# Point predictions on the calibration and test splits.
pred_calib, pred_test = gbr.predict(X_calib), gbr.predict(X_test)

# Lower / upper quantile predictions on the calibration split ...
pred_lower_calib, pred_upper_calib = gbr_lower.predict(X_calib), gbr_upper.predict(X_calib)

# ... and on the test split.
pred_lower_test, pred_upper_test = gbr_lower.predict(X_test), gbr_upper.predict(X_test)


In [None]:
# Point predictions next to the true targets, for each evaluation split.
df_pred_calib = pd.DataFrame({'pred_calib': pred_calib, 'y_calib': y_calib.values})
df_pred_test = pd.DataFrame({'pred_test': pred_test, 'y_test': y_test.values})

# Quantile bounds next to the true targets, for each evaluation split.
df_quantile_calib = pd.DataFrame({
    'pred_lower_calib': pred_lower_calib,
    'pred_upper_calib': pred_upper_calib,
    'y_calib': y_calib.values,
})
df_quantile_test = pd.DataFrame({
    'pred_lower_test': pred_lower_test,
    'pred_upper_test': pred_upper_test,
    'y_test': y_test.values,
})

# Write each table to the working directory without the row index.
for out_frame, out_path in (
    (df_pred_calib, 'pred_calib.csv'),
    (df_pred_test, 'pred_test.csv'),
    (df_quantile_calib, 'quantile_calib.csv'),
    (df_quantile_test, 'quantile_test.csv'),
):
    out_frame.to_csv(out_path, index=False)
