In [3]:
pip install catboost

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import sys
sys.path.append("../src")
import os

import pandas as pd
import numpy as np
from etf_transformations import *
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from scipy.stats import randint, uniform

In [5]:
df = pd.read_csv('../data/preprocessed/agg_emb/XLE_v4.csv')

df = sign_next_day(df)
df = drop_sign_and_return(df)

df.head()

Unnamed: 0,Date,Price,avg_positive_XLE,avg_neutral_XLE,avg_negative_XLE,n_XLE,sent_index_XLE,emb_0,emb_1,emb_2,...,emb_376,emb_377,emb_378,emb_379,emb_380,emb_381,emb_382,emb_383,is_trading_day,Sign_next_day
0,2018-03-20,47.900124,0.617495,0.364768,0.017737,3.0,0.599758,-0.034911,-0.009287,0.05663,...,-0.037106,0.01839,-0.048215,0.002478,-0.04092,-0.071706,0.038703,0.072361,1.0,1.0
1,2018-03-21,49.154427,0.072957,0.896156,0.030887,2.0,0.042071,-0.085665,0.049102,0.056413,...,-0.023607,-0.038444,-0.080668,-0.023549,0.033013,-0.076677,0.025544,0.023494,1.0,-1.0
2,2018-03-22,48.150974,0.088483,0.875179,0.036338,2.0,0.052145,0.009761,0.079683,-0.018089,...,-0.021881,0.021531,0.038033,0.0021,0.021596,-0.00276,-0.006045,-0.022443,1.0,-1.0
3,2018-03-23,47.857117,0.427724,0.518749,0.053528,2.0,0.374196,-0.083923,0.038409,0.053812,...,-0.066203,0.016252,-0.058235,0.00156,-0.006853,-0.029184,0.048074,0.090239,1.0,1.0
4,2018-03-26,48.717216,0.211071,0.502181,0.286748,3.0,-0.075677,-0.023936,0.040512,-0.017666,...,-0.049532,0.031493,-0.053943,-0.040137,-0.023122,-0.080216,-0.028291,0.058632,1.0,-1.0


In [6]:
df["Date"] = pd.to_datetime(df["Date"]).dt.normalize()
df = df.sort_values("Date").reset_index(drop=True)

df["Return_next_day"] = np.log(df["Price"].shift(-1) / df["Price"])
df["AbsReturn_next_day"] = df["Return_next_day"].abs()

df["no_news"] = df["n_XLE"].isna().astype(int) # a day where there were no sector specific news
df.head(10)


Unnamed: 0,Date,Price,avg_positive_XLE,avg_neutral_XLE,avg_negative_XLE,n_XLE,sent_index_XLE,emb_0,emb_1,emb_2,...,emb_379,emb_380,emb_381,emb_382,emb_383,is_trading_day,Sign_next_day,Return_next_day,AbsReturn_next_day,no_news
0,2018-03-20,47.900124,0.617495,0.364768,0.017737,3.0,0.599758,-0.034911,-0.009287,0.05663,...,0.002478,-0.04092,-0.071706,0.038703,0.072361,1.0,1.0,0.025849,0.025849,0
1,2018-03-21,49.154427,0.072957,0.896156,0.030887,2.0,0.042071,-0.085665,0.049102,0.056413,...,-0.023549,0.033013,-0.076677,0.025544,0.023494,1.0,-1.0,-0.020626,0.020626,0
2,2018-03-22,48.150974,0.088483,0.875179,0.036338,2.0,0.052145,0.009761,0.079683,-0.018089,...,0.0021,0.021596,-0.00276,-0.006045,-0.022443,1.0,-1.0,-0.006122,0.006122,0
3,2018-03-23,47.857117,0.427724,0.518749,0.053528,2.0,0.374196,-0.083923,0.038409,0.053812,...,0.00156,-0.006853,-0.029184,0.048074,0.090239,1.0,1.0,0.017813,0.017813,0
4,2018-03-26,48.717216,0.211071,0.502181,0.286748,3.0,-0.075677,-0.023936,0.040512,-0.017666,...,-0.040137,-0.023122,-0.080216,-0.028291,0.058632,1.0,-1.0,-0.009461,0.009461,0
5,2018-03-27,48.258488,0.103827,0.877021,0.019152,2.0,0.084675,-0.041524,-0.013985,0.035771,...,-0.031458,-0.036633,-0.07415,-0.019721,0.054707,1.0,-1.0,-0.019648,0.019648,0
6,2018-03-28,47.319557,,,,,,,,,...,,,,,,,1.0,0.020836,0.020836,1
7,2018-03-29,48.315845,0.015619,0.060177,0.924203,2.0,-0.908584,0.043158,0.042428,0.058186,...,-0.072819,-0.024646,-0.081047,0.048397,0.021529,1.0,-1.0,-0.02023,0.02023,0
8,2018-04-02,47.348217,0.552455,0.306189,0.141357,8.0,0.411098,-0.012684,-0.031329,0.033414,...,-0.021097,-0.055515,-0.058128,0.014756,0.053678,1.0,1.0,0.02112,0.02112,0
9,2018-04-03,48.358829,0.866997,0.101776,0.031226,2.0,0.835771,-0.028622,-0.046466,0.002435,...,0.002219,0.006495,-0.081839,0.05789,0.046853,1.0,-1.0,-0.001335,0.001335,0


In [7]:
q = 0.70  # try 0.70, 0.75, 0.80
tau = df["AbsReturn_next_day"].quantile(q)
df["HighVol_next_day"] = (df["AbsReturn_next_day"] > tau).astype(int)


df["abs_ret_t"] = np.log(df["Price"] / df["Price"].shift(1)).abs()
df["vol_5"] = df["abs_ret_t"].rolling(5).mean()


df = df.dropna(subset=["HighVol_next_day"])
df.head(10)

Unnamed: 0,Date,Price,avg_positive_XLE,avg_neutral_XLE,avg_negative_XLE,n_XLE,sent_index_XLE,emb_0,emb_1,emb_2,...,emb_382,emb_383,is_trading_day,Sign_next_day,Return_next_day,AbsReturn_next_day,no_news,HighVol_next_day,abs_ret_t,vol_5
0,2018-03-20,47.900124,0.617495,0.364768,0.017737,3.0,0.599758,-0.034911,-0.009287,0.05663,...,0.038703,0.072361,1.0,1.0,0.025849,0.025849,0,1,,
1,2018-03-21,49.154427,0.072957,0.896156,0.030887,2.0,0.042071,-0.085665,0.049102,0.056413,...,0.025544,0.023494,1.0,-1.0,-0.020626,0.020626,0,1,0.025849,
2,2018-03-22,48.150974,0.088483,0.875179,0.036338,2.0,0.052145,0.009761,0.079683,-0.018089,...,-0.006045,-0.022443,1.0,-1.0,-0.006122,0.006122,0,0,0.020626,
3,2018-03-23,47.857117,0.427724,0.518749,0.053528,2.0,0.374196,-0.083923,0.038409,0.053812,...,0.048074,0.090239,1.0,1.0,0.017813,0.017813,0,1,0.006122,
4,2018-03-26,48.717216,0.211071,0.502181,0.286748,3.0,-0.075677,-0.023936,0.040512,-0.017666,...,-0.028291,0.058632,1.0,-1.0,-0.009461,0.009461,0,0,0.017813,
5,2018-03-27,48.258488,0.103827,0.877021,0.019152,2.0,0.084675,-0.041524,-0.013985,0.035771,...,-0.019721,0.054707,1.0,-1.0,-0.019648,0.019648,0,1,0.009461,0.015974
6,2018-03-28,47.319557,,,,,,,,,...,,,,1.0,0.020836,0.020836,1,1,0.019648,0.014734
7,2018-03-29,48.315845,0.015619,0.060177,0.924203,2.0,-0.908584,0.043158,0.042428,0.058186,...,0.048397,0.021529,1.0,-1.0,-0.02023,0.02023,0,1,0.020836,0.014776
8,2018-04-02,47.348217,0.552455,0.306189,0.141357,8.0,0.411098,-0.012684,-0.031329,0.033414,...,0.014756,0.053678,1.0,1.0,0.02112,0.02112,0,1,0.02023,0.017598
9,2018-04-03,48.358829,0.866997,0.101776,0.031226,2.0,0.835771,-0.028622,-0.046466,0.002435,...,0.05789,0.046853,1.0,-1.0,-0.001335,0.001335,0,0,0.02112,0.018259


In [8]:
df.isna().sum()

Date                   0
Price                  0
avg_positive_XLE      23
avg_neutral_XLE       23
avg_negative_XLE      23
                      ..
AbsReturn_next_day     1
no_news                0
HighVol_next_day       0
abs_ret_t              1
vol_5                  5
Length: 399, dtype: int64

In [9]:
rows_with_nan = df[df.isna().any(axis=1)]
rows_with_nan


Unnamed: 0,Date,Price,avg_positive_XLE,avg_neutral_XLE,avg_negative_XLE,n_XLE,sent_index_XLE,emb_0,emb_1,emb_2,...,emb_382,emb_383,is_trading_day,Sign_next_day,Return_next_day,AbsReturn_next_day,no_news,HighVol_next_day,abs_ret_t,vol_5
0,2018-03-20,47.900124,0.617495,0.364768,0.017737,3.0,0.599758,-0.034911,-0.009287,0.05663,...,0.038703,0.072361,1.0,1.0,0.025849,0.025849,0,1,,
1,2018-03-21,49.154427,0.072957,0.896156,0.030887,2.0,0.042071,-0.085665,0.049102,0.056413,...,0.025544,0.023494,1.0,-1.0,-0.020626,0.020626,0,1,0.025849,
2,2018-03-22,48.150974,0.088483,0.875179,0.036338,2.0,0.052145,0.009761,0.079683,-0.018089,...,-0.006045,-0.022443,1.0,-1.0,-0.006122,0.006122,0,0,0.020626,
3,2018-03-23,47.857117,0.427724,0.518749,0.053528,2.0,0.374196,-0.083923,0.038409,0.053812,...,0.048074,0.090239,1.0,1.0,0.017813,0.017813,0,1,0.006122,
4,2018-03-26,48.717216,0.211071,0.502181,0.286748,3.0,-0.075677,-0.023936,0.040512,-0.017666,...,-0.028291,0.058632,1.0,-1.0,-0.009461,0.009461,0,0,0.017813,
6,2018-03-28,47.319557,,,,,,,,,...,,,,1.0,0.020836,0.020836,1,1,0.019648,0.014734
30,2018-05-02,52.795506,,,,,,,,,...,,,,-1.0,-0.002311,0.002311,1,0,0.004081,0.007084
83,2018-07-18,54.212856,,,,,,,,,...,,,,1.0,0.000266,0.000266,1,0,0.000666,0.004647
85,2018-07-20,54.039669,,,,,,,,,...,,,,-1.0,-0.004416,0.004416,1,0,0.003466,0.003998
94,2018-08-02,54.573631,,,,,,,,,...,,,,-1.0,-0.00517,0.00517,1,0,0.005538,0.006762


In [10]:
df = df.dropna()
df.isna().sum()

Date                  0
Price                 0
avg_positive_XLE      0
avg_neutral_XLE       0
avg_negative_XLE      0
                     ..
AbsReturn_next_day    0
no_news               0
HighVol_next_day      0
abs_ret_t             0
vol_5                 0
Length: 399, dtype: int64

In [11]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from scipy.stats import randint, uniform

drop_cols = ["Date", "HighVol_next_day", "Return_next_day", "AbsReturn_next_day", "Return", "Sign"]
X = df.drop(columns=[c for c in drop_cols if c in df.columns])
y = df["HighVol_next_day"]

catboost = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=12,
    verbose=0,
    allow_writing_files=False
)

param_distributions = {
    "depth": randint(3, 7),
    "learning_rate": uniform(0.01, 0.19),
    "iterations": randint(100, 500),
    "l2_leaf_reg": uniform(1, 14),
    "border_count": randint(32, 128),
    "random_strength": uniform(0.5, 5),
    "min_data_in_leaf": randint(5, 30),
    "rsm": uniform(0.6, 0.4),
    "boosting_type": ["Ordered", "Plain"],
    "bootstrap_type": ["Bernoulli"],# , "MVS"],
    "subsample": uniform(0.6, 0.4),
}

tscv = TimeSeriesSplit(n_splits=5)

random_search = RandomizedSearchCV(
    estimator=catboost,
    param_distributions=param_distributions,
    n_iter=40,
    cv=tscv,
    scoring="roc_auc",
    random_state=12,
    n_jobs=-1,
    verbose=2,
    return_train_score=True
)

random_search.fit(X, y)

print(f"Best AUC: {random_search.best_score_:.4f}")
print("Best parameters:")
for p, v in random_search.best_params_.items():
    print(f"  {p}: {v}")


Fitting 5 folds for each of 40 candidates, totalling 200 fits


KeyboardInterrupt: 

In [None]:
q = 0.80  # try 0.70, 0.75, 0.80
tau = df["AbsReturn_next_day"].quantile(q)
df["HighVol_next_day"] = (df["AbsReturn_next_day"] > tau).astype(int)


df["abs_ret_t"] = np.log(df["Price"] / df["Price"].shift(1)).abs()
df["vol_5"] = df["abs_ret_t"].rolling(5).mean()


df = df.dropna(subset=["HighVol_next_day"])
df.head(10)

Unnamed: 0,Date,Price,avg_positive_XLF,avg_neutral_XLF,avg_negative_XLF,n_XLF,sent_index_XLF,emb_0,emb_1,emb_2,...,emb_382,emb_383,is_trading_day,Sign_next_day,Return_next_day,AbsReturn_next_day,no_news,HighVol_next_day,abs_ret_t,vol_5
5,2018-03-27,23.511696,0.208302,0.50128,0.290419,8.0,-0.082117,0.008858,-0.04492,-0.005257,...,0.010091,0.004574,1.0,1.0,0.00184,0.00184,0,0,,
6,2018-03-28,23.554998,0.092945,0.652902,0.254153,8.0,-0.161208,0.0037,-0.01764,0.008789,...,-0.006516,0.024834,1.0,1.0,0.013511,0.013511,0,0,0.00184,
7,2018-03-29,23.875416,0.055242,0.592329,0.352428,5.0,-0.297186,1.3e-05,-0.01325,-0.006634,...,0.012592,0.008806,1.0,-1.0,-0.022374,0.022374,0,1,0.013511,
8,2018-04-02,23.347151,0.046485,0.476633,0.476883,6.0,-0.430398,-0.013063,-0.034718,-0.003257,...,0.016507,-0.010199,1.0,1.0,0.013631,0.013631,0,0,0.022374,
9,2018-04-03,23.667574,0.627429,0.324922,0.047649,3.0,0.579779,0.022106,-0.012358,0.013611,...,-0.007063,0.020872,1.0,1.0,0.010555,0.010555,0,0,0.013631,
10,2018-04-04,23.918718,0.050521,0.906664,0.042815,2.0,0.007707,-0.049135,-0.054272,-0.049306,...,0.025393,-0.024925,1.0,1.0,0.007934,0.007934,0,0,0.010555,0.012382
11,2018-04-05,24.109234,0.273679,0.516308,0.210013,7.0,0.063665,-0.016982,0.025092,-0.005531,...,0.034368,-0.010119,1.0,-1.0,-0.02436,0.02436,0,1,0.007934,0.013601
12,2018-04-06,23.529022,0.33359,0.415331,0.251079,8.0,0.082511,-0.020975,-0.025775,-0.001227,...,0.018157,0.002133,1.0,1.0,0.005506,0.005506,0,0,0.02436,0.015771
13,2018-04-09,23.65892,0.09478,0.605769,0.299451,14.0,-0.204671,-0.001426,-0.021839,-0.016479,...,0.016951,0.010751,1.0,1.0,0.014896,0.014896,0,0,0.005506,0.012397
14,2018-04-10,24.013977,0.214749,0.428031,0.35722,6.0,-0.142471,-0.017991,-0.034595,-0.009132,...,0.056691,-0.015679,1.0,-1.0,-0.011972,0.011972,0,0,0.014896,0.01265


In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from scipy.stats import randint, uniform

drop_cols = ["Date", "HighVol_next_day", "Return_next_day", "AbsReturn_next_day", "Return", "Sign"]
X = df.drop(columns=[c for c in drop_cols if c in df.columns])
y = df["HighVol_next_day"]

catboost = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=12,
    verbose=0,
    allow_writing_files=False
)

param_distributions = {
    "depth": randint(3, 7),
    "learning_rate": uniform(0.01, 0.19),
    "iterations": randint(100, 500),
    "l2_leaf_reg": uniform(1, 14),
    "border_count": randint(32, 128),
    "random_strength": uniform(0.5, 5),
    "min_data_in_leaf": randint(5, 30),
    "rsm": uniform(0.6, 0.4),
    "boosting_type": ["Ordered", "Plain"],
    "bootstrap_type": ["Bernoulli"],# , "MVS"],
    "subsample": uniform(0.6, 0.4),
}

tscv = TimeSeriesSplit(n_splits=5)

random_search = RandomizedSearchCV(
    estimator=catboost,
    param_distributions=param_distributions,
    n_iter=40,
    cv=tscv,
    scoring="roc_auc",
    random_state=12,
    n_jobs=-1,
    verbose=2,
    return_train_score=True
)

random_search.fit(X, y)

print(f"Best AUC: {random_search.best_score_:.4f}")
print("Best parameters:")
for p, v in random_search.best_params_.items():
    print(f"  {p}: {v}")


Fitting 5 folds for each of 40 candidates, totalling 200 fits
Best AUC: 0.6077
Best parameters:
  boosting_type: Plain
  bootstrap_type: Bernoulli
  border_count: 65
  depth: 5
  iterations: 219
  l2_leaf_reg: 7.795283791277936
  learning_rate: 0.15594548927224025
  min_data_in_leaf: 23
  random_strength: 2.780254616013186
  rsm: 0.8252564550066286
  subsample: 0.9205061529136007


In [None]:
q = 0.75  # try 0.70, 0.75, 0.80
tau = df["AbsReturn_next_day"].quantile(q)
df["HighVol_next_day"] = (df["AbsReturn_next_day"] > tau).astype(int)


df["abs_ret_t"] = np.log(df["Price"] / df["Price"].shift(1)).abs()
df["vol_5"] = df["abs_ret_t"].rolling(5).mean()


df = df.dropna(subset=["HighVol_next_day"])
df.head(10)

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from scipy.stats import randint, uniform

drop_cols = ["Date", "HighVol_next_day", "Return_next_day", "AbsReturn_next_day", "Return", "Sign"]
X = df.drop(columns=[c for c in drop_cols if c in df.columns])
y = df["HighVol_next_day"]

catboost = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=12,
    verbose=0,
    allow_writing_files=False
)

param_distributions = {
    "depth": randint(3, 7),
    "learning_rate": uniform(0.01, 0.19),
    "iterations": randint(100, 500),
    "l2_leaf_reg": uniform(1, 14),
    "border_count": randint(32, 128),
    "random_strength": uniform(0.5, 5),
    "min_data_in_leaf": randint(5, 30),
    "rsm": uniform(0.6, 0.4),
    "boosting_type": ["Ordered", "Plain"],
    "bootstrap_type": ["Bernoulli"],# , "MVS"],
    "subsample": uniform(0.6, 0.4),
}

tscv = TimeSeriesSplit(n_splits=5)

random_search = RandomizedSearchCV(
    estimator=catboost,
    param_distributions=param_distributions,
    n_iter=40,
    cv=tscv,
    scoring="roc_auc",
    random_state=12,
    n_jobs=-1,
    verbose=2,
    return_train_score=True
)

random_search.fit(X, y)

print(f"Best AUC: {random_search.best_score_:.4f}")
print("Best parameters:")
for p, v in random_search.best_params_.items():
    print(f"  {p}: {v}")
