In [91]:
import sys
sys.path.append("../src")
import os

import pandas as pd
from etf_transformations import *
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from scipy.stats import randint, uniform, loguniform

In [92]:
# Load the preprocessed XLE data and run it through the two project helpers.
# NOTE(review): sign_next_day / drop_sign_and_return are not defined in this notebook
# (presumably they come from the wildcard etf_transformations import); judging by the
# resulting columns they add `Sign_next_day` and drop the raw sign/return columns — confirm.
df = (
    pd.read_csv("../data/preprocessed/final_etf_data/no_weekends_no_embedding/XLE_v1.csv")
    .pipe(sign_next_day)
    .pipe(drop_sign_and_return)
)


In [93]:
# Preview the first 10 rows to sanity-check the loaded columns and values.
df.head(10)

Unnamed: 0,Date,Price,avg_positive_XLE,avg_neutral_XLE,avg_negative_XLE,n_XLE,sent_index_XLE,Sign_next_day
0,2018-03-20,47.900124,0.617495,0.364768,0.017737,3.0,0.599758,1.0
1,2018-03-21,49.154427,0.072957,0.896156,0.030887,2.0,0.042071,-1.0
2,2018-03-22,48.150974,0.088483,0.875179,0.036338,2.0,0.052145,-1.0
3,2018-03-23,47.857117,0.427724,0.518749,0.053528,2.0,0.374196,1.0
4,2018-03-26,48.717216,0.031437,0.548499,0.420064,2.0,-0.388626,-1.0
5,2018-03-27,48.258488,0.103827,0.877021,0.019152,2.0,0.084675,-1.0
6,2018-03-28,47.319557,,,,,,1.0
7,2018-03-29,48.315845,0.015619,0.060177,0.924203,2.0,-0.908584,-1.0
8,2018-04-02,47.348217,0.443142,0.375717,0.181141,6.0,0.262,1.0
9,2018-04-03,48.358829,0.866997,0.101776,0.031226,2.0,0.835771,-1.0


In [94]:
# Column dtypes and non-null counts (shows which columns contain missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587 entries, 0 to 586
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Date              587 non-null    object 
 1   Price             587 non-null    float64
 2   avg_positive_XLE  563 non-null    float64
 3   avg_neutral_XLE   563 non-null    float64
 4   avg_negative_XLE  563 non-null    float64
 5   n_XLE             563 non-null    float64
 6   sent_index_XLE    563 non-null    float64
 7   Sign_next_day     586 non-null    float64
dtypes: float64(7), object(1)
memory usage: 36.8+ KB


In [95]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,Price,avg_positive_XLE,avg_neutral_XLE,avg_negative_XLE,n_XLE,sent_index_XLE,Sign_next_day
count,587.0,563.0,563.0,563.0,563.0,563.0,586.0
mean,45.020384,0.268008,0.285343,0.446649,4.348135,-0.178641,0.02901
std,8.338886,0.203307,0.189575,0.251328,3.531698,0.416005,0.996152
min,18.770016,0.009654,0.01332,0.008611,1.0,-0.946141,-1.0
25%,43.111204,0.101534,0.125518,0.277982,2.0,-0.47158,-1.0
50%,46.764362,0.23251,0.278635,0.436959,3.0,-0.183974,1.0
75%,49.901524,0.387973,0.407064,0.625236,5.0,0.064459,1.0
max,56.558403,0.952804,0.920443,0.962437,28.0,0.935071,1.0


In [96]:
# Number of missing values per column.
df.isna().sum()

Date                 0
Price                0
avg_positive_XLE    24
avg_neutral_XLE     24
avg_negative_XLE    24
n_XLE               24
sent_index_XLE      24
Sign_next_day        1
dtype: int64

In [97]:
# Distinct (non-NaN) values per column; used here to see how many values the target takes.
df.nunique()

Date                587
Price               571
avg_positive_XLE    563
avg_neutral_XLE     563
avg_negative_XLE    563
n_XLE                23
sent_index_XLE      563
Sign_next_day         3
dtype: int64

### Defining the target - I want to assign:

class 1 to `Sign_next_day == 1`

and class 0 to `Sign_next_day` ∈ {0, -1}

In [98]:
# Rows without a next-day sign cannot be labelled, so drop them first; then collapse
# the target to binary: 1 when the next-day sign is exactly 1, otherwise 0.
df = (
    df
    .dropna(subset=["Sign_next_day"])
    .assign(Sign_next_day=lambda frame: frame["Sign_next_day"].eq(1).astype(int))
)
df.head()

Unnamed: 0,Date,Price,avg_positive_XLE,avg_neutral_XLE,avg_negative_XLE,n_XLE,sent_index_XLE,Sign_next_day
0,2018-03-20,47.900124,0.617495,0.364768,0.017737,3.0,0.599758,1
1,2018-03-21,49.154427,0.072957,0.896156,0.030887,2.0,0.042071,0
2,2018-03-22,48.150974,0.088483,0.875179,0.036338,2.0,0.052145,0
3,2018-03-23,47.857117,0.427724,0.518749,0.053528,2.0,0.374196,1
4,2018-03-26,48.717216,0.031437,0.548499,0.420064,2.0,-0.388626,0


In [99]:
# Row and column count after removing the rows that had no target value.
df.shape

(586, 8)

In [100]:
# Re-check missing values per column now that unlabeled rows have been dropped.
df.isna().sum()

Date                 0
Price                0
avg_positive_XLE    24
avg_neutral_XLE     24
avg_negative_XLE    24
n_XLE               24
sent_index_XLE      24
Sign_next_day        0
dtype: int64

# Model training (FINALLY!)

Throughout the process of writing all this code I've been considering which models to use. For now I've decided to try Logistic Regression as my baseline, since it's standard in finance and interpretable, but it will most likely be outperformed by other models.

Another model that I want to try as my "main" model is CatBoost, since it handles missing values natively, is strong on small datasets and, in my experience, usually beats other tree-boosting models, at least on smaller datasets.

The last model I might try is LightGBM, since it's very fast, also handles missing values and I think it will be interesting to compare an alternative gradient boosting technique with a different strategy for tree growth.


I initially also wanted to try Random Forest, but I think my datasets are too small and it will not be as interesting to explain theoretically. 

Another model I will also omit for now is SVM. I thought it would be good, especially with an RBF kernel, since SVM is well suited to binary classification and does well with this data size. However, I am worried about scaling and handling missing values, and it will most likely be very hard to tune properly, so giving up one of the boosting models in favour of SVM might be a mistake. I think it will be very interesting to see if tree structures improve predictive performance over other models (in the context of sentiment-based prediction in finance, at least :) ).

## CatBoost

In [101]:
# Work on an independent (deep) copy so CatBoost-specific columns are not added to `df`.
df_catboost = df.copy(deep=True)
df_catboost.head(5)

Unnamed: 0,Date,Price,avg_positive_XLE,avg_neutral_XLE,avg_negative_XLE,n_XLE,sent_index_XLE,Sign_next_day
0,2018-03-20,47.900124,0.617495,0.364768,0.017737,3.0,0.599758,1
1,2018-03-21,49.154427,0.072957,0.896156,0.030887,2.0,0.042071,0
2,2018-03-22,48.150974,0.088483,0.875179,0.036338,2.0,0.052145,0
3,2018-03-23,47.857117,0.427724,0.518749,0.053528,2.0,0.374196,1
4,2018-03-26,48.717216,0.031437,0.548499,0.420064,2.0,-0.388626,0


In [102]:
# Parse the dates (time-of-day stripped), order the rows chronologically, and add a
# flag for days where there were no sector specific news (i.e. n_XLE is missing).
df_catboost = (
    df_catboost
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]).dt.normalize())
    .sort_values("Date")
    .reset_index(drop=True)
    .assign(no_news=lambda frame: frame["n_XLE"].isna().astype(int))
)
df_catboost.head(10)


Unnamed: 0,Date,Price,avg_positive_XLE,avg_neutral_XLE,avg_negative_XLE,n_XLE,sent_index_XLE,Sign_next_day,no_news
0,2018-03-20,47.900124,0.617495,0.364768,0.017737,3.0,0.599758,1,0
1,2018-03-21,49.154427,0.072957,0.896156,0.030887,2.0,0.042071,0,0
2,2018-03-22,48.150974,0.088483,0.875179,0.036338,2.0,0.052145,0,0
3,2018-03-23,47.857117,0.427724,0.518749,0.053528,2.0,0.374196,1,0
4,2018-03-26,48.717216,0.031437,0.548499,0.420064,2.0,-0.388626,0,0
5,2018-03-27,48.258488,0.103827,0.877021,0.019152,2.0,0.084675,0,0
6,2018-03-28,47.319557,,,,,,1,1
7,2018-03-29,48.315845,0.015619,0.060177,0.924203,2.0,-0.908584,0,0
8,2018-04-02,47.348217,0.443142,0.375717,0.181141,6.0,0.262,1,0
9,2018-04-03,48.358829,0.866997,0.101776,0.031226,2.0,0.835771,0,0


In [103]:
# List the available columns before choosing the model features.
df_catboost.columns

Index(['Date', 'Price', 'avg_positive_XLE', 'avg_neutral_XLE',
       'avg_negative_XLE', 'n_XLE', 'sent_index_XLE', 'Sign_next_day',
       'no_news'],
      dtype='object')

In [104]:
# The date is not used as a model input, so remove it from the modelling frame.
df_catboost = df_catboost.drop("Date", axis=1)
df_catboost.head(1)

Unnamed: 0,Price,avg_positive_XLE,avg_neutral_XLE,avg_negative_XLE,n_XLE,sent_index_XLE,Sign_next_day,no_news
0,47.900124,0.617495,0.364768,0.017737,3.0,0.599758,1,0


In [105]:
# Use every column except the date (and the target) as a feature: each trading day is
# treated as a separate event, so the time index itself is not given to the model.
features = [
    'Price',
    'avg_positive_XLE',
    'avg_neutral_XLE',
    'avg_negative_XLE',
    'n_XLE',
    'sent_index_XLE',
    'no_news',
]

X = df_catboost[features]
y = df_catboost["Sign_next_day"]

# Stratified 80/20 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=12,
)



In [106]:
# Make sure the directory passed to CatBoost as train_dir exists (no error if it already does).
Path("cb_tmp").mkdir(parents=True, exist_ok=True)

In [107]:
%%time

catboost = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=12,
    verbose=0,
    train_dir="cb_tmp",
    allow_writing_files=False
)


param_distributions = {
    'depth': randint(2, 8),
    'learning_rate': uniform(0.01, 0.19),
    'iterations': randint(50, 500),
    'l2_leaf_reg': uniform(1, 14),
    #'bagging_temperature': uniform(0, 1),
    'border_count': randint(32, 128),
    'random_strength': uniform(0, 5),
    'subsample': uniform(0.6, 0.4),
    'min_data_in_leaf': randint(5, 30),
    'rsm': uniform(0.6, 0.4),
    'boosting_type': ['Ordered', 'Plain'],
    'bootstrap_type': ['Bernoulli', 'MVS']
}


random_search = RandomizedSearchCV(
    estimator=catboost,
    param_distributions=param_distributions,
    n_iter=50,
    cv=5,
    scoring='roc_auc',
    random_state=12,
    n_jobs=-1,
    verbose=2,
    return_train_score=True
)


random_search.fit(X, y)
print(f"Best AUC: {random_search.best_score_:.4f}")
print(f"Best parameters: {random_search.best_params_}")
best_model = random_search.best_estimator_

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best AUC: 0.5132
Best parameters: {'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'border_count': 73, 'depth': 3, 'iterations': 163, 'l2_leaf_reg': 4.851120967884677, 'learning_rate': 0.014000924908758702, 'min_data_in_leaf': 26, 'random_strength': 2.3675168170099496, 'rsm': 0.9522201207310926, 'subsample': 0.6682795214886066}
CPU times: total: 1.77 s
Wall time: 2min 27s
