In [2]:
%load_ext IPython.extensions.autoreload
%autoreload 2

In [3]:
import sys
sys.path.append('../..')
from model import FinData
from model import train_valid_split
from model import CatboostFinModel

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import datetime as dt

Let's start with basic Regularization

In [6]:
data = FinData("../../datasets/T_yandex_10min.csv")

data.insert_shifts_norms()
data.insert_rolling_means()
data.insert_exp_rolling_means()
data.insert_stochastic_oscillator()
data.insert_high_low_diff()
data.insert_holidays()
data.insert_rsi()
data.insert_bollinger()
data.insert_random_prediction()

data.restrict_time_down(months=6)

X_train, X_val, y_train, y_val = train_valid_split(data=data.df, 
                                                   year=2024, month=12, day=5, 
                                                   numeric=data.numeric_features, cat=data.cat_features, target="direction_binary")

In [22]:
numeric = data.numeric_features

cat = data.cat_features

args = {"iterations" : 5000, 
        "depth" : 5, 
        "learning_rate" : 0.01, # тут слегка неадекватные параметры, которые можно менять 
        "use_best_model" : True, 
        # "l2_leaf_reg" : 200,
        "loss_function" : 'CrossEntropy', 
        "eval_metric" : 'Accuracy', 
        "cat_features" : cat, 
        "random_state" : 42,
        "early_stopping_rounds" : 5000}


model = CatboostFinModel(args)

model.set_datasets(X_train, X_val, y_train, y_val)
model.set_features(numeric, cat)

model.fit()

args2 = {**args, "reg_lambda":5} # Параметр регуляризации L2


model2 = CatboostFinModel(args2)

model2.set_datasets(X_train, X_val, y_train, y_val)
model2.set_features(numeric, cat)
model2.fit()

args3 = {**args, "reg_lambda":0.1} # Параметр регуляризации L1


model3 = CatboostFinModel(args3)

model3.set_datasets(X_train, X_val, y_train, y_val)
model3.set_features(numeric, cat)
model3.fit()

args4 = {**args, "l2_leaf_reg": 200} # Параметр регуляризации L1


model4 = CatboostFinModel(args4)

model4.set_datasets(X_train, X_val, y_train, y_val)
model4.set_features(numeric, cat)
model4.fit()

0:	learn: 0.5983831	test: 0.5909091	best: 0.5909091 (0)	total: 22.6ms	remaining: 1m 52s
1:	learn: 0.5985315	test: 0.6010101	best: 0.6010101 (1)	total: 44.2ms	remaining: 1m 50s
2:	learn: 0.5990507	test: 0.6060606	best: 0.6060606 (2)	total: 66.8ms	remaining: 1m 51s
3:	learn: 0.5977156	test: 0.6161616	best: 0.6161616 (3)	total: 87.9ms	remaining: 1m 49s
4:	learn: 0.5966773	test: 0.6212121	best: 0.6212121 (4)	total: 113ms	remaining: 1m 52s
5:	learn: 0.5991248	test: 0.6060606	best: 0.6212121 (4)	total: 132ms	remaining: 1m 50s
6:	learn: 0.6009048	test: 0.6161616	best: 0.6212121 (4)	total: 156ms	remaining: 1m 51s
7:	learn: 0.6001632	test: 0.6060606	best: 0.6212121 (4)	total: 179ms	remaining: 1m 51s
8:	learn: 0.5991248	test: 0.6212121	best: 0.6212121 (4)	total: 202ms	remaining: 1m 51s
9:	learn: 0.5980865	test: 0.6262626	best: 0.6262626 (9)	total: 220ms	remaining: 1m 49s
10:	learn: 0.5971223	test: 0.6161616	best: 0.6262626 (9)	total: 241ms	remaining: 1m 49s
11:	learn: 0.5980865	test: 0.6111111	b

<model.model.CatboostFinModel at 0x2066b1c0830>

In [23]:
print(f"Score without R2: {model.score(X_val, y_val)}")
print(f"Score with L2: {model2.score(X_val, y_val)}")
print(f"Score with L1: {model3.score(X_val, y_val)}")
print(f"Score wuth l2_leaf_reg: {model4.score(X_val, y_val)}")

Score without R2: 0.6262626262626263
Score with L2: 0.6161616161616161
Score with L1: 0.6313131313131313
Score wuth l2_leaf_reg: 0.6212121212121212


### Backward Elimination

In [27]:
import numpy as np
from catboost import CatBoostClassifier
from sklearn.feature_selection import RFE

# Инициализация CatBoostClassifier
catboost_classifier = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    l2_leaf_reg=3,
    random_state=42
)

X = data.df.drop(columns = ['direction_binary', 'utc'])
y = data.df['direction_binary']

# Обучение модели CatBoost
catboost_classifier.fit(X, y)

# Применение обратного исключения признаков (RFE)
n_features_to_select = 2
rfe = RFE(catboost_classifier, n_features_to_select=2)
X_reduced = rfe.fit_transform(X.values, y.values)

# Выбранные признаки
selected_features = np.array(X_reduced.T).flatten().tolist()
print("Выбранные признаки:", selected_features)

# Оценка важности признаков
feature_importances = catboost_classifier.get_feature_importance(prettified=True)
print("Важность признаков:")
print(feature_importances)

0:	learn: 0.6862894	total: 17ms	remaining: 8.46s
1:	learn: 0.6812371	total: 31.4ms	remaining: 7.83s
2:	learn: 0.6762045	total: 47.4ms	remaining: 7.85s
3:	learn: 0.6724948	total: 60.8ms	remaining: 7.54s
4:	learn: 0.6692879	total: 74.5ms	remaining: 7.37s
5:	learn: 0.6666224	total: 87.6ms	remaining: 7.21s
6:	learn: 0.6642659	total: 101ms	remaining: 7.12s
7:	learn: 0.6622290	total: 114ms	remaining: 7.01s
8:	learn: 0.6601018	total: 128ms	remaining: 6.97s
9:	learn: 0.6585711	total: 142ms	remaining: 6.95s
10:	learn: 0.6568624	total: 154ms	remaining: 6.86s
11:	learn: 0.6555167	total: 168ms	remaining: 6.82s
12:	learn: 0.6545305	total: 181ms	remaining: 6.78s
13:	learn: 0.6533161	total: 194ms	remaining: 6.73s
14:	learn: 0.6523584	total: 207ms	remaining: 6.68s
15:	learn: 0.6513615	total: 219ms	remaining: 6.63s
16:	learn: 0.6506776	total: 232ms	remaining: 6.6s
17:	learn: 0.6499173	total: 247ms	remaining: 6.62s
18:	learn: 0.6488176	total: 260ms	remaining: 6.58s
19:	learn: 0.6476205	total: 273ms	rema

In [30]:
feature_importances

Unnamed: 0,Feature Id,Importances
0,volume,5.191958
1,high_normed_ma_3,3.086723
2,high_norms_3,2.794427
3,close_normed_rsi_6,2.789395
4,low_normed_ma_3,2.713750
...,...,...
63,close_norms_18,0.291511
64,high,0.284299
65,pred_holiday,0.096639
66,is_holiday,0.000000


### Forward Elimination

In [1]:
import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector

selector = SequentialFeatureSelector(
    catboost_classifier, n_features_to_select=5, scoring='accuracy')
 
# Fit the selector to the data
selector.fit(X, y)
 
# Get the selected features
selected_features = selector.get_support()
 
print('The selected features are:', list(X.columns[selected_features]))

NameError: name 'catboost_classifier' is not defined