In [1]:
from catboost import MultiTargetCustomMetric, MultiTargetCustomObjective
import numpy as np


class HundredDollarLoss(MultiTargetCustomObjective):
    def calc_ders_multi(self, approxes, targets, weight):
        """
        Per-object derivatives of -sum(v * s) with respect to the logits x,
        where s_i = exp(x_i) / (1 + sum_j exp(x_j)).

        approxes: logits vector x (num_securities, )
        targets: precomputed v vector (num_securities, )
        weight: data point weight

        Returns a (first derivatives, second derivatives) pair: a vector of
        length num_securities and a square num_securities matrix, both
        multiplied by weight.
        """
        # Shift by max(max(x), 0) so neither exp(x - shift) nor exp(-shift)
        # can overflow; exp(-shift) is the shifted "1" of the denominator.
        shift = np.maximum(np.max(approxes), 0.)
        shifted_exp = np.exp(approxes - shift)
        shares = shifted_exp / (np.sum(shifted_exp) + np.exp(-shift))

        # Gradient of sum(v * s): g_i = v_i * s_i - s_i * sum_j v_j * s_j.
        weighted = targets * shares
        ders1 = weighted - np.sum(weighted) * shares

        # Hessian of sum(v * s): diag(g) - g s^T - s g^T.
        cross = np.outer(ders1, shares)
        ders2 = np.diag(ders1) - cross - cross.T

        return -ders1 * weight, -ders2 * weight
    

class TwoHundredDollarLoss(MultiTargetCustomObjective):
    def calc_ders_multi(self, approxes, targets, weight):
        """
        Per-object derivatives of log(1 - sum(v * s)) with respect to the
        logits x, where s_i = exp(x_i) / (1 + sum_j exp(x_j)).

        approxes: logits vector x (num_securities, )
        targets: precomputed v vector (num_securities, )
        weight: data point weight

        Returns a (first derivatives, second derivatives) pair: a vector of
        length num_securities and a square num_securities matrix, both
        multiplied by weight.
        """
        # Shift by max(max(x), 0) so neither exp(x - shift) nor exp(-shift)
        # can overflow; exp(-shift) is the shifted "1" of the denominator.
        shift = np.maximum(np.max(approxes), 0.)
        shifted_exp = np.exp(approxes - shift)
        shares = shifted_exp / (np.sum(shifted_exp) + np.exp(-shift))

        weighted = targets * shares
        total = np.sum(weighted)
        # Gradient of sum(v * s) divided by (1 - sum(v * s)), i.e. the
        # negated gradient of log(1 - sum(v * s)).
        ders1 = (weighted - total * shares) / (1. - total)

        # Negated Hessian of log(1 - sum(v * s)): g g^T + diag(g) - g s^T - s g^T.
        cross = np.outer(ders1, shares)
        ders2 = np.outer(ders1, ders1) + np.diag(ders1) - cross - cross.T

        return -ders1 * weight, -ders2 * weight
    

class HundredDollarMetric(MultiTargetCustomMetric):
    """Weighted geometric mean of 1 - sum(v * s) over the dataset (higher is better)."""

    def get_final_error(self, error, weight):
        """Turn the accumulated weighted log-sum and total weight into the metric value."""
        return np.exp(error / weight)

    def is_max_optimal(self):
        """The metric is maximized."""
        return True

    def evaluate(self, approxes, targets, weights):
        """
        approxes: tuple of logits vectors x (num_securities, dataset_size)
        targets: tuple of precomputed v vectors (num_securities, dataset_size)
        weights: data point weights array (dataset_size, ) or None

        Returns (weighted sum of log(1 - sum(v * s)) per object, total weight),
        where s_i = exp(x_i) / (1 + sum_j exp(x_j)).
        """
        approxes = np.vstack(approxes)
        # numba doesn't know np.max(approxes, axis=0)...
        m = np.zeros(approxes.shape[1])
        for i in range(approxes.shape[1]):
            # Clamp the shift at 0, as the objectives do: otherwise exp(-m)
            # overflows when every logit of an object is very negative.
            m[i] = np.maximum(np.max(approxes[:, i]), 0.)
        exp_x = np.exp(approxes - m)
        # exp(-m) is the shifted "1" of the denominator 1 + sum_j exp(x_j).
        s_tilde = exp_x / (np.sum(exp_x, axis=0) + np.exp(-m))
        error_sum = np.sum(np.log1p(-np.sum(s_tilde * np.vstack(targets), axis=0)) * (1. if weights is None else weights))
        weight_sum = approxes.shape[1] if weights is None else np.sum(weights)

        return error_sum, weight_sum

In [None]:
from sklearn.model_selection import train_test_split
from catboost import CatBoost, Pool

# Toy example: synthetic features and targets for the custom objectives.
# A fixed seed makes both the generated data and the train/validation split
# reproducible across kernel restarts.
SEED = 0
rng = np.random.default_rng(SEED)

m = 2000  # number of samples
k = 30  # number of features
n = 10  # number of companies
X = rng.standard_normal((m, k))
c = np.exp(X[:, :n])  # built from the first n features (presumably close prices -- TODO confirm)
o = np.exp(X[:, -n:])  # built from the last n features (presumably open prices -- TODO confirm)
t = 4e-3  # presumably a fee rate -- TODO confirm
# Precomputed v vectors used as the multi-target labels.
v = (1 + t) - (1 - t) * (c / o)

X_train, X_val, v_train, v_val = train_test_split(X, v, test_size=.1, random_state=SEED)

In [3]:
# Train with the HundredDollarLoss objective and monitor HundredDollarMetric
# on the validation pool. learning_rate is above 1, which is what triggers
# CatBoost's "learning rate is greater than 1" warning.
model = CatBoost({
    'iterations': 1000,
    'learning_rate': 3.,
    'depth': 5,
    'loss_function': HundredDollarLoss(),
    'verbose': 100,
    'eval_metric': HundredDollarMetric(),
    'l2_leaf_reg': 100,
})

model.fit(
    X_train,
    v_train,
    eval_set=Pool(X_val, v_val),
)

learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.


0:	learn: 2.2954022	test: 2.1627759	best: 2.1627759 (0)	total: 1.38s	remaining: 23m 2s
100:	learn: 6.7907700	test: 5.6628679	best: 5.6631621 (99)	total: 4.3s	remaining: 38.3s
200:	learn: 7.6825929	test: 6.4984264	best: 6.5009521 (198)	total: 7.17s	remaining: 28.5s
300:	learn: 7.9952952	test: 6.6932608	best: 6.6932608 (300)	total: 9.99s	remaining: 23.2s
400:	learn: 8.1487283	test: 6.8039011	best: 6.8039011 (400)	total: 12.9s	remaining: 19.2s
500:	learn: 8.2227896	test: 6.8330758	best: 6.8406638 (486)	total: 15.7s	remaining: 15.7s
600:	learn: 8.2882753	test: 6.8526248	best: 6.8555749 (582)	total: 18.6s	remaining: 12.3s
700:	learn: 8.3338089	test: 6.8742759	best: 6.8742759 (700)	total: 21.4s	remaining: 9.12s
800:	learn: 8.3739356	test: 6.8974659	best: 6.8974981 (789)	total: 24.2s	remaining: 6s
900:	learn: 8.4121239	test: 6.9268035	best: 6.9286633 (894)	total: 27s	remaining: 2.96s
999:	learn: 8.4431976	test: 6.9456962	best: 6.9456962 (999)	total: 29.9s	remaining: 0us

bestTest = 6.94569615

<catboost.core.CatBoost at 0x1069c15b0>

In [4]:
# Train with the TwoHundredDollarLoss objective and monitor HundredDollarMetric
# on the validation pool. learning_rate is above 1, which is what triggers
# CatBoost's "learning rate is greater than 1" warning.
model = CatBoost({
    'iterations': 1000,
    'learning_rate': 3.,
    'depth': 5,
    'loss_function': TwoHundredDollarLoss(),
    'verbose': 100,
    'eval_metric': HundredDollarMetric(),
    'l2_leaf_reg': 50,
})

model.fit(
    X_train,
    v_train,
    eval_set=Pool(X_val, v_val),
)

learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.


0:	learn: 2.4416481	test: 2.2628869	best: 2.2628869 (0)	total: 556ms	remaining: 9m 15s
100:	learn: 7.5465752	test: 6.0565079	best: 6.0565079 (100)	total: 3.45s	remaining: 30.7s
200:	learn: 8.0788778	test: 6.5203030	best: 6.5203030 (200)	total: 6.3s	remaining: 25s
300:	learn: 8.2462878	test: 6.6317239	best: 6.6338972 (299)	total: 9.19s	remaining: 21.3s
400:	learn: 8.3537230	test: 6.7192960	best: 6.7192960 (400)	total: 12s	remaining: 18s
500:	learn: 8.4312422	test: 6.7850033	best: 6.7850033 (500)	total: 14.9s	remaining: 14.8s
600:	learn: 8.4822515	test: 6.8476646	best: 6.8476646 (600)	total: 17.7s	remaining: 11.8s
700:	learn: 8.5239605	test: 6.9024472	best: 6.9029902 (697)	total: 20.6s	remaining: 8.77s
800:	learn: 8.5570660	test: 6.9437441	best: 6.9437441 (800)	total: 23.4s	remaining: 5.81s
900:	learn: 8.5789611	test: 6.9708547	best: 6.9710909 (899)	total: 26.3s	remaining: 2.88s
999:	learn: 8.5973774	test: 6.9917425	best: 6.9919524 (998)	total: 29.1s	remaining: 0us

bestTest = 6.99195239

<catboost.core.CatBoost at 0x165b33020>

In [None]:
import pandas as pd
from datetime import datetime
from warnings import simplefilter
# Columns are inserted one at a time below, which makes pandas emit
# PerformanceWarning; silence it for this notebook.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

names = ['Tatneft', 'Bashneft', 'Gazprom', 'Lukoil', 'Novatek', 'Rosneft']
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 12, 31)

# Per-candle ratios, written as "numerator/denominator" over the price columns.
feats = ['high/close', 'low/close', 'open/close', 'high/low', 'high/open', 'low/open']
# Lags and rolling-window lengths, in 10-minute bars (must stay in increasing order).
shifts = [1, 2, 3, 4, 5, 6, 9, 12, 18, 24]

dfs = []

for name in names:
    # Load one company's candles and put them on a regular tz-naive 10-minute grid.
    df = pd.read_csv(f"../../datasets/{name}_10_min.csv")
    df['utc'] = pd.to_datetime(df['utc']).dt.tz_localize(None)
    df = (
        df[df['utc'] >= start_date]
        .set_index('utc')
        .reindex(pd.date_range(start=start_date, end=end_date, freq='10Min'))
    )

    # Fill the bars created by the reindex: carry the last close forward (only
    # between observed values), use that close for the other prices, zero volume.
    df['close'] = df['close'].ffill(limit_area='inside')
    for price_col in ('open', 'high', 'low'):
        df[price_col] = df[price_col].fillna(df['close'])
    df['volume'] = df['volume'].fillna(0)

    close = df['close']
    volume = df['volume']

    # Ratios for the current candle.
    for ratio in feats:
        numerator, denominator = ratio.split('/')
        df[ratio] = df[numerator] / df[denominator]

    for pos, lag in enumerate(shifts):

        # Pairwise ratios of lagged closes: current close and every shorter lag vs. this lag.
        df[f'close/close>{lag}'] = close / close.shift(lag)
        for shorter in shifts[:pos]:
            df[f'close>{shorter}/close>{lag}'] = close.shift(shorter) / close.shift(lag)

        # Lagged copies of every per-candle ratio.
        for ratio in feats:
            df[f'({ratio})>{lag}'] = df[ratio].shift(lag)

        # Current value relative to the moving average of the previous `lag` bars.
        df[f'close/close@{lag}'] = close / close.shift(1).rolling(lag).mean()
        df[f'volume/volume@{lag}'] = volume / volume.shift(1).rolling(lag).mean()

    # Tag every column with the company name so the frames can sit side by side.
    df.columns = [f'{col}[{name}]' for col in df.columns]
    dfs.append(df)

df = pd.concat(dfs, axis=1)
df = df.dropna()

# Add pairwise cross-company features: differences of selected features.
diff_feats = ['close/close>1', 'high/low', 'high/close']
company_pairs = [(first, second) for first in names for second in names if first < second]
for first, second in company_pairs:
    for base in diff_feats:
        left, right = f'{base}[{first}]', f'{base}[{second}]'
        df[f'{left}-{right}'] = df[left] - df[right]

df.columns.to_list()

['open[Tatneft]',
 'close[Tatneft]',
 'high[Tatneft]',
 'low[Tatneft]',
 'volume[Tatneft]',
 'high/close[Tatneft]',
 'low/close[Tatneft]',
 'open/close[Tatneft]',
 'high/low[Tatneft]',
 'high/open[Tatneft]',
 'low/open[Tatneft]',
 'close/close>1[Tatneft]',
 '(high/close)>1[Tatneft]',
 '(low/close)>1[Tatneft]',
 '(open/close)>1[Tatneft]',
 '(high/low)>1[Tatneft]',
 '(high/open)>1[Tatneft]',
 '(low/open)>1[Tatneft]',
 'close/close@1[Tatneft]',
 'volume/volume@1[Tatneft]',
 'close/close>2[Tatneft]',
 'close>1/close>2[Tatneft]',
 '(high/close)>2[Tatneft]',
 '(low/close)>2[Tatneft]',
 '(open/close)>2[Tatneft]',
 '(high/low)>2[Tatneft]',
 '(high/open)>2[Tatneft]',
 '(low/open)>2[Tatneft]',
 'close/close@2[Tatneft]',
 'volume/volume@2[Tatneft]',
 'close/close>3[Tatneft]',
 'close>1/close>3[Tatneft]',
 'close>2/close>3[Tatneft]',
 '(high/close)>3[Tatneft]',
 '(low/close)>3[Tatneft]',
 '(open/close)>3[Tatneft]',
 '(high/low)>3[Tatneft]',
 '(high/open)>3[Tatneft]',
 '(low/open)>3[Tatneft]',
 'cl