In [1]:
%load_ext IPython.extensions.autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../..')
from model import FinData
from model import merged_split
from model import CatboostFinModel
from model import train_valid_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import datetime as dt

In [4]:
# def make_features(data : FinData, features_settings : dict):
#     # Attention: inplace modification
#     # TODO Move to FinData class
#     features = list(features_settings.keys())
#     if "shifts_norms" in features:
#         data.insert_shifts_norms(features_settings["shifts_norms"])
#     if "ma" in features:
#         data.insert_rolling_means(features_settings["ma"])
#     if "ema" in features:
#         data.insert_exp_rolling_means(features_settings["ema"])
#     if "boll" in features:
#         data.insert_bollinger()
#     if "rsi" in features:
#         data.insert_rsi()
#     if "hl_diff" in features:
#         data.insert_high_low_diff()
#     if "stoch_osc" in features:
#         data.insert_stochastic_oscillator()
#     if "rand_pred" in features:
#         data.insert_random_prediction()

In [5]:
dfs = ["../../datasets/T_yandex_10min.csv", 
        "../../datasets/Positive_10_min.csv", 
       "../../datasets/Moex_10_min.csv", 
       "../../datasets/Gazprom_10_min.csv", 
       "../../datasets/Nornickel_10_min.csv", 
       "../../datasets/Novatek_10_min.csv", 
       "../../datasets/Phosagro_10_min.csv", 
       "../../datasets/Tatneft_10_min.csv"]

w1 = [1, 2, 3, 4, 5, 6, 7]
w2 = [2, 4, 6, 8, 10, 18, 30]

feature_settings = {
    "shifts_norms" : w1, 
    "ma" : w2, 
    "ema" : w2, 
    "boll" : 1, 
    "rsi" : 1, 
    "hl_diff" : 1, 
    "stoch_ocs" : 1, 
    "rand_pred" : 1,
}

args = {"iterations" : 2000, 
        "depth" : 5, 
        "learning_rate" : 0.01, 
        "use_best_model" : True, 
        "l2_leaf_reg" : 200,
        "loss_function" : 'Logloss', 
        "eval_metric" : 'Logloss', 
        "random_state" : 42,
        "early_stopping_rounds" : 500}

In [6]:
def make_features(data : FinData, features_settings : dict):
    # Attention: inplace modification
    # TODO Move to FinData class
    features = list(features_settings.keys())
    if "shifts_norms" in features:
        data.insert_shifts_norms(features_settings["shifts_norms"])
    if "ma" in features:
        data.insert_rolling_means(features_settings["ma"])
    if "ema" in features:
        data.insert_exp_rolling_means(features_settings["ema"])
    if "boll" in features:
        data.insert_bollinger()
    if "rsi" in features:
        data.insert_rsi()
    if "hl_diff" in features:
        data.insert_high_low_diff()
    if "stoch_osc" in features:
        data.insert_stochastic_oscillator()
    if "rand_pred" in features:
        data.insert_random_prediction()

def calculate_avret(matrix_probs : np.array, matrix_tests : np.array, norm_value):
    idx_max = np.argpartition(matrix_probs[:, :-1], -norm_value, axis=0)[-norm_value:]
    idx_min = np.argpartition(matrix_probs[:, :-1], norm_value, axis=0)[:norm_value]
    values_max_curr, values_max_next = np.take_along_axis(matrix_tests, idx_max, axis=0), np.take_along_axis(matrix_tests, idx_max + 1, axis=0)
    values_min_curr, values_min_next = np.take_along_axis(matrix_tests, idx_min, axis=0), np.take_along_axis(matrix_tests, idx_min + 1, axis=0)
    return (np.sum(values_max_next/values_max_curr))/norm_value - (np.sum(values_min_next/values_min_curr))/norm_value


In [25]:
def test_average_return(dfs_paths, start_period : dt.datetime, feature_settings : dict, args : dict,
                        train_size = 1000, 
                        val_size = 500, 
                        test_size = 500,
                        percentage = 0.2):
    
    n = len(dfs_paths)
    x = int(np.ceil(n * percentage))
    tests = []
    probs = []

    for path in dfs_paths:
        data = FinData(path)
        make_features(data, feature_settings)
        num = data.get_numeric_features()
        cat = data.get_cat_features()
        modified_args = args
        modified_args["cat_features"] = cat
        target = data.target
        X_train, X_val, X_test, y_train, y_val, y_test = train_valid_test_split(data.df, start_period, train_size, val_size, test_size, num, cat, target)
        model = CatboostFinModel(args = args)
        model.set_datasets(X_train, X_val, y_train, y_val)
        model.set_features(num, cat)
        model.fit()
        tests.append(y_test)
        probs.append(model.predict_proba(X_test)[:,1]) 

    matrix_tests = np.array(tests).squeeze(-1)
    matrix_probs = np.array(probs)

    return matrix_tests, matrix_probs

In [26]:
matrix_tests, matrix_probs = test_average_return(dfs, dt.datetime(2024, 10, 1), feature_settings=feature_settings, args=args)

0:	learn: 0.6923822	test: 0.6923911	best: 0.6923911 (0)	total: 163ms	remaining: 5m 25s
1:	learn: 0.6917189	test: 0.6918032	best: 0.6918032 (1)	total: 169ms	remaining: 2m 48s
2:	learn: 0.6912340	test: 0.6912441	best: 0.6912441 (2)	total: 174ms	remaining: 1m 55s
3:	learn: 0.6907992	test: 0.6908723	best: 0.6908723 (3)	total: 180ms	remaining: 1m 29s
4:	learn: 0.6904021	test: 0.6904293	best: 0.6904293 (4)	total: 186ms	remaining: 1m 14s
5:	learn: 0.6898491	test: 0.6899486	best: 0.6899486 (5)	total: 192ms	remaining: 1m 3s
6:	learn: 0.6892559	test: 0.6892686	best: 0.6892686 (6)	total: 203ms	remaining: 57.9s
7:	learn: 0.6888007	test: 0.6887946	best: 0.6887946 (7)	total: 210ms	remaining: 52.2s
8:	learn: 0.6882070	test: 0.6882213	best: 0.6882213 (8)	total: 216ms	remaining: 47.7s
9:	learn: 0.6876006	test: 0.6876690	best: 0.6876690 (9)	total: 221ms	remaining: 44s
10:	learn: 0.6871541	test: 0.6871256	best: 0.6871256 (10)	total: 228ms	remaining: 41.2s
11:	learn: 0.6867938	test: 0.6867393	best: 0.6867

In [30]:
matrix_tests.shape

(8, 500)

In [31]:
matrix_probs

array([[0.2025755 , 0.235041  , 0.52729353, ..., 0.47172629, 0.20250087,
        0.25032477],
       [0.610465  , 0.57357007, 0.38202614, ..., 0.24944997, 0.28926401,
        0.31158584],
       [0.29415592, 0.61519075, 0.31748765, ..., 0.26333705, 0.30091246,
        0.569216  ],
       ...,
       [0.56393875, 0.26257939, 0.30791955, ..., 0.32543094, 0.58895117,
        0.55627414],
       [0.38854037, 0.34656958, 0.47019092, ..., 0.38235336, 0.31075687,
        0.33861717],
       [0.41488122, 0.42305615, 0.46038932, ..., 0.41885409, 0.49051891,
        0.40262624]])

In [33]:
def calculate_avret(matrix_probs : np.array, matrix_tests : np.array, norm_value):
    idx_max = np.argpartition(matrix_probs, -norm_value, axis=0)[-norm_value:]
    idx_min = np.argpartition(matrix_probs, norm_value, axis=0)[:norm_value]
    values_max_curr, values_max_next = np.take_along_axis(matrix_tests, idx_max, axis=0), np.take_along_axis(matrix_tests, idx_max, axis=0)
    values_min_curr, values_min_next = np.take_along_axis(matrix_tests, idx_min, axis=0), np.take_along_axis(matrix_tests, idx_min, axis=0)
    return (np.sum(values_max_next/values_max_curr))/norm_value - (np.sum(values_min_next/values_min_curr))/norm_value

In [34]:
n = len(dfs)
percentage = 0.2
x = int(np.ceil(n * percentage))

calculate_avret(matrix_probs, matrix_tests, x)


TypeError: 'tuple' object is not callable