In [2]:
import sys
sys.path.extend(['/Users/j4yzer/PycharmProjects/VKR'])
from utils.ml_data_provider import SectoralDataProvider

from catboost import CatBoostRanker, Pool, MetricVisualizer
from copy import deepcopy
import numpy as np
import os
import numpy as np
import pandas as pd

In [3]:
# Load the cached sectoral dataset and turn the per-sector relative returns
# into within-date ranks used as ranking labels/features further down.
data_provider = SectoralDataProvider(cache_path='/Users/j4yzer/PycharmProjects/VKR/data/sectoral_ml')
data: pd.DataFrame = data_provider.load_data()

data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d', utc=False)

# Treat +/-infinity as missing so the dropna() below removes those rows too.
# `np.inf` is used because the `np.Inf` alias was removed in NumPy 2.0.
data = data.replace([-np.inf, np.inf], np.nan)
data = data.dropna()

# Return column -> name of the rank column that replaces it.
rank_columns = {
    'nextPeriodRelativeToSectoralIndexReturn': 'nextPeriodRank',
    'relativeToSectoralIndexReturn': 'rank',
}

data_by_sector = {sector: sector_data for sector, sector_data in data.groupby('sector')}
for sector, sector_data in data_by_sector.items():
    # Keep only dates that have more than 30 rows in this sector.
    sector_data = sector_data.groupby('date').filter(lambda x: len(x) > 30)

    # Keep only tickers that appear on every remaining date of the sector.
    on_every_date = (
        sector_data.groupby('ticker')['date'].transform('nunique') == sector_data['date'].nunique()
    )
    # Explicit copy: the columns below are overwritten, and writing into a
    # boolean-indexed slice would raise SettingWithCopyWarning.
    sector_data = sector_data[on_every_date].copy()

    # Replace each return with its dense ascending rank among the rows that
    # share the same date (smallest return gets rank 1).
    for return_column in rank_columns:
        sector_data[return_column] = (
            sector_data.groupby('date')[return_column]
            .rank(method='dense', ascending=True)
            .astype(int)
        )
    sector_data = sector_data.rename(columns=rank_columns)

    # Re-assigning an existing key while iterating is safe: the dict size does not change.
    data_by_sector[sector] = sector_data

# NOTE: energy_data is the very object stored in data_by_sector['Energy'], so the
# column updates below are visible through the dict entry as well.
energy_data: pd.DataFrame = data_by_sector['Energy']

# Scale both rank columns by their maximum over the whole Energy frame.
energy_data['nextPeriodRank'] = energy_data['nextPeriodRank'] / energy_data['nextPeriodRank'].max()
energy_data['rank'] = energy_data['rank'] / energy_data['rank'].max()

# One query/group id per date: the integer (int64) representation of the timestamp.
energy_data['qid'] = energy_data['date'].astype('int64')

In [4]:
# Chronological split boundaries: rows dated in (train, valid] are used for
# fitting, rows dated in (valid, test] are held out for evaluation.
time_config = {'train': '2000-01-01', 'valid': '2014-01-01', 'test': '2018-01-01'}

in_train_window = (energy_data['date'] > time_config['train']) & (energy_data['date'] <= time_config['valid'])
in_test_window = (energy_data['date'] > time_config['valid']) & (energy_data['date'] <= time_config['test'])
train_energy_data = energy_data[in_train_window]
test_energy_data = energy_data[in_test_window]

# Columns that are identifiers, raw prices, the label or the group id — everything
# else is passed to the model as a feature.
non_feature_columns = ['ticker', 'sector', 'closePrice', 'sectoralIndex', 'nextPeriodRank', 'date', 'qid']

# Labels and group ids are kept as single-column DataFrames (double brackets).
X_train = train_energy_data.drop(columns=non_feature_columns)
y_train = train_energy_data[['nextPeriodRank']]
queries_train = train_energy_data[['qid']]

X_test = test_energy_data.drop(columns=non_feature_columns)
y_test = test_energy_data[['nextPeriodRank']]
queries_test = test_energy_data[['qid']]

In [5]:
# CatBoost pools: features, label and per-row group id for each split.
train = Pool(X_train, label=y_train, group_id=queries_train)
test = Pool(X_test, label=y_test, group_id=queries_test)

# Baseline CatBoostRanker settings; fit_model copies these and layers the
# loss function and any per-run overrides on top.
default_parameters = dict(
    iterations=100,
    custom_metric=['NDCG', 'PFound', 'AverageGain:top=10'],
    verbose=False,
    random_seed=0,
)

# Module-level placeholder; fit_model builds its own local `parameters`,
# so this name is not read by the code in this cell.
parameters = {}
def fit_model(loss_function, additional_params=None, train_pool=train, test_pool=test):
    """Train a CatBoostRanker with the given loss and return the fitted model.

    Parameters
    ----------
    loss_function : str
        CatBoost loss specification. It is also used verbatim as ``train_dir``,
        so each loss writes its training artefacts to its own directory.
    additional_params : optional
        Extra CatBoostRanker parameters applied last via ``dict.update``; they
        override both the defaults and the loss/train_dir entries on key clashes.
    train_pool, test_pool : Pool
        Pools used for fitting and as ``eval_set``. The defaults are the
        module-level ``train`` / ``test`` objects captured when this ``def``
        runs — re-run this cell after rebuilding those pools.

    Returns
    -------
    CatBoostRanker
        The fitted model.
    """
    overrides = {} if additional_params is None else additional_params

    # Work on a deep copy so the shared defaults are never mutated.
    parameters = deepcopy(default_parameters)
    parameters.update(loss_function=loss_function, train_dir=loss_function)
    parameters.update(overrides)

    ranker = CatBoostRanker(**parameters)
    ranker.fit(train_pool, eval_set=test_pool, plot=True)
    return ranker

# Try different models

In [6]:
# StochasticRank optimising PFound; the custom metrics passed here replace the
# default `custom_metric` list for this run.
model_str = fit_model(
    'StochasticRank:metric=PFound',
    additional_params={
        'custom_metric': ['NDCG', 'PrecisionAt:top=10', 'RecallAt:top=10', 'MAP:top=10'],
    },
)
model_str.get_best_score()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

{'learn': {'MAP:top=10': 0.698423772609819,
  'RecallAt:top=10': 0.344149278426326,
  'PrecisionAt:top=10': 0.7906976744186046},
 'validation': {'NDCG:type=Base': 0.853528658765964,
  'PFound': 0.9270071650767306,
  'MAP:top=10': 0.36848462301587304,
  'RecallAt:top=10': 0.2391304347826087,
  'PrecisionAt:top=10': 0.55}}