In [1]:
import pandas as pd
import numpy as np
import numerapi
import os
import plotly.express as px
import plotly.graph_objects as go
import catboost
import optuna

In [2]:
# Load the training data and collect the feature columns, i.e. every column
# whose name contains "feature_".
training_set = pd.read_parquet("data/numerai_training_data.parquet")
feature_names = [column for column in training_set.columns if "feature_" in column]

In [3]:
# Distinct era labels found in the training data.
eras = training_set["era"].unique()

# Cross-validation layout.
NUM_FOLDS = 5
# Number of eras in each validation fold (floor of an even split).
FOLD_SIZE = len(eras) // NUM_FOLDS
# Eras held out on each side of a validation window. The name is misspelled
# ("embargo"), but it is kept because the split-generation cell reads it.
EMBAGO_SIZE = 64

['0001' '0002' '0003' '0004' '0005' '0006' '0007' '0008' '0009' '0010'
 '0011' '0012' '0013' '0014' '0015' '0016' '0017' '0018' '0019' '0020'
 '0021' '0022' '0023' '0024' '0025' '0026' '0027' '0028' '0029' '0030'
 '0031' '0032' '0033' '0034' '0035' '0036' '0037' '0038' '0039' '0040'
 '0041' '0042' '0043' '0044' '0045' '0046' '0047' '0048' '0049' '0050'
 '0051' '0052' '0053' '0054' '0055' '0056' '0057' '0058' '0059' '0060'
 '0061' '0062' '0063' '0064' '0065' '0066' '0067' '0068' '0069' '0070'
 '0071' '0072' '0073' '0074' '0075' '0076' '0077' '0078' '0079' '0080'
 '0081' '0082' '0083' '0084' '0085' '0086' '0087' '0088' '0089' '0090'
 '0091' '0092' '0093' '0094' '0095' '0096' '0097' '0098' '0099' '0100'
 '0101' '0102' '0103' '0104' '0105' '0106' '0107' '0108' '0109' '0110'
 '0111' '0112' '0113' '0114' '0115' '0116' '0117' '0118' '0119' '0120'
 '0121' '0122' '0123' '0124' '0125' '0126' '0127' '0128' '0129' '0130'
 '0131' '0132' '0133' '0134' '0135' '0136' '0137' '0138' '0139' '0140'
 '0141

In [4]:
# Build the cross-validation layout: one row per era, one column per split,
# each cell labelled "training", "validation" or "embargo".
splits_df = pd.DataFrame({"era": eras}).set_index("era")

# Each split reserves a contiguous window of eras: an embargo band, the
# validation fold, then another embargo band. Windows are slid across the era
# axis so the first starts at era 0 and the last ends at (or just before) the
# final era.
window = FOLD_SIZE + 2 * EMBAGO_SIZE
step_size = (len(eras) - window) // (NUM_FOLDS - 1)

for fold in range(NUM_FOLDS):
    column = f"split_{fold}"
    start = fold * step_size
    end = start + window

    validation_eras = eras[start + EMBAGO_SIZE:end - EMBAGO_SIZE]
    # Everything outside the window is training data. For the first fold
    # eras[:start] is empty, so the concatenation reduces to eras[end:].
    training_eras = np.concatenate([eras[:start], eras[end:]])

    # Start with the whole column marked as embargo, then overwrite the
    # validation and training eras; whatever is left is the embargo band.
    splits_df[column] = "embargo"
    splits_df.loc[validation_eras, column] = "validation"
    splits_df.loc[training_eras, column] = "training"

# Show the resulting era-by-split assignment table.
splits_df


Unnamed: 0_level_0,split_0,split_1,split_2,split_3,split_4
era,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0001,embargo,training,training,training,training
0002,embargo,training,training,training,training
0003,embargo,training,training,training,training
0004,embargo,training,training,training,training
0005,embargo,training,training,training,training
...,...,...,...,...,...
0570,training,training,training,training,embargo
0571,training,training,training,training,embargo
0572,training,training,training,training,embargo
0573,training,training,training,training,embargo


In [5]:
# Render splits_df as a Plotly table with the embargo eras highlighted in red.
from plotly.colors import n_colors  # not used in this cell; kept so the notebook namespace is unchanged

# Names of the split columns, in column order. The tuning cell iterates over
# this list, so it must stay defined here.
splits = [s for s in splits_df.columns.tolist() if "split" in s]

# Columns shown in the table, in the same order as the header labels.
split_columns = ['split_{}'.format(i) for i in range(NUM_FOLDS)]

embargo_color = '#FF0000'
default_color = '#FFFFFF'

# Per-cell fill colours: one list per table column, one entry per row.
# The first list covers the era column, which is never highlighted.
# The colour has to be chosen per value: `"embargo" in series` tests the
# series *index* (the era labels), not its values, so it never matches.
cell_colors = [[default_color] * len(splits_df)] + [
    [embargo_color if role == "embargo" else default_color for role in splits_df[column]]
    for column in split_columns
]

fig = go.Figure(data=[go.Table(
    header=dict(values=["<b>Era</b>"] + ['<b>Split {}</b>'.format(i) for i in range(NUM_FOLDS)]),
    cells=dict(
        values=[eras] + [splits_df[column] for column in split_columns],
        fill_color=cell_colors,
        align='center',
    ))
])

fig.show()


In [30]:
from scipy.stats import gmean

# hyper parameter tuning with optuna
def _mean_era_correlation(params, split):
    """Fit CatBoost on one split's training eras and score its validation eras.

    Reads the notebook-level ``training_set``, ``feature_names`` and
    ``splits_df``.

    Args:
        params: Keyword arguments forwarded to ``catboost.CatBoostRegressor``.
        split: Name of a ``splits_df`` column whose values label every era as
            "training", "validation" or "embargo".

    Returns:
        The mean, over the validation eras, of the per-era correlation
        (``np.corrcoef``) between prediction and target.
    """
    roles = splits_df[split]
    train_eras = roles[roles == "training"].index
    validation_eras = roles[roles == "validation"].index

    # Compute each row mask once and select rows and columns in a single
    # .loc call, so no full-width intermediate copy of the frame is created.
    train_mask = training_set.era.isin(train_eras)
    validation_mask = training_set.era.isin(validation_eras)

    # create a fresh model with the same parameters for each fold
    model = catboost.CatBoostRegressor(**params)
    model.fit(
        X=training_set.loc[train_mask, feature_names],
        y=training_set.loc[train_mask, "target"],
        verbose=False,
    )

    # make predictions on the validation fold, keeping era and target
    # alongside them (the original row index is preserved)
    preds_df = training_set.loc[validation_mask, ["era", "target"]].assign(
        prediction=model.predict(
            training_set.loc[validation_mask, feature_names],
            verbose=False,
        )
    )

    # correlation between prediction and target, computed separately per era
    era_correlations = preds_df.groupby("era").apply(
        lambda era: np.corrcoef(era["prediction"], era["target"])[0, 1]
    )

    # mean across all validation eras
    return era_correlations.mean()


def objective(trial):
    """Optuna objective: cross-validated correlation of a CatBoost regressor.

    Samples ``iterations``, ``learning_rate`` and ``depth``, trains one model
    per split listed in the notebook-level ``splits`` and aggregates the
    per-split mean era correlations.

    Args:
        trial: The ``optuna`` trial used to sample hyper parameters.

    Returns:
        float: The geometric mean of the per-split correlations when all of
        them are positive. Otherwise the lowest per-split correlation, which
        is <= 0 and therefore ranks below every all-positive trial.
    """
    # suggest hyper parameters to try at each iteration
    params = {
        "iterations" : trial.suggest_int("iterations", 100, 2000),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform
        # and samples from the same log-uniform range.
        "learning_rate" : trial.suggest_float("learning_rate", 1e-4, 1, log=True),
        "depth" : trial.suggest_int("depth", 4, 10),
        # NOTE(review): training is pinned to the GPU; presumably this fails
        # on a CPU-only machine - confirm before running elsewhere.
        "task_type" : "GPU",
    }

    # one mean era correlation per cross validation fold
    all_correlations = np.asarray(
        [_mean_era_correlation(params, split) for split in splits],
        dtype=float,
    )

    # some splits tend to have higher correlation than others
    # geometric mean prevents these splits from skewing the results
    if np.all(all_correlations > 0):
        return float(gmean(all_correlations))

    # The geometric mean is undefined for non-positive values (it evaluates to
    # NaN, which Optuna records as a failed trial). Report the worst fold
    # instead so the trial is kept and ranked below all-positive ones.
    # A NaN fold correlation still propagates as NaN here.
    return float(np.min(all_correlations))

# Random-search study over the CatBoost hyper parameters. The sampler is
# seeded, and higher objective values are better.
study = optuna.create_study(
    study_name="catboost_hyper_parameter_tuning",
    sampler=optuna.samplers.RandomSampler(seed=42),
    direction="maximize",
)

# Evaluate 30 sampled configurations.
study.optimize(objective, n_trials=30)

[32m[I 2022-02-20 20:13:35,704][0m A new study created in memory with name: catboost_hyper_parameter_tuning[0m
Custom logger is already specified. Specify more than one logger at same time is not thread safe.[32m[I 2022-02-20 20:15:30,587][0m Trial 0 finished with value: 0.05513383984942133 and parameters: {'iterations': 250, 'learning_rate': 0.07114476009343425, 'depth': 6}. Best is trial 0 with value: 0.05513383984942133.[0m
[32m[I 2022-02-20 20:16:44,432][0m Trial 1 finished with value: 0.027973076262303692 and parameters: {'iterations': 340, 'learning_rate': 0.00029380279387035364, 'depth': 2}. Best is trial 0 with value: 0.05513383984942133.[0m
[32m[I 2022-02-20 20:17:57,492][0m Trial 2 finished with value: 0.05088369664486533 and parameters: {'iterations': 123, 'learning_rate': 0.0396760507705299, 'depth': 5}. Best is trial 0 with value: 0.05513383984942133.[0m
[32m[I 2022-02-20 20:21:02,608][0m Trial 3 finished with value: 0.03859770262580047 and parameters: {'iter

In [31]:
# Report the best trial: its hyper parameters on the first line, the
# objective value it reached on the second.
print(study.best_params, study.best_value, sep="\n")

{'iterations': 250, 'learning_rate': 0.07114476009343425, 'depth': 6}
0.05513383984942133


In [32]:
# Parallel-coordinate figure of the study's trials, built with Optuna's
# visualization helper and displayed inline. Rebinds the notebook-level `fig`.
fig = optuna.visualization.plot_parallel_coordinate(study)
fig.show()

In [33]:
# Hyper parameter importance figure for the study, built with Optuna's
# visualization helper and displayed inline. Rebinds the notebook-level `fig`.
fig = optuna.visualization.plot_param_importances(study)
fig.show()