In [62]:
import pandas as pd
import numpy as np
import numerapi
import os
import plotly.express as px
import plotly.graph_objects as go
import catboost
import optuna

In [63]:
training_set = pd.read_parquet("data/numerai_training_data.parquet")
feature_names = [f for f in training_set.columns if "feature_" in f]

In [64]:
eras = training_set.era.unique()

NUM_FOLDS = 5
FOLD_SIZE = int(len(eras) / NUM_FOLDS)
EMBAGO_SIZE = 64

print(eras)
print(len(eras))

['0001' '0002' '0003' '0004' '0005' '0006' '0007' '0008' '0009' '0010'
 '0011' '0012' '0013' '0014' '0015' '0016' '0017' '0018' '0019' '0020'
 '0021' '0022' '0023' '0024' '0025' '0026' '0027' '0028' '0029' '0030'
 '0031' '0032' '0033' '0034' '0035' '0036' '0037' '0038' '0039' '0040'
 '0041' '0042' '0043' '0044' '0045' '0046' '0047' '0048' '0049' '0050'
 '0051' '0052' '0053' '0054' '0055' '0056' '0057' '0058' '0059' '0060'
 '0061' '0062' '0063' '0064' '0065' '0066' '0067' '0068' '0069' '0070'
 '0071' '0072' '0073' '0074' '0075' '0076' '0077' '0078' '0079' '0080'
 '0081' '0082' '0083' '0084' '0085' '0086' '0087' '0088' '0089' '0090'
 '0091' '0092' '0093' '0094' '0095' '0096' '0097' '0098' '0099' '0100'
 '0101' '0102' '0103' '0104' '0105' '0106' '0107' '0108' '0109' '0110'
 '0111' '0112' '0113' '0114' '0115' '0116' '0117' '0118' '0119' '0120'
 '0121' '0122' '0123' '0124' '0125' '0126' '0127' '0128' '0129' '0130'
 '0131' '0132' '0133' '0134' '0135' '0136' '0137' '0138' '0139' '0140'
 '0141

In [65]:
# generate splits
splits_df = pd.DataFrame({
    'era': eras,
}).set_index("era") # list of tuples of validation and training eras

step_size = (len(eras) - (FOLD_SIZE + 2 * EMBAGO_SIZE)) // (NUM_FOLDS - 1)
for i in range(NUM_FOLDS):
    start = i * step_size
    end = start + FOLD_SIZE + 2 * EMBAGO_SIZE

    
    validation_eras = eras[(start + EMBAGO_SIZE):(end - EMBAGO_SIZE)]

    training_eras = []
    if start == 0:
        training_eras = eras[end:]
    else:
        training_eras = eras[:start]
        training_eras = np.concatenate([training_eras, eras[end:]])

    splits_df['split_{}'.format(i)] = "embargo"
    splits_df.loc[validation_eras, 'split_{}'.format(i)] = "validation"
    splits_df.loc[training_eras, 'split_{}'.format(i)] = "training"

# display splits with plotly table
splits_df


Unnamed: 0_level_0,split_0,split_1,split_2,split_3,split_4
era,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0001,embargo,training,training,training,training
0002,embargo,training,training,training,training
0003,embargo,training,training,training,training
0004,embargo,training,training,training,training
0005,embargo,training,training,training,training
...,...,...,...,...,...
0570,training,training,training,training,embargo
0571,training,training,training,training,embargo
0572,training,training,training,training,embargo
0573,training,training,training,training,embargo


In [69]:
# plot splits_df with plotly
from plotly.colors import n_colors

splits = [s for s in splits_df.columns.tolist() if "split" in s]
fig = go.Figure(data=[go.Table(
    header=dict(values=["<b>Era<b>"] + ['<b>Split {}<b>'.format(i) for i in range(NUM_FOLDS)]),
    cells=dict(values=[eras] + [splits_df['split_{}'.format(i)] for i in range(NUM_FOLDS)],
        # colors=['#FF0000', '#00FF00', '#0000FF'],
        fill_color = [
            '#FF0000' if "embargo" in splits_df['split_{}'.format(i)] else '#FFFFFF' 
            for i in range(NUM_FOLDS)
        ],
        align='center',
    ))
])

fig.show()


In [70]:
from scipy.stats import gmean

# hyper parameter tuning with optuna
def objective(trial):

    # suggest hyper parameters to try at each iteration
    model = catboost.CatBoostRegressor(
        iterations=trial.suggest_int("iterations", 100, 500),
        learning_rate=trial.suggest_loguniform("learning_rate", 1e-8, 1e-1),
        depth=trial.suggest_int("depth", 1, 7),
    )

    all_correlations = []

    # loop over each cross validation fold
    for split in splits:
        train_eras = splits_df.loc[splits_df[split] == "training"].index.tolist()
        validation_eras = splits_df.loc[splits_df[split] == "validation"].index.tolist()

        model.fit(X=training_set[train_eras][feature_names], y=training_set[train_eras]["target"])
        training_set[validation_eras]["prediction"] = model.predict(X=training_set[validation_eras][feature_names])

        era_correlations = training_set[validation_eras].groupby("era").apply(
            lambda era: np.corrcoef(era["prediction"], era["target"])[0, 1]
        )
        mean_correlation = era_correlations.mean()
        all_correlations.append(mean_correlation)

    # some splits tend to have higher correlation than others
    # geometric mean prevents these splits from skewing the results
    geometric_mean_correlation = gmean(all_correlations)

    return geometric_mean_correlation

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=15)

[32m[I 2022-02-09 19:15:28,758][0m A new study created in memory with name: no-name-a7940848-68bf-469b-a566-c306745133f3[0m
[33m[W 2022-02-09 19:15:28,760][0m Trial 0 failed because of the following error: KeyError('era')
Traceback (most recent call last):
  File "/home/jacobstahl/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 3080, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 101, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 4554, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 4562, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'era'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/jacobstahl/.local/lib/python3.8/site-packages/opt

KeyError: 'era'

In [None]:
# best params and values
print(study.best_params)
print(study.best_value)

In [None]:
fig = optuna.visualization.plot_parallel_coordinate(study)
fig.show()

In [None]:
fig = optuna.visualization.plot_param_importances(study)
fig.show()