In [18]:
import catboost
import pandas as pd
import numpy as np
import plotly_express as px
from scipy import stats

# Read the training and validation splits from their parquet files.
trainset = pd.read_parquet("data/numerai_training_data.parquet")
validset = pd.read_parquet("data/numerai_validation_data.parquet")

# Every training column whose name contains "feature_" is a model input.
feature_names = [column for column in trainset.columns if "feature_" in column]

# Hyper-parameters forwarded verbatim to CatBoostRegressor.
params = dict(
    iterations=1000,
    learning_rate=0.01,
    depth=6,
    task_type="GPU",
)

# Fit on the training features/target, then store the model's predictions for
# the validation rows in a new "base_preds" column of `validset`.
model = catboost.CatBoostRegressor(**params)
model.fit(trainset[feature_names], trainset["target"])
validset["base_preds"] = model.predict(validset[feature_names])

0:	learn: 0.2236040	total: 94ms	remaining: 1m 33s
1:	learn: 0.2236000	total: 185ms	remaining: 1m 32s
2:	learn: 0.2235966	total: 283ms	remaining: 1m 34s
3:	learn: 0.2235934	total: 383ms	remaining: 1m 35s
4:	learn: 0.2235906	total: 473ms	remaining: 1m 34s
5:	learn: 0.2235875	total: 566ms	remaining: 1m 33s
6:	learn: 0.2235847	total: 664ms	remaining: 1m 34s
7:	learn: 0.2235819	total: 750ms	remaining: 1m 33s
8:	learn: 0.2235788	total: 837ms	remaining: 1m 32s
9:	learn: 0.2235760	total: 936ms	remaining: 1m 32s
10:	learn: 0.2235731	total: 1.03s	remaining: 1m 32s
11:	learn: 0.2235704	total: 1.13s	remaining: 1m 32s
12:	learn: 0.2235677	total: 1.23s	remaining: 1m 33s
13:	learn: 0.2235652	total: 1.32s	remaining: 1m 33s
14:	learn: 0.2235624	total: 1.42s	remaining: 1m 33s
15:	learn: 0.2235601	total: 1.52s	remaining: 1m 33s
16:	learn: 0.2235577	total: 1.61s	remaining: 1m 33s
17:	learn: 0.2235552	total: 1.7s	remaining: 1m 32s
18:	learn: 0.2235523	total: 1.81s	remaining: 1m 33s
19:	learn: 0.2235499	tot

In [19]:
def get_biggest_change_features(corrs, n):
    """Return the ``n`` columns of ``corrs`` whose mean shifts most between halves.

    The index of ``corrs`` is sorted and cut at its midpoint (with an odd
    number of labels the extra one lands in the second half). Every column is
    averaged over the rows of each half, and the columns are ranked by the
    absolute difference between the two averages, largest first.

    Args:
        corrs: DataFrame whose rows are selected by index label and whose
            columns are the candidates being ranked.
        n: Number of column labels to return.

    Returns:
        A list holding the labels of the ``n`` top-ranked columns.
    """
    ordered_eras = corrs.index.sort_values()
    midpoint = len(ordered_eras) // 2
    first_half, second_half = ordered_eras[:midpoint], ordered_eras[midpoint:]

    # Per-column change in mean from the first half to the second half.
    mean_shift = corrs.loc[second_half, :].mean() - corrs.loc[first_half, :].mean()

    ranked = mean_shift.abs().sort_values(ascending=False)
    return ranked.head(n).index.tolist()
    
def neutralize(df,
               columns,
               neutralizers=None,
               proportion=1.0,
               normalize=True,
               era_col="era"):
    """Neutralize ``columns`` against ``neutralizers`` separately within each era.

    For every distinct value of ``df[era_col]`` the rows of that era are
    taken, each score column is optionally replaced by the normal quantiles
    of its ordinal ranks, ``proportion`` times the least-squares projection of
    the scores onto the neutralizer columns is subtracted, and the result is
    divided by its standard deviation (a single value computed over all score
    columns of that era).

    Args:
        df: DataFrame containing ``era_col``, ``columns`` and ``neutralizers``.
        columns: List of column names holding the scores to neutralize.
        neutralizers: List of column names to neutralize against; ``None``
            (the default) means no neutralizers.
        proportion: Fraction of the projection to remove (1.0 removes all).
        normalize: If true, rank-gaussianize each score column per era first.
        era_col: Name of the column that identifies the era of each row.

    Returns:
        DataFrame with the same index and row order as ``df`` and one column
        per entry of ``columns``. Each row holds the value computed for that
        same row of ``df``, whether or not the rows of an era are contiguous.
        Rows whose era value matches no era (e.g. NaN) are NaN. An empty
        ``df`` yields an empty result.
    """
    if neutralizers is None:
        neutralizers = []

    # Results are written back by row position. Concatenating per-era chunks
    # and relabelling them with df.index would misassign values whenever the
    # rows of an era are not contiguous in df.
    result = np.full((len(df), len(columns)), np.nan, dtype=np.float64)

    era_values = df[era_col]
    for era in era_values.unique():
        in_era = (era_values == era).to_numpy()
        if not in_era.any():
            # e.g. a NaN era: it never compares equal to itself.
            continue
        df_era = df[in_era]

        # Explicit float64 copy: the in-place arithmetic below must neither
        # touch df nor fail on integer or read-only backing arrays.
        scores = df_era[columns].to_numpy(dtype=np.float64, copy=True)
        if normalize:
            for j in range(scores.shape[1]):
                ranks = stats.rankdata(scores[:, j], method='ordinal')
                # (rank - 0.5) / n lies strictly inside (0, 1), so the normal
                # quantile is always finite.
                scores[:, j] = stats.norm.ppf((ranks - .5) / len(ranks))
        exposures = df_era[neutralizers].values

        # pinv(exposures) @ scores are the least-squares coefficients of the
        # scores on the neutralizers; exposures @ coefficients is the part of
        # the scores explained by them.
        scores -= proportion * exposures.dot(
            np.linalg.pinv(exposures.astype(np.float32)).dot(scores.astype(np.float32)))

        scores /= scores.std(ddof=0)

        result[in_era] = scores

    return pd.DataFrame(result,
                        columns=columns,
                        index=df.index)

In [29]:
# Grid to evaluate: how strongly to neutralize (one heatmap row per value) and
# how many of the "riskiest" features to neutralize against (one column each).
neutralization_proportions = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
n_riskiests = [20, 50, 150, 300, 500, 700, 900, len(feature_names)]

# Correlation of every feature with the base predictions, computed per era.
all_feature_corrs = validset.groupby("era").apply(
    lambda era: era[feature_names].corrwith(era["base_preds"])
)

# The riskiest-feature selection depends only on n, not on the neutralization
# proportion, so compute it once per n instead of once per grid cell.
riskiest_by_n = {
    n_riskiest: get_biggest_change_features(all_feature_corrs, n_riskiest)
    for n_riskiest in n_riskiests
}

y_corr_outputs = []
y_sharpe_outputs = []
for neutralization_proportion in neutralization_proportions:
    x_corr_outputs = []
    x_sharpe_outputs = []
    for n_riskiest in n_riskiests:
        riskiest_features = riskiest_by_n[n_riskiest]
        # neutralize() returns a one-column DataFrame; select that column
        # explicitly rather than relying on frame-to-column assignment.
        validset["modified_preds"] = neutralize(
            validset,
            ["base_preds"],
            neutralizers=riskiest_features,
            proportion=neutralization_proportion,
            normalize=True,
        )["base_preds"]

        # Pearson correlation between neutralized predictions and target, per era.
        era_wise_correlations = validset.groupby("era").apply(
            lambda era: np.corrcoef(era["modified_preds"], era["target"])[0, 1]
        )
        mean_corr = era_wise_correlations.mean()
        # Sharpe ratio: mean per-era correlation divided by its standard
        # deviation across eras.
        sharpe_ratio = mean_corr / era_wise_correlations.std()

        x_corr_outputs.append(mean_corr)
        x_sharpe_outputs.append(sharpe_ratio)

    y_corr_outputs.append(x_corr_outputs)
    y_sharpe_outputs.append(x_sharpe_outputs)
        

In [34]:
# Grid results as 2-D arrays: one row per neutralization proportion, one
# column per riskiest-feature count.
corr_outputs = np.array(y_corr_outputs)
sharpe_outputs = np.array(y_sharpe_outputs)

# Heatmap of the mean per-era correlation over the grid. The figure is square
# because width == height.
# `aspect` is documented as 'equal', 'auto' or None; "auto" lets the cells
# fill the axes instead of forcing equal x/y data scales.
# NOTE(review): cell values are not drawn inside the cells. If the installed
# plotly supports `text_auto`, passing `text_auto=True` would add them - confirm
# the version before enabling.
fig = px.imshow(corr_outputs,
                x=n_riskiests,
                y=neutralization_proportions,
                color_continuous_scale=px.colors.sequential.Plasma,
                title="Correlation with Targets",
                labels={"x": "Risky Features", "y": "Neutralization Proportion"},
                width=800,
                height=800,
                aspect="auto",
)
fig.show()

In [36]:
# Heatmap of the Sharpe ratio (mean per-era correlation / its std) over the
# same grid: rows are neutralization proportions, columns are feature counts.
# `aspect` is documented as 'equal', 'auto' or None; "auto" lets the cells
# fill the square (width == height) figure.
fig = px.imshow(sharpe_outputs,
                x=n_riskiests,
                y=neutralization_proportions,
                color_continuous_scale=px.colors.sequential.Plasma,
                title="Sharpe Ratios",
                labels={"x": "Risky Features", "y": "Neutralization Proportion"},
                width=800,
                height=800,
                aspect="auto",
)
fig.show()