In [1]:
#install numerapi
!pip install --upgrade pip
!pip install --upgrade numerapi

Collecting pip
  Downloading pip-22.0.3-py3-none-any.whl (2.1 MB)
     |████████████████████████████████| 2.1 MB 1.4 MB/s eta 0:00:01
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.0.1
    Uninstalling pip-21.0.1:
      Successfully uninstalled pip-21.0.1
Successfully installed pip-22.0.3
Collecting numerapi
  Downloading numerapi-2.9.4-py3-none-any.whl (26 kB)
Installing collected packages: numerapi
Successfully installed numerapi-2.9.4


In [3]:
import pandas as pd
import numpy as np
import numerapi
import os

# Make sure the local "data" directory (relative to the working directory)
# exists. exist_ok=True avoids the check-then-create race of
# `if not os.path.exists(...): os.mkdir(...)` and keeps the cell re-runnable.
os.makedirs("data", exist_ok=True)

# download data using numerapi
# https://pypi.org/project/numerapi/
napi = numerapi.NumerAPI(verbosity="info")
napi.download_dataset("numerai_training_data.parquet", "data/numerai_training_data.parquet")


2022-02-04 14:56:46,387 INFO numerapi.utils: starting download


In [4]:
# Read the downloaded training parquet file into a DataFrame.
training_path = "data/numerai_training_data.parquet"
training_set = pd.read_parquet(training_path)


data/numerai_training_data.parquet: 1.01GB [04:48, 3.51MB/s]                            


In [5]:
# Preview the first five rows of the training data.
training_set.head(n=5)

Unnamed: 0_level_0,era,data_type,feature_dichasial_hammier_spawner,feature_rheumy_epistemic_prancer,feature_pert_performative_hormuz,feature_hillier_unpitied_theobromine,feature_perigean_bewitching_thruster,feature_renegade_undomestic_milord,feature_koranic_rude_corf,feature_demisable_expiring_millepede,...,target_paul_20,target_paul_60,target_george_20,target_george_60,target_william_20,target_william_60,target_arthur_20,target_arthur_60,target_thomas_20,target_thomas_60
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n003bba8a98662e4,1,train,1.0,0.5,1.0,1.0,0.0,0.0,1.0,1.0,...,0.25,0.25,0.25,0.0,0.166667,0.0,0.166667,0.0,0.166667,0.0
n003bee128c2fcfc,1,train,0.5,1.0,0.25,0.75,0.0,0.75,0.5,0.75,...,1.0,1.0,1.0,1.0,0.833333,0.666667,0.833333,0.666667,0.833333,0.666667
n0048ac83aff7194,1,train,0.5,0.25,0.75,0.0,0.75,0.0,0.75,0.75,...,0.5,0.25,0.25,0.25,0.5,0.333333,0.5,0.333333,0.5,0.333333
n00691bec80d3e02,1,train,1.0,0.5,0.5,0.75,0.0,1.0,0.25,1.0,...,0.5,0.5,0.5,0.5,0.666667,0.5,0.5,0.5,0.666667,0.5
n00b8720a2fdc4f2,1,train,1.0,0.75,1.0,1.0,0.0,0.0,1.0,0.5,...,0.5,0.5,0.5,0.5,0.666667,0.5,0.5,0.5,0.666667,0.5


In [6]:
# Summarise the training set as it is at this point (before the subsample below).
# nunique(dropna=False) counts the same values as len(unique()).
print("number of eras:", training_set["era"].nunique(dropna=False))
print("number of rows:", len(training_set))

# Feature columns are picked by name: every column whose name contains "feature_".
feature_names = [f for f in training_set.columns if "feature_" in f]

# Subsample to speed up and save memory.
# - random_state fixes the draw so a fresh run trains on the same rows.
# - n is clamped to the number of available rows so sample() cannot raise
#   when the frame has fewer than 100000 rows.
# NOTE: this rebinds training_set, so re-running the cell reports the
# statistics of the subsample rather than of the full data.
training_set = training_set.sample(n=min(100000, len(training_set)), random_state=42)

number of eras: 574
number of rows: 2412105


In [7]:
# Baseline model: fit scikit-learn's LinearRegression on the feature columns
# against the "target" column of the training set.
from sklearn.linear_model import LinearRegression

train_features = training_set[feature_names]
train_target = training_set["target"]

model = LinearRegression()
model.fit(train_features, train_target)

LinearRegression()

In [8]:
# Fetch the validation split used to test our models.
validation_path = "data/numerai_validation_data.parquet"
napi.download_dataset("numerai_validation_data.parquet", validation_path)

# Read the downloaded parquet file into a DataFrame.
validation_set = pd.read_parquet(validation_path)

2022-02-04 15:11:52,239 INFO numerapi.utils: starting download
data/numerai_validation_data.parquet: 228MB [00:44, 5.08MB/s]                           


In [22]:
predictions = model.predict(validation_set[feature_names])

# validation correlation by era
validation_set["prediction"] = predictions
era_correlations = validation_set.groupby("era").apply(
    lambda era: np.corrcoef(era["prediction"], era["target"])[0, 1]
)

!pip install --upgrade plotly

import plotly.express as px

# plot era correlations bar graph, each bar is a correlation between prediction and target by era
fig = px.bar(era_correlations)
fig.show()



In [23]:
# Running total of the per-era correlations.
# Era correlations are used to calculate returns, so their cumulative sum
# estimates the expected returns without compounding.

cum_sum = era_correlations.cumsum()

fig = px.bar(cum_sum)
fig.show()

data/numerai_training_data.parquet:   0%|          | 0.00/1.01G [32:28<?, ?B/s]


In [24]:
# lets train a catboost model to see if it can beat the linear regression model
# https://catboost.ai/

!pip install --upgrade catboost
import catboost

model = catboost.CatBoostRegressor(
    iterations=1000,
    learning_rate=0.01,
    depth=6,
)
model.fit(training_set[feature_names], training_set["target"])
predictions = model.predict(validation_set[feature_names])

# validation correlation by era
validation_set["prediction"] = predictions
era_correlations = validation_set.groupby("era").apply(
    lambda era: np.corrcoef(era["prediction"], era["target"])[0, 1]
)

!pip install --upgrade plotly

import plotly.express as px
# plot era correlations bar graph

fig = px.bar(era_correlations)
fig.show()


Collecting catboost
  Downloading catboost-1.0.4-cp37-none-manylinux1_x86_64.whl (76.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m0m eta [36m0:00:01[0m[36m0:00:01[0m
Collecting graphviz
  Downloading graphviz-0.19.1-py3-none-any.whl (46 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.3/46.3 KB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.0.4 graphviz-0.19.1
0:	learn: 0.2241412	total: 92.3ms	remaining: 1m 32s
1:	learn: 0.2241362	total: 141ms	remaining: 1m 10s
2:	learn: 0.2241318	total: 188ms	remaining: 1m 2s
3:	learn: 0.2241285	total: 229ms	remaining: 57s
4:	learn: 0.2241240	total: 275ms	remaining: 54.7s
5:	learn: 0.2241190	total: 326ms	remaining: 54s
6:	learn: 0.2241153	total: 376ms	remaining: 53.4s
7:	learn: 0.2241108	total: 421ms	remaining: 52.2s
8:	learn: 0.2241067	total: 

In [25]:
# Running total of the per-era correlations for the most recent predictions.
# we can see that the catboost model is better than the linear regression model
cum_sum = era_correlations.cumsum()

fig = px.bar(cum_sum)
fig.show()