# Numerai — Dataset download & quick EDA

This notebook downloads the Numerai training dataset (v5.2 by default), optionally caches it locally, and runs a lightweight exploratory pass.

**Sections**
- Setup & configuration
- Data download / load
- Quick EDA



In [None]:
# !pip install numerapi
# !pip install ydata_profiling
# !pip install seaborn
# !pip install pandas
# !pip install NumerAPI
# !pip install pyarrow
# !pip install fastparquet
# !pip install lightgbm
# !pip install scikit-learn

In [2]:
from numerapi import NumerAPI
import pandas as pd
import json
from ydata_profiling import ProfileReport
import seaborn as sns
import lightgbm as lgb
import sklearn
import numerblox
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit
from numerblox.meta import CrossValEstimator, make_meta_pipeline
from numerblox.ensemble import NumeraiEnsemble, PredictionReducer
from numerblox.download import NumeraiClassicDownloader
from numerblox.numerframe import create_numerframe
from numerblox.neutralizers import FeatureNeutralizer


In [3]:
# Setup & configuration
napi = NumerAPI()

In [4]:
all_datasets = napi.list_datasets()
dataset_versions = list(set(d.split('/')[0] for d in all_datasets))
dataset_versions = sorted(dataset_versions, reverse=True)
print("Available versions:\n", dataset_versions)

Available versions:
 ['v5.2', 'v5.1', 'v5.0']


In [5]:
DATASET_VERSION = dataset_versions[0]

In [6]:
current_version_files = [f for f in all_datasets if f.startswith(DATASET_VERSION)]
print("Available", DATASET_VERSION, "files:\n", current_version_files)

Available v5.2 files:
 ['v5.2/features.json', 'v5.2/live.parquet', 'v5.2/live_benchmark_models.parquet', 'v5.2/live_example_preds.csv', 'v5.2/live_example_preds.parquet', 'v5.2/meta_model.parquet', 'v5.2/train.parquet', 'v5.2/train_benchmark_models.parquet', 'v5.2/validation.parquet', 'v5.2/validation_benchmark_models.parquet', 'v5.2/validation_example_preds.csv', 'v5.2/validation_example_preds.parquet']


In [7]:
#napi.download_dataset(f'{DATASET_VERSION}/features.json')
feature_metadata = json.load(open('v5.2/features.json'))
for metadata in feature_metadata:
  print(metadata, len(feature_metadata[metadata]))

feature_sets 18
targets 41


In [8]:
feature_sets = feature_metadata["feature_sets"]
for feature_set in ["small", "medium", "all"]:
  print(feature_set, len(feature_sets[feature_set]))

small 42
medium 780
all 2748


In [None]:
# downloader = NumeraiClassicDownloader('v5.2')
# #downloader.download_training_data("train_val", version=DATASET_VERSION)

In [9]:
data = create_numerframe('v5.2/train.parquet', columns=["era", "target"] + feature_sets['small'])

In [None]:
# napi.download_dataset(f'{DATASET_VERSION}/train.parquet')
# data = pd.read_parquet(
#     path='./v5.2/train.parquet',columns=["era", "target"] + feature_sets['small']
# )

In [10]:
data

Unnamed: 0_level_0,era,target,feature_antistrophic_striate_conscriptionist,feature_bicameral_showery_wallaba,feature_bridal_fingered_pensioner,feature_collectivist_flaxen_gueux,feature_concurring_fabled_adapter,feature_crosscut_whilom_ataxy,feature_departmental_inimitable_sentencer,feature_dialectal_homely_cambodia,...,feature_tridactyl_immoral_snorting,feature_trimeter_soggy_greatest,feature_unanalyzable_excusable_whirlwind,feature_unbreakable_constraining_hegelianism,feature_unformed_bent_smatch,feature_unministerial_unextenuated_teleostean,feature_unmodish_zymogenic_rousing,feature_unsystematized_subcardinal_malaysia,feature_willful_sere_chronobiology,feature_zoological_peristomial_scute
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n0007b5abb0c3a25,0001,0.25,2,2,2,2,2,0,1,2,...,4,1,1,3,0,2,2,3,3,2
n003bba8a98662e4,0001,0.50,2,2,2,2,2,1,4,2,...,4,2,0,0,0,2,2,4,4,2
n003bee128c2fcfc,0001,1.00,2,2,2,2,2,2,2,2,...,3,1,1,0,1,2,2,0,3,2
n0048ac83aff7194,0001,0.25,2,2,2,2,2,1,4,2,...,1,3,4,1,2,2,2,2,0,2
n0055a2401ba6480,0001,0.50,2,2,2,2,2,0,0,2,...,1,0,1,0,0,2,2,1,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nffcc1dbdf2212e6,0574,0.50,1,3,0,0,0,4,3,1,...,2,3,4,2,4,3,2,3,3,3
nffde3b371d67394,0574,0.50,2,3,4,4,2,2,4,2,...,2,4,4,3,4,2,1,4,4,2
nfff1a1111b35e84,0574,0.50,0,0,2,1,4,0,0,4,...,4,0,0,0,0,4,4,0,0,2
nfff2bd38e397265,0574,0.75,2,4,0,4,0,4,4,4,...,1,4,4,0,4,1,2,4,1,4


In [11]:
nan_rows = data[data['target'].isna()]
print(nan_rows)

                   era  target  feature_antistrophic_striate_conscriptionist  \
id                                                                             
n1b7cb7d2e61e62f  0001     NaN                                             2   
n1bbddbf04570ab8  0001     NaN                                             2   

                  feature_bicameral_showery_wallaba  \
id                                                    
n1b7cb7d2e61e62f                                  2   
n1bbddbf04570ab8                                  2   

                  feature_bridal_fingered_pensioner  \
id                                                    
n1b7cb7d2e61e62f                                  2   
n1bbddbf04570ab8                                  2   

                  feature_collectivist_flaxen_gueux  \
id                                                    
n1b7cb7d2e61e62f                                  2   
n1bbddbf04570ab8                                  2   

                

In [12]:
data['target'].fillna(data['target'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['target'].fillna(data['target'].mode()[0], inplace=True)


In [13]:
model = XGBClassifier()
crossval = CrossValEstimator(estimator=model, cv=TimeSeriesSplit(n_splits=5), predict_func='predict_proba')
pred_rud = PredictionReducer(n_models=5, n_classes=5)
ens = NumeraiEnsemble(donate_weighted=True)
neut = FeatureNeutralizer(proportion=0.5)
full_pipe = make_meta_pipeline(crossval, pred_rud, ens, neut)

In [14]:
X, y = data.get_feature_target_pair(multi_target=False)
y_int = (y * 4).astype(int)  # Convert targets to integer classes for classification
era_series = data.get_era_data
features = data.get_feature_data
full_pipe.fit(X, y_int, era_series=era_series)

: 

In [None]:
model = lgb.LGBMRegressor(
  n_estimators=2000,
  learning_rate=0.01,
  max_depth=5,
  num_leaves=2**5-1,
  colsample_bytree=0.1
)

In [None]:
feature_set = feature_sets["small"]

In [None]:
model.fit(
  data[feature_set],
  data["target"]
)