# Numerai — Dataset download & quick EDA

This notebook downloads the Numerai training dataset (v5.2 by default), reuses the local copy when one has already been downloaded, runs a lightweight exploratory pass, and fits a baseline LightGBM model on the `small` feature set.

**Sections**
- Setup & configuration
- Data download / load
- Quick EDA
- Baseline model (LightGBM)



In [20]:
# One-time environment setup. Everything below is intentionally commented out:
# uncomment only the packages missing from your environment, run the cell once,
# then comment them out again so a full re-run does not reinstall anything.
# Tip: `%pip install ...` targets the running kernel's environment; `!pip` may not.
# !pip install numerapi
# !pip install ydata_profiling
# !pip install seaborn
# !pip install pandas
# NOTE: `NumerAPI` below is the same package as `numerapi` above
# (pip treats project names case-insensitively), so this line is redundant.
# !pip install NumerAPI
# !pip install pyarrow
# !pip install fastparquet
# !pip install lightgbm
# !pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.8.0 threadpoolctl-3.6.0
[0m

In [1]:
from numerapi import NumerAPI
import pandas as pd
import json
from ydata_profiling import ProfileReport
import seaborn as sns
import lightgbm as lgb
import sklearn

In [2]:
# Setup & configuration
# Dataset release to work with; used as the path prefix when listing,
# downloading and loading dataset files.
DATASET_VERSION = "v5.2"

# API client constructed without arguments (no credentials are passed here).
napi = NumerAPI()

In [3]:
# Every dataset file is listed as "<version>/<filename>"; collect the distinct
# version prefixes. Sorting makes the printed order deterministic (a bare
# `set` iterates in arbitrary order, so the output could change between runs).
all_datasets = napi.list_datasets()
dataset_versions = sorted({name.split("/")[0] for name in all_datasets})
print("Available versions:\n", dataset_versions)

Available versions:
 ['v5.2', 'v5.0', 'v5.1']


In [4]:
# Keep only the files that belong to the configured release. The trailing "/"
# anchors the match on the whole version directory, so e.g. "v5.2" cannot
# accidentally match a sibling release such as "v5.20/...".
version_prefix = f"{DATASET_VERSION}/"
current_version_files = [f for f in all_datasets if f.startswith(version_prefix)]
print("Available", DATASET_VERSION, "files:\n", current_version_files)

Available v5.2 files:
 ['v5.2/features.json', 'v5.2/live.parquet', 'v5.2/live_benchmark_models.parquet', 'v5.2/live_example_preds.csv', 'v5.2/live_example_preds.parquet', 'v5.2/meta_model.parquet', 'v5.2/train.parquet', 'v5.2/train_benchmark_models.parquet', 'v5.2/validation.parquet', 'v5.2/validation_benchmark_models.parquet', 'v5.2/validation_example_preds.csv', 'v5.2/validation_example_preds.parquet']


In [5]:
# Download the feature metadata for the configured release and load it.
# The path is built once so the download target and the file we read cannot drift.
features_json_path = f"{DATASET_VERSION}/features.json"
napi.download_dataset(features_json_path)

# Use a context manager so the file handle is closed deterministically
# (the previous `json.load(open(...))` left it open until garbage collection).
with open(features_json_path, encoding="utf-8") as features_file:
    feature_metadata = json.load(features_file)

# Print each top-level metadata section with the number of entries it holds.
for section, entries in feature_metadata.items():
    print(section, len(entries))

2026-01-01 08:14:20,664 INFO numerapi.utils: target file already exists
2026-01-01 08:14:20,665 INFO numerapi.utils: download complete


feature_sets 18
targets 41


In [6]:
# Mapping of feature-set name -> list of feature column names.
feature_sets = feature_metadata["feature_sets"]

# Show how many features each of the commonly used sets contains.
# The loop variable is deliberately NOT called `feature_set`: that name is bound
# later in the notebook to a list of column names, and reusing it here for a
# string would leave a misleading value behind in the kernel namespace.
for set_name in ["small", "medium", "all"]:
    print(set_name, len(feature_sets[set_name]))

small 42
medium 780
all 2748


In [None]:
# Fetch the training split for the configured release.
napi.download_dataset(
    "{}/train.parquet".format(DATASET_VERSION)
)

In [7]:
# Name of the feature set to load; must be a key of `feature_sets`.
wanted_feature_set = "small"

# Read only the columns we need (era, target, and the chosen features).
# The path is derived from DATASET_VERSION rather than hard-coded, so changing
# the release in the configuration cell also changes which file is loaded.
data = pd.read_parquet(
    path=f"{DATASET_VERSION}/train.parquet",
    columns=["era", "target"] + feature_sets[wanted_feature_set],
)

In [8]:
# `DataFrame.info()` prints its summary itself and returns None, so call it
# directly; wrapping it in display() only adds a stray "None" to the output.
data.info()

# Summary statistics for the numeric columns, rendered as a rich table.
display(data.describe())

<class 'pandas.core.frame.DataFrame'>
Index: 2746268 entries, n0007b5abb0c3a25 to nfffed717119d633
Data columns (total 44 columns):
 #   Column                                         Dtype  
---  ------                                         -----  
 0   era                                            object 
 1   target                                         float32
 2   feature_antistrophic_striate_conscriptionist   int8   
 3   feature_bicameral_showery_wallaba              int8   
 4   feature_bridal_fingered_pensioner              int8   
 5   feature_collectivist_flaxen_gueux              int8   
 6   feature_concurring_fabled_adapter              int8   
 7   feature_crosscut_whilom_ataxy                  int8   
 8   feature_departmental_inimitable_sentencer      int8   
 9   feature_dialectal_homely_cambodia              int8   
 10  feature_donnard_groutier_twinkle               int8   
 11  feature_elusive_vapoury_accomplice             int8   
 12  feature_geminate_crummi

None

Unnamed: 0,target,feature_antistrophic_striate_conscriptionist,feature_bicameral_showery_wallaba,feature_bridal_fingered_pensioner,feature_collectivist_flaxen_gueux,feature_concurring_fabled_adapter,feature_crosscut_whilom_ataxy,feature_departmental_inimitable_sentencer,feature_dialectal_homely_cambodia,feature_donnard_groutier_twinkle,...,feature_tridactyl_immoral_snorting,feature_trimeter_soggy_greatest,feature_unanalyzable_excusable_whirlwind,feature_unbreakable_constraining_hegelianism,feature_unformed_bent_smatch,feature_unministerial_unextenuated_teleostean,feature_unmodish_zymogenic_rousing,feature_unsystematized_subcardinal_malaysia,feature_willful_sere_chronobiology,feature_zoological_peristomial_scute
count,2746268.0,2746268.0,2746268.0,2746268.0,2746268.0,2746268.0,2746268.0,2746268.0,2746268.0,2746268.0,...,2746268.0,2746268.0,2746268.0,2746268.0,2746268.0,2746268.0,2746268.0,2746268.0,2746268.0,2746268.0
mean,0.4999478,1.999918,1.999949,1.999928,1.999924,1.999928,1.999914,1.999913,1.999928,1.999928,...,1.999914,1.999913,1.999913,1.999913,1.999915,1.999928,1.999924,1.999915,1.999916,1.999918
std,0.2236927,1.402037,1.096288,1.314663,1.374102,1.320679,1.414359,1.414359,1.320679,1.320679,...,1.414359,1.414359,1.414359,1.414359,1.414359,1.320679,1.374102,1.41436,1.414359,1.402037
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.5,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,0.5,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
75%,0.5,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
max,1.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0


In [12]:
# Baseline gradient-boosted regressor: many trees, small learning rate,
# limited depth.
tree_depth = 5
model = lgb.LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=tree_depth,
    num_leaves=2**tree_depth - 1,  # evaluates to 31
    colsample_bytree=0.1,
)

In [13]:
# Train on the same feature set that was loaded into `data`. Reusing
# `wanted_feature_set` (instead of repeating the "small" literal) keeps the
# training columns in sync with the columns actually read from disk.
feature_set = feature_sets[wanted_feature_set]

In [14]:
# Fit the regressor: selected feature columns as inputs, `target` as the label.
# Left as the cell's last expression so the fitted estimator is displayed.
model.fit(X=data[feature_set], y=data["target"])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005479 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 210
[LightGBM] [Info] Number of data points in the train set: 2746268, number of used features: 42
[LightGBM] [Info] Start training from score 0.499948


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,5
,learning_rate,0.01
,n_estimators,2000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001
