In [1]:
from dotenv import dotenv_values
from lightgbm import LGBMRegressor
import gc
import json
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Dataset

from numerapi import NumerAPI 
from utils import (
    save_model,
    load_model,
    neutralize,
    validation_metrics,
    ERA_COL,
    DATA_TYPE_COL,
    TARGET_COL,
    EXAMPLE_PREDS_COL,
)

# Authenticate





In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using {device} device")

Using cuda device


In [3]:
env_vars = dotenv_values('.env')
napi = NumerAPI(env_vars['PUBLIC_ID'], env_vars['PRIVATE_KEY'])
current_round = napi.get_current_round()
#Dataset path
version = "v4.1"
feature_set_name = "medium"
train_path = f"{version}/train.parquet"
validation_path = f"{version}/validation.parquet"



In [4]:
napi.download_dataset(f"{version}/train.parquet")
napi.download_dataset(f"{version}/validation.parquet")
napi.download_dataset(f"{version}/live.parquet", f"{version}/live_{current_round}.parquet")

napi.download_dataset(f"{version}/validation_example_preds.parquet")
napi.download_dataset(f"{version}/features.json")

2023-07-18 10:01:22,822 INFO numerapi.utils: target file already exists
2023-07-18 10:01:22,822 INFO numerapi.utils: download complete
2023-07-18 10:01:24,058 INFO numerapi.utils: target file already exists
2023-07-18 10:01:24,058 INFO numerapi.utils: download complete
2023-07-18 10:01:25,384 INFO numerapi.utils: target file already exists
2023-07-18 10:01:25,384 INFO numerapi.utils: download complete
2023-07-18 10:01:26,518 INFO numerapi.utils: target file already exists
2023-07-18 10:01:26,519 INFO numerapi.utils: download complete
2023-07-18 10:01:27,715 INFO numerapi.utils: target file already exists
2023-07-18 10:01:27,716 INFO numerapi.utils: download complete


'v4.1/features.json'



In [5]:
with open(f"{version}/features.json", "r") as f:
    feature_metadata = json.load(f)

features = feature_metadata["feature_sets"][
    feature_set_name
] 
target_cols = feature_metadata["targets"]
read_columns = features + target_cols + [ERA_COL, DATA_TYPE_COL]



In [6]:
training_data = pd.read_parquet(
    f"{version}/train.parquet", columns=read_columns, engine='fastparquet'
)
validation_data = pd.read_parquet(
    f"{version}/validation.parquet", columns=read_columns, engine='fastparquet'
)
live_data = pd.read_parquet(f"{version}/live_{current_round}.parquet", columns=read_columns, engine='fastparquet'
)



For Final model comment this line

In [7]:
# reduce the number of eras to every 4th era to speed things up... uncomment these lines to speed things up.
every_4th_era = training_data[ERA_COL].unique()[::4]
training_data = training_data[training_data[ERA_COL].isin(every_4th_era)]
every_4th_era = validation_data[ERA_COL].unique()[::4]
validation_data = validation_data[validation_data[ERA_COL].isin(every_4th_era)]



In [8]:
# get all the data to possibly use for training
all_data = pd.concat([training_data, validation_data])
all_data[features] = all_data[features].fillna(all_data[features].median(skipna=True))
live_data[features] = live_data[features].fillna(
    all_data[features].median(skipna=True)
) # use the training data median to fill in live data missing values
training_index = training_data.index
validation_index = validation_data.index
all_index = all_data.index



In [9]:
print(f"all_data\n\n")
print(f"shape: {all_data.shape}\n {all_data.columns}\n {all_data.describe}")

all_data


shape: (1243809, 680)
 Index(['feature_abating_unadaptable_weakfish',
       'feature_ablest_mauritanian_elding',
       'feature_acclimatisable_unfeigned_maghreb',
       'feature_accommodable_crinite_cleft',
       'feature_accretive_sorrier_skedaddle',
       'feature_acetose_periotic_coronation',
       'feature_additive_untrustworthy_hierologist',
       'feature_adsorbed_blizzardy_burlesque',
       'feature_affettuoso_taxidermic_greg', 'feature_afoul_valvate_faery',
       ...
       'target_cyrus_v4_20', 'target_cyrus_v4_60', 'target_caroline_v4_20',
       'target_caroline_v4_60', 'target_sam_v4_20', 'target_sam_v4_60',
       'target_xerxes_v4_20', 'target_xerxes_v4_60', 'era', 'data_type'],
      dtype='object', length=680)
 <bound method NDFrame.describe of                   feature_abating_unadaptable_weakfish  \
id                                                       
n003bba8a98662e4                                  0.00   
n003bee128c2fcfc                   

In [10]:
# delete training and validation data to save space
del training_data
del validation_data
gc.collect()  # clear up memory

0



# Model Stuff

In [11]:
param_id = "lg_lgbm"
params = {
    "n_estimators": 20000,
    "learning_rate": 0.001,
    "max_depth": 6,
    "num_leaves": 2**6,
    "colsample_bytree": 0.1,
    "objective": "regression",
}



In [12]:
feature = []
target_cols = []
prediction_cols = []
for col in all_data:
    if col.startswith("feature"):
        feature.append(col)
    elif col.startswith("target"):
        target_cols.append(col)

print(target_cols)

for target in tqdm(target_cols):
    prediction_col = f"{param_id}_{version}_{feature_set_name}_{target}"
    train_data_model_name = f"train_data_{prediction_col}"
    print(f"Checking for existing model '{train_data_model_name}'")
    train_model = load_model(train_data_model_name)
    if not train_model:
        print(f"Training model '{train_data_model_name}'")
        train_model = LGBMRegressor(**params)
        train_model.fit(
            all_data.loc[training_index, feature],
            all_data.loc[training_index, target],
            eval_set=[(all_data.loc[validation_index, feature], all_data.loc[validation_index, target])],
            early_stopping_rounds=200,
            verbose=100,
        )
        print(f"saving new model: {train_data_model_name}")
        save_model(train_model, train_data_model_name)

    all_data_model_name = f"all_data_{prediction_col}"
    print(f"Checking for existing model '{all_data_model_name}'")
    all_data_model = load_model(all_data_model_name)
    if not all_data_model:
        print(f"model not found, creating new one")
        all_data_model = LGBMRegressor(**params)
        all_data_model.fit(
            all_data.loc[all_index, features],
            all_data.loc[all_index, target],
        )
        print(f"saving new model: {all_data_model_name}")
        save_model(all_data_model, all_data_model_name)
    
        # predict on live data
    live_data[prediction_col] = all_data_model.predict(
        live_data[features].fillna(np.nan)
    )  # filling live data with nans makes us ignore those features if necessary
    gc.collect()

    prediction_cols.append(prediction_col)

['target', 'target_nomi_v4_20', 'target_nomi_v4_60', 'target_tyler_v4_20', 'target_tyler_v4_60', 'target_victor_v4_20', 'target_victor_v4_60', 'target_ralph_v4_20', 'target_ralph_v4_60', 'target_waldo_v4_20', 'target_waldo_v4_60', 'target_jerome_v4_20', 'target_jerome_v4_60', 'target_janet_v4_20', 'target_janet_v4_60', 'target_ben_v4_20', 'target_ben_v4_60', 'target_alan_v4_20', 'target_alan_v4_60', 'target_paul_v4_20', 'target_paul_v4_60', 'target_george_v4_20', 'target_george_v4_60', 'target_william_v4_20', 'target_william_v4_60', 'target_arthur_v4_20', 'target_arthur_v4_60', 'target_thomas_v4_20', 'target_thomas_v4_60', 'target_cyrus_v4_20', 'target_cyrus_v4_60', 'target_caroline_v4_20', 'target_caroline_v4_60', 'target_sam_v4_20', 'target_sam_v4_60', 'target_xerxes_v4_20', 'target_xerxes_v4_60']


  0%|          | 0/37 [00:00<?, ?it/s]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target'


  live_data[prediction_col] = all_data_model.predict(
  3%|▎         | 1/37 [00:02<01:44,  2.89s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_nomi_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_nomi_v4_20'


  live_data[prediction_col] = all_data_model.predict(
  5%|▌         | 2/37 [00:05<01:40,  2.87s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_nomi_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_nomi_v4_60'


  live_data[prediction_col] = all_data_model.predict(
  8%|▊         | 3/37 [00:08<01:41,  2.98s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_tyler_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_tyler_v4_20'


  live_data[prediction_col] = all_data_model.predict(
 11%|█         | 4/37 [00:10<01:26,  2.63s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_tyler_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_tyler_v4_60'


  live_data[prediction_col] = all_data_model.predict(
 14%|█▎        | 5/37 [00:12<01:17,  2.41s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_victor_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_victor_v4_20'


  live_data[prediction_col] = all_data_model.predict(
 16%|█▌        | 6/37 [00:15<01:11,  2.31s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_victor_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_victor_v4_60'


  live_data[prediction_col] = all_data_model.predict(
 19%|█▉        | 7/37 [00:18<01:23,  2.79s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_ralph_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_ralph_v4_20'


  live_data[prediction_col] = all_data_model.predict(
 22%|██▏       | 8/37 [00:22<01:25,  2.94s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_ralph_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_ralph_v4_60'


  live_data[prediction_col] = all_data_model.predict(
 24%|██▍       | 9/37 [00:25<01:29,  3.18s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_waldo_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_waldo_v4_20'


  live_data[prediction_col] = all_data_model.predict(
 27%|██▋       | 10/37 [00:27<01:16,  2.82s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_waldo_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_waldo_v4_60'


  live_data[prediction_col] = all_data_model.predict(
 30%|██▉       | 11/37 [00:29<01:06,  2.56s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_jerome_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_jerome_v4_20'


  live_data[prediction_col] = all_data_model.predict(
 32%|███▏      | 12/37 [00:32<01:01,  2.47s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_jerome_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_jerome_v4_60'


  live_data[prediction_col] = all_data_model.predict(
 35%|███▌      | 13/37 [00:34<00:56,  2.34s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_janet_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_janet_v4_20'


  live_data[prediction_col] = all_data_model.predict(
 38%|███▊      | 14/37 [00:37<01:01,  2.68s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_janet_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_janet_v4_60'


  live_data[prediction_col] = all_data_model.predict(
 41%|████      | 15/37 [00:40<01:02,  2.86s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_ben_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_ben_v4_20'


  live_data[prediction_col] = all_data_model.predict(
 43%|████▎     | 16/37 [00:43<01:00,  2.90s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_ben_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_ben_v4_60'


  live_data[prediction_col] = all_data_model.predict(
 46%|████▌     | 17/37 [00:45<00:53,  2.67s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_alan_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_alan_v4_20'


  live_data[prediction_col] = all_data_model.predict(
 49%|████▊     | 18/37 [00:48<00:48,  2.53s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_alan_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_alan_v4_60'


  live_data[prediction_col] = all_data_model.predict(
 51%|█████▏    | 19/37 [00:50<00:44,  2.45s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_paul_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_paul_v4_20'


  live_data[prediction_col] = all_data_model.predict(
 54%|█████▍    | 20/37 [00:52<00:41,  2.42s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_paul_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_paul_v4_60'


  live_data[prediction_col] = all_data_model.predict(
 57%|█████▋    | 21/37 [00:55<00:37,  2.37s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_george_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_george_v4_20'


  live_data[prediction_col] = all_data_model.predict(
 59%|█████▉    | 22/37 [00:57<00:34,  2.29s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_george_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_george_v4_60'


  live_data[prediction_col] = all_data_model.predict(
 62%|██████▏   | 23/37 [00:59<00:32,  2.30s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_william_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_william_v4_20'


  live_data[prediction_col] = all_data_model.predict(
 65%|██████▍   | 24/37 [01:01<00:29,  2.27s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_william_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_william_v4_60'


  live_data[prediction_col] = all_data_model.predict(
 68%|██████▊   | 25/37 [01:03<00:27,  2.26s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_arthur_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_arthur_v4_20'


  live_data[prediction_col] = all_data_model.predict(
 70%|███████   | 26/37 [01:07<00:27,  2.53s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_arthur_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_arthur_v4_60'


  live_data[prediction_col] = all_data_model.predict(
 73%|███████▎  | 27/37 [01:09<00:24,  2.45s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_thomas_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_thomas_v4_20'


  live_data[prediction_col] = all_data_model.predict(
 76%|███████▌  | 28/37 [01:11<00:21,  2.36s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_thomas_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_thomas_v4_60'


  live_data[prediction_col] = all_data_model.predict(
 78%|███████▊  | 29/37 [01:14<00:19,  2.47s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_cyrus_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_cyrus_v4_20'


  live_data[prediction_col] = all_data_model.predict(
 81%|████████  | 30/37 [01:18<00:20,  2.87s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_cyrus_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_cyrus_v4_60'


  live_data[prediction_col] = all_data_model.predict(
 84%|████████▍ | 31/37 [01:20<00:16,  2.67s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_caroline_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_caroline_v4_20'


  live_data[prediction_col] = all_data_model.predict(
 86%|████████▋ | 32/37 [01:22<00:12,  2.46s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_caroline_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_caroline_v4_60'


  live_data[prediction_col] = all_data_model.predict(
 89%|████████▉ | 33/37 [01:24<00:09,  2.33s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_sam_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_sam_v4_20'


  live_data[prediction_col] = all_data_model.predict(
 92%|█████████▏| 34/37 [01:26<00:06,  2.21s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_sam_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_sam_v4_60'


  live_data[prediction_col] = all_data_model.predict(
 95%|█████████▍| 35/37 [01:28<00:04,  2.13s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_xerxes_v4_20'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_xerxes_v4_20'


  live_data[prediction_col] = all_data_model.predict(
 97%|█████████▋| 36/37 [01:30<00:02,  2.08s/it]

Checking for existing model 'train_data_lg_lgbm_v4.1_medium_target_xerxes_v4_60'
Checking for existing model 'all_data_lg_lgbm_v4.1_medium_target_xerxes_v4_60'


  live_data[prediction_col] = all_data_model.predict(
100%|██████████| 37/37 [01:31<00:00,  2.49s/it]




In [13]:
# make an ensemble -- Aggregating all models
print(all_data.columns)
all_data.loc[:, "equal_weight"] = all_data[prediction_cols].mean(axis=1)
live_data["equal_weight"] = live_data[prediction_cols].mean(axis=1)

prediction_cols.append("equal_weight")

#50% neutralized ensemble - So we can reduce dependency on certain features - Reducing overfitting
all_data["half_neutral_equal_weight"] = neutralize(
    df=all_data.loc[validation_index, :],
    columns=[f"equal_weight"],
    neutralizers=features,
    proportion=0.5,
    normalize=True,
    era_col=ERA_COL,
    verbose=True,
)
# do the same for live data
live_data["half_neutral_equal_weight"] = neutralize(
    df=live_data,
    columns=[f"equal_weight"],
    neutralizers=features,
    proportion=0.5,
    normalize=True,
    era_col=ERA_COL,
    verbose=True,
)

Index(['feature_abating_unadaptable_weakfish',
       'feature_ablest_mauritanian_elding',
       'feature_acclimatisable_unfeigned_maghreb',
       'feature_accommodable_crinite_cleft',
       'feature_accretive_sorrier_skedaddle',
       'feature_acetose_periotic_coronation',
       'feature_additive_untrustworthy_hierologist',
       'feature_adsorbed_blizzardy_burlesque',
       'feature_affettuoso_taxidermic_greg', 'feature_afoul_valvate_faery',
       ...
       'target_cyrus_v4_20', 'target_cyrus_v4_60', 'target_caroline_v4_20',
       'target_caroline_v4_60', 'target_sam_v4_20', 'target_sam_v4_60',
       'target_xerxes_v4_20', 'target_xerxes_v4_60', 'era', 'data_type'],
      dtype='object', length=680)


KeyError: "None of [Index(['lg_lgbm_v4.1_medium_target', 'lg_lgbm_v4.1_medium_target_nomi_v4_20',\n       'lg_lgbm_v4.1_medium_target_nomi_v4_60',\n       'lg_lgbm_v4.1_medium_target_tyler_v4_20',\n       'lg_lgbm_v4.1_medium_target_tyler_v4_60',\n       'lg_lgbm_v4.1_medium_target_victor_v4_20',\n       'lg_lgbm_v4.1_medium_target_victor_v4_60',\n       'lg_lgbm_v4.1_medium_target_ralph_v4_20',\n       'lg_lgbm_v4.1_medium_target_ralph_v4_60',\n       'lg_lgbm_v4.1_medium_target_waldo_v4_20',\n       'lg_lgbm_v4.1_medium_target_waldo_v4_60',\n       'lg_lgbm_v4.1_medium_target_jerome_v4_20',\n       'lg_lgbm_v4.1_medium_target_jerome_v4_60',\n       'lg_lgbm_v4.1_medium_target_janet_v4_20',\n       'lg_lgbm_v4.1_medium_target_janet_v4_60',\n       'lg_lgbm_v4.1_medium_target_ben_v4_20',\n       'lg_lgbm_v4.1_medium_target_ben_v4_60',\n       'lg_lgbm_v4.1_medium_target_alan_v4_20',\n       'lg_lgbm_v4.1_medium_target_alan_v4_60',\n       'lg_lgbm_v4.1_medium_target_paul_v4_20',\n       'lg_lgbm_v4.1_medium_target_paul_v4_60',\n       'lg_lgbm_v4.1_medium_target_george_v4_20',\n       'lg_lgbm_v4.1_medium_target_george_v4_60',\n       'lg_lgbm_v4.1_medium_target_william_v4_20',\n       'lg_lgbm_v4.1_medium_target_william_v4_60',\n       'lg_lgbm_v4.1_medium_target_arthur_v4_20',\n       'lg_lgbm_v4.1_medium_target_arthur_v4_60',\n       'lg_lgbm_v4.1_medium_target_thomas_v4_20',\n       'lg_lgbm_v4.1_medium_target_thomas_v4_60',\n       'lg_lgbm_v4.1_medium_target_cyrus_v4_20',\n       'lg_lgbm_v4.1_medium_target_cyrus_v4_60',\n       'lg_lgbm_v4.1_medium_target_caroline_v4_20',\n       'lg_lgbm_v4.1_medium_target_caroline_v4_60',\n       'lg_lgbm_v4.1_medium_target_sam_v4_20',\n       'lg_lgbm_v4.1_medium_target_sam_v4_60',\n       'lg_lgbm_v4.1_medium_target_xerxes_v4_20',\n       'lg_lgbm_v4.1_medium_target_xerxes_v4_60'],\n      dtype='object')] are in the [columns]"



In [None]:
prediction_cols.append("half_neutral_equal_weight")

model_to_submit = f"half_neutral_equal_weight"
# rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
all_data.loc[validation_index, "prediction"] = all_data.loc[
    validation_index, model_to_submit
].rank(pct=True)
live_data["prediction"] = live_data[model_to_submit].rank(pct=True)
all_data.loc[validation_index, "prediction"].to_csv(
    f"validation_predictions_{current_round}.csv"
)
live_data["prediction"].to_csv(f"live_predictions_{current_round}.csv")

validation_example_preds = pd.read_parquet(
    f"{version}/validation_example_preds.parquet"
)
all_data.loc[validation_index, EXAMPLE_PREDS_COL] = validation_example_preds[
    "prediction"
]
# fast_mode=True so that we skip some of the stats that are slower to calculate
validation_stats = validation_metrics(
    all_data.loc[validation_index, :],
    prediction_cols,
    example_col=EXAMPLE_PREDS_COL,
    fast_mode=True,
    target_col=TARGET_COL,
)
print(validation_stats[["mean", "sharpe"]].to_markdown())



In [None]:
print(
    f"""
Done! Next steps:
    1. Go to numer.ai/tournament (make sure you have an account)
    2. Submit validation_predictions_{current_round}.csv to the diagnostics tool
    3. Submit tournament_predictions_{current_round}.csv to the "Upload Predictions" button
"""
)