In [55]:
from dotenv import dotenv_values
from lightgbm import LGBMRegressor
import gc
import json
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Dataset

from numerapi import NumerAPI 
from utils import (
    save_model,
    load_model,
    neutralize,
    validation_metrics,
    ERA_COL,
    DATA_TYPE_COL,
    TARGET_COL,
    EXAMPLE_PREDS_COL,
)

# Authenticate





In [56]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using {device} device")

Using cpu device


In [57]:
env_vars = dotenv_values('.env')
napi = NumerAPI(env_vars['PUBLIC_ID'], env_vars['PRIVATE_KEY'])
current_round = napi.get_current_round()
#Dataset path
version = "v4.1"
feature_set_name = "medium"
train_path = f"{version}/train.parquet"
validation_path = f"{version}/validation.parquet"



In [58]:
napi.download_dataset(f"{version}/train.parquet")
napi.download_dataset(f"{version}/validation.parquet")
napi.download_dataset(f"{version}/live.parquet", f"{version}/live_{current_round}.parquet")

napi.download_dataset(f"{version}/validation_example_preds.parquet")
napi.download_dataset(f"{version}/features.json")

2023-07-15 16:13:43,898 INFO numerapi.utils: target file already exists
2023-07-15 16:13:43,899 INFO numerapi.utils: download complete
2023-07-15 16:13:45,075 INFO numerapi.utils: target file already exists
2023-07-15 16:13:45,075 INFO numerapi.utils: resuming download

v4.1/validation.parquet:   0%|          | 0.00/1.59G [00:00<?, ?B/s]
v4.1/validation.parquet: 1.64GB [00:15, 9.61GB/s]                            
v4.1/validation.parquet:   5%|▌         | 85.6M/1.59G [00:02<01:03, 23.8MB/s]
v4.1/validation.parquet:   6%|▌         | 99.3M/1.59G [00:03<01:08, 21.7MB/s]
v4.1/validation.parquet:   7%|▋         | 108M/1.59G [00:04<01:13, 20.2MB/s] 
v4.1/validation.parquet:   7%|▋         | 113M/1.59G [00:04<01:16, 19.4MB/s]
v4.1/validation.parquet:   7%|▋         | 117M/1.59G [00:04<01:16, 19.2MB/s]
v4.1/validation.parquet:   8%|▊         | 121M/1.59G [00:05<01:18, 18.8MB/s]
v4.1/validation.parquet:   8%|▊         | 123M/1.59G [00:05<01:19, 18.5MB/s]
v4.1/validation.parquet:   8%|▊         

'v4.1/features.json'



In [59]:
with open(f"{version}/features.json", "r") as f:
    feature_metadata = json.load(f)

features = feature_metadata["feature_sets"][
    feature_set_name
] 
target_cols = feature_metadata["targets"]
read_columns = features + target_cols + [ERA_COL, DATA_TYPE_COL]



In [60]:
training_data = pd.read_parquet(
    f"{version}/train.parquet", columns=read_columns, engine='fastparquet'
)
validation_data = pd.read_parquet(
    f"{version}/validation.parquet", columns=read_columns, engine='fastparquet'
)
live_data = pd.read_parquet(f"{version}/live_{current_round}.parquet", columns=read_columns, engine='fastparquet'
)



For Final model comment this line

In [61]:
# reduce the number of eras to every 4th era to speed things up... uncomment these lines to speed things up.
every_4th_era = training_data[ERA_COL].unique()[::4]
training_data = training_data[training_data[ERA_COL].isin(every_4th_era)]
every_4th_era = validation_data[ERA_COL].unique()[::4]
validation_data = validation_data[validation_data[ERA_COL].isin(every_4th_era)]



In [62]:
# get all the data to possibly use for training
all_data = pd.concat([training_data, validation_data])
all_data[features] = all_data[features].fillna(all_data[features].median(skipna=True))
live_data[features] = live_data[features].fillna(
    all_data[features].median(skipna=True)
) # use the training data median to fill in live data missing values
training_index = training_data.index
validation_index = validation_data.index
all_index = all_data.index

v4.1/validation.parquet: 1.64GB [02:18, 11.9MB/s]




In [63]:
print(f"all_data\n\n")
print(f"shape: {all_data.shape}\n {all_data.columns}\n {all_data.describe}")

all_data


shape: (1243809, 680)
 Index(['feature_abating_unadaptable_weakfish',
       'feature_ablest_mauritanian_elding',
       'feature_acclimatisable_unfeigned_maghreb',
       'feature_accommodable_crinite_cleft',
       'feature_accretive_sorrier_skedaddle',
       'feature_acetose_periotic_coronation',
       'feature_additive_untrustworthy_hierologist',
       'feature_adsorbed_blizzardy_burlesque',
       'feature_affettuoso_taxidermic_greg', 'feature_afoul_valvate_faery',
       ...
       'target_cyrus_v4_20', 'target_cyrus_v4_60', 'target_caroline_v4_20',
       'target_caroline_v4_60', 'target_sam_v4_20', 'target_sam_v4_60',
       'target_xerxes_v4_20', 'target_xerxes_v4_60', 'era', 'data_type'],
      dtype='object', length=680)
 <bound method NDFrame.describe of                   feature_abating_unadaptable_weakfish  \
id                                                       
n003bba8a98662e4                                  0.00   
n003bee128c2fcfc                   

In [64]:
# delete training and validation data to save space
del training_data
del validation_data
gc.collect()  # clear up memory

0



# Model Stuff