In [1]:
from lightgbm import LGBMRegressor
import gc
import json
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Dataset

from numerapi import NumerAPI
from utils import (
    save_model,
    load_model,
    neutralize,
    validation_metrics,
    ERA_COL,
    DATA_TYPE_COL,
    TARGET_COL,
    EXAMPLE_PREDS_COL,
)




In [2]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [3]:
# --- dataset configuration ---
napi = NumerAPI()
current_round = napi.get_current_round()
version = "v4.1"
feature_set_name = "medium"
train_path = f"{version}/train.parquet"
validation_path = f"{version}/validation.parquet"

# local file names of the int8 datasets; the live file name carries the round number
train_int8_path = f"{version}/train_int8.parquet"
validation_int8_path = f"{version}/validation_int8.parquet"
live_int8_path = f"{version}/live_int8_{current_round}.parquet"
features_json_path = f"{version}/features.json"

# --- download everything (each tuple is the positional argument list of one call) ---
for download_args in (
    (train_int8_path,),
    (validation_int8_path,),
    (f"{version}/live_int8.parquet", live_int8_path),
    (f"{version}/validation_example_preds.parquet",),
    (features_json_path,),
):
    napi.download_dataset(*download_args)

# --- choose the columns to load ---
with open(features_json_path, "r") as features_file:
    feature_metadata = json.load(features_file)

# Alternatives to the named feature set: every key of
# feature_metadata["feature_stats"] (all features), or feature_sets["small"].
features = feature_metadata["feature_sets"][feature_set_name]
target_cols = feature_metadata["targets"]
# only the selected features plus the targets, era and data-type columns are read
read_columns = [*features, *target_cols, ERA_COL, DATA_TYPE_COL]

# --- read the parquet files ---
# If reading fails with an "invalid magic parquet bytes" error the download is
# corrupted: delete the file and rerun napi.download_dataset for it.
training_data, validation_data, live_data = (
    pd.read_parquet(parquet_path, columns=read_columns)
    for parquet_path in (train_int8_path, validation_int8_path, live_int8_path)
)

# train + validation stacked into one frame, so either split (or both) can be used for fitting
all_data = pd.concat([training_data, validation_data])

# remember which rows belong to which split; later selections go through these indices
training_index = training_data.index
validation_index = validation_data.index
all_index = all_data.index

# the per-split frames are no longer needed once the combined frame exists
del training_data, validation_data
gc.collect()  # clear up memory


2023-07-10 10:24:14,502 INFO numerapi.utils: target file already exists
2023-07-10 10:24:14,503 INFO numerapi.utils: download complete
2023-07-10 10:24:15,681 INFO numerapi.utils: target file already exists
2023-07-10 10:24:15,682 INFO numerapi.utils: download complete
2023-07-10 10:24:16,923 INFO numerapi.utils: target file already exists
2023-07-10 10:24:16,924 INFO numerapi.utils: download complete
2023-07-10 10:24:18,146 INFO numerapi.utils: target file already exists
2023-07-10 10:24:18,147 INFO numerapi.utils: download complete
2023-07-10 10:24:19,226 INFO numerapi.utils: target file already exists
2023-07-10 10:24:19,227 INFO numerapi.utils: download complete


0



In [4]:
# The int8 data can contain pd.NA, which the models do not handle well, so
# missing feature values are replaced by the per-feature median.
print("cleaning up NAs")
feature_medians = all_data[features].median(skipna=True)
# cast back to int8 after filling -- switch to float32 when using the non-int8 data!
all_data[features] = all_data[features].fillna(feature_medians).astype("int8")

# Live data is a single era, so it is filled with medians taken over all eras
# (recomputed from all_data after the fill/cast above).
live_fill_values = all_data[features].median(skipna=True)
# same cast as above -- switch to float32 when using the non-int8 data!
live_data[features] = live_data[features].fillna(live_fill_values).astype("int8")
# An alternative would be converting the NA columns to floats and replacing pd.NA with np.nan.

cleaning up NAs


: 

: 

# Model training and prediction

In [None]:
# Label for this hyper-parameter set; it is embedded in prediction column / model names.
params_name = "lg_lgbm"

# Keyword arguments handed to LGBMRegressor(**params).
params = dict(
    n_estimators=20000,
    learning_rate=0.001,
    max_depth=6,
    num_leaves=2**6,
    colsample_bytree=0.1,
)

# Target columns to model; one pair of models is trained per entry.
targets = [
    "target_nomi_v4_20",
    "target_jerome_v4_60",
    "target_ralph_v4_20",
    "target_tyler_v4_20",
    "target_victor_v4_20",
    "target_waldo_v4_20",
]



In [None]:
def _load_or_train(model_name, candidate_index, target):
    """Return the saved model `model_name`, fitting and saving it first if absent.

    Args:
        model_name: name handed to `load_model` / `save_model`.
        candidate_index: row labels of `all_data` that may be used for fitting.
        target: name of the target column to fit against.

    Returns:
        The loaded model, or a freshly fitted `LGBMRegressor(**params)`.
    """
    print(f"Checking for existing model '{model_name}'")
    model = load_model(model_name)
    if not model:  # load_model's result is treated as falsy when nothing was saved
        print("model not found, creating new one")
        # some targets are missing for some rows: fit only where this target is present
        fit_index = all_data.loc[candidate_index, target].dropna().index
        model = LGBMRegressor(**params)
        model.fit(
            all_data.loc[fit_index, features],
            all_data.loc[fit_index, target],
        )
        # persist the model so the next run can skip training
        print(f"saving new model: {model_name}")
        save_model(model, model_name)
    return model


prediction_cols = []
for target in tqdm(targets):
    prediction_col = f"{params_name}_{version}_{feature_set_name}_{target}"

    # Model fitted on the training rows only; used to predict the validation rows.
    train_data_model_name = f"train_data_{prediction_col}"
    train_model = _load_or_train(train_data_model_name, training_index, target)
    all_data.loc[validation_index, prediction_col] = train_model.predict(
        all_data.loc[validation_index, features]
    )
    gc.collect()

    # Model fitted on all rows (train + validation); used to predict live data.
    all_data_model_name = f"all_data_{prediction_col}"
    all_data_model = _load_or_train(all_data_model_name, all_index, target)
    live_data[prediction_col] = all_data_model.predict(
        live_data[features].fillna(np.nan)
    )  # filling live data with nans makes us ignore those features if necessary
    gc.collect()

    prediction_cols.append(prediction_col)

In [None]:
def save_only_highest(
    current_accuracy,
    epoch,
    model_to_save=None,
    model_dir="../models/MoneyBrinterBRR",
):
    """Save a checkpoint only if it beats every checkpoint already in `model_dir`.

    Checkpoints are files named ``acc:<accuracy>| epoch:<epoch>`` inside
    `model_dir`. When `current_accuracy` is strictly greater than the accuracy
    encoded in every existing checkpoint name (or none exist), the model's
    ``state_dict()`` is written with `torch.save` and the superseded
    checkpoints are deleted. Files that do not follow the naming scheme are
    left untouched.

    Args:
        current_accuracy: accuracy of the model being considered.
        epoch: epoch number, recorded in the checkpoint file name.
        model_to_save: object exposing ``state_dict()``. When omitted, the
            notebook-level variable ``model`` is used.
        model_dir: directory holding the checkpoints; created if missing.

    Raises:
        NameError: if no model is passed and no global ``model`` exists.
    """
    checkpoint_dir = Path(model_dir)
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    # Map every existing checkpoint to the accuracy encoded in its file name.
    name_prefix = "acc:"
    existing_checkpoints = {}
    for entry in checkpoint_dir.iterdir():
        if not entry.name.startswith(name_prefix):
            continue
        # the accuracy sits between the "acc:" prefix and the "|" separator
        accuracy_text = entry.name[len(name_prefix):].split("|", 1)[0]
        try:
            existing_checkpoints[entry] = float(accuracy_text)
        except ValueError:
            continue  # not a checkpoint written by this function

    # An empty directory means any accuracy is a new best.
    # NOTE: stored accuracies are rounded to two decimals by the file name.
    highest_accuracy = max(existing_checkpoints.values(), default=float("-inf"))

    if current_accuracy > highest_accuracy:
        if model_to_save is None:
            model_to_save = globals().get("model")
            if model_to_save is None:
                raise NameError(
                    "save_only_highest: pass model_to_save or define a global 'model'"
                )

        # NOTE(review): ':' and '|' are not valid in Windows file names.
        model_path = checkpoint_dir / f"acc:{current_accuracy:.2f}| epoch:{epoch}"
        torch.save(model_to_save.state_dict(), model_path)

        # Drop the checkpoints this one supersedes (never the file just written).
        for stale_path in existing_checkpoints:
            if stale_path != model_path:
                stale_path.unlink()
        print(f"Saved the highest accuracy model to {model_path}, with accuracy {current_accuracy:.2f}")
        