In [16]:
import os
import numpy as np
import pandas as pd
import random
from tqdm import tqdm, trange


import torch
import torch.nn as nn
import torch.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.nn import MSELoss

from sklearn.metrics import mean_squared_error

from purgedSplit import PurgedGroupTimeSeriesSplit
from earlystopping import EarlyStopping

from Resnet import Resnet

In [3]:
# Location of the training CSV, relative to the notebook's working directory.
DATA_PATH = "numerai_dataset_256/numerai_training_data.csv"

In [5]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU so
# the notebook still runs (slowly) on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 200  # upper bound on epochs per fold; early stopping may end a fold sooner
LR = 3e-4     # optimizer learning rate, also the lower bound of the cyclic schedule

# NOTE(review): the training cell reads WEIGHT_DECAY and CACHE_PATH, but neither
# was defined anywhere in this notebook, so a fresh "Restart & Run All" raised
# NameError. The values below are placeholders -- confirm before training.
WEIGHT_DECAY = 1e-2  # placeholder: same value as AdamW's library default
CACHE_PATH = './'    # placeholder: prefix for saved model weights (keep the trailing slash)

In [7]:
# Load the full training CSV into a DataFrame.
train_data = pd.read_csv(DATA_PATH)

# The frame is large: cast every float64 column down to float32 as a cheap
# way to cut its memory footprint.
train_data = train_data.astype(
    dict.fromkeys(train_data.select_dtypes(include='float64').columns, np.float32)
)

In [8]:
# Model inputs: every column whose name contains the substring 'feature'.
feat_cols = list(filter(lambda name: 'feature' in name, train_data.columns))

In [9]:
# Columns the model is trained to predict (kept as a list so that the
# reshape/len(target_cols) logic elsewhere works for one or many targets).
target_cols = ["target"]

In [10]:
# Custom map-style dataset - used in JS comp.
class MarketDataset:
    """Expose a frame's feature and target columns as per-row float tensors.

    Column selection relies on the module-level ``feat_cols`` and
    ``target_cols`` lists.
    """

    def __init__(self, df):
        """Pull the feature and label arrays out of ``df``."""
        n_targets = len(target_cols)
        self.features = df[feat_cols].values
        # Force the labels into a 2-D (rows, n_targets) layout.
        self.label = df[target_cols].values.reshape(-1, n_targets)

    def __len__(self):
        """Number of rows held by the dataset."""
        return len(self.label)

    def __getitem__(self, idx):
        """Return the row(s) at ``idx`` as a dict with 'features' and 'label' tensors."""
        row_features = torch.tensor(self.features[idx], dtype=torch.float)
        row_label = torch.tensor(self.label[idx], dtype=torch.float)
        return {'features': row_features, 'label': row_label}

In [11]:
# Derive a numeric era id: drop the first three characters of the era string
# and parse what remains as an integer.
train_data['erano'] = train_data['era'].str[3:].astype(int)

In [24]:
# Hold-out split by era number: earlier eras for training, later eras for testing.
# NOTE(review): both comparisons are strict, so rows with erano == 100 land in
# neither set -- confirm that dropping that era is intentional.
train_set = train_data.loc[train_data['erano'] < 100].reset_index(drop=True)
test_set = train_data.loc[train_data['erano'] > 100].reset_index(drop=True)

In [18]:
#Seed for reproducibility
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global RNG and PyTorch (CPU and all
    CUDA devices), and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running kernel is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels from run to run, which defeats the
    # deterministic flag above, so switch it off.
    torch.backends.cudnn.benchmark = False
seed_everything(seed=1111)

In [19]:
#training function
def train_fn(model, optimizer, eras, train_dataset, loss_fn, device,
             max_grad_norm=None, scheduler=None):
    """Run one training epoch, taking one optimizer step per era.

    Each era's rows are selected from ``train_dataset`` and fed to the model
    as a single batch. Feature and target columns come from the module-level
    ``feat_cols`` and ``target_cols`` lists.

    Args:
        model: Module to train; switched to train mode here.
        optimizer: Optimizer bound to ``model``'s parameters.
        eras: Sized iterable of era labels, visited in the given order.
        train_dataset: DataFrame with an ``era`` column plus the feature and
            target columns.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        device: Device the batches are moved to.
        max_grad_norm: If given, the gradient norm is clipped to this value
            between ``backward()`` and ``optimizer.step()``, the only place
            where clipping can influence the update. ``None`` (default)
            disables clipping.
        scheduler: Optional LR scheduler that is stepped after every optimizer
            step (i.e. once per era/batch). ``None`` (default) leaves
            scheduling to the caller.

    Returns:
        The unweighted mean of the per-era losses, as a Python float.
    """
    model.train()
    final_loss = 0

    for era in eras:
        df = train_dataset[train_dataset.era==era]
        X = torch.from_numpy(df[feat_cols].values).float().to(device)
        y = torch.from_numpy(df[target_cols].values).float().to(device)

        optimizer.zero_grad()
        outputs = model(X)
        loss = loss_fn(outputs,y)
        loss.backward()
        if max_grad_norm is not None:
            # Clip after the gradients exist and before they are consumed.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        final_loss += loss.item()
    final_loss/=len(eras)

    return final_loss
        

In [20]:
#inference function - outputs val_loss and epoch predictions for eval
def inference(model, eras, val_dataset, device,loss_fn=None):
    """Predict era by era and optionally report the mean per-era loss.

    Feature and target columns come from the module-level ``feat_cols`` and
    ``target_cols`` lists.

    Args:
        model: Module to evaluate; switched to eval mode here.
        eras: Sized iterable of era labels, visited in the given order.
        val_dataset: DataFrame with an ``era`` column and the feature columns;
            the target columns are only read when ``loss_fn`` is given.
        device: Device the batches are moved to.
        loss_fn: Optional callable ``loss_fn(outputs, targets)`` returning a
            scalar tensor. ``None`` (default) skips loss computation.

    Returns:
        Tuple ``(val_loss, preds)``. ``val_loss`` is the unweighted mean of the
        per-era losses, or ``None`` when no ``loss_fn`` was supplied. ``preds``
        is a NumPy array of shape ``(n_rows, len(target_cols))`` with the
        per-era predictions concatenated in the order of ``eras``.
    """
    model.eval()
    # Test identity, not truthiness: a callable that defines __len__/__bool__
    # could evaluate as False and silently disable the loss.
    compute_loss = loss_fn is not None
    preds = []
    val_loss = 0

    for era in eras:
        df = val_dataset[val_dataset.era==era]
        X = torch.from_numpy(df[feat_cols].values).float().to(device)
        with torch.no_grad():
            outputs = model(X)
            if compute_loss:
                y = torch.from_numpy(df[target_cols].values).float().to(device)
                val_loss += loss_fn(outputs,y).item()

        preds.append(outputs.detach().cpu().numpy())

    preds = np.concatenate(preds).reshape(-1,len(target_cols))
    val_loss = val_loss/len(eras) if compute_loss else None

    return val_loss, preds

In [25]:
# Purged group time-series split - prevents leakage from the training
# set into the validation set.
# 5 splits with an "embargo"/group gap of 20; rows are grouped by era number.
gkf = PurgedGroupTimeSeriesSplit(group_gap=20, n_splits=5)
splits = [
    *gkf.split(
        train_set['target'],
        groups=train_set['erano'].values,
    )
]

In [None]:
#train loop
#todo - refactor this to work for new architecture
# For every CV fold: build a fresh model/optimizer/scheduler, then train era by
# era until EPOCHS is reached or early stopping fires on the validation loss.
for _fold, (tr,te) in enumerate(splits):
    print(f'Fold: {_fold}')
    # Different seed per fold, but reproducible for a given fold.
    seed_everything(seed=1111+_fold)

    # The list of five floats is passed straight to Resnet's third positional
    # argument -- presumably per-block dropout rates; confirm against Resnet.
    model = Resnet(len(feat_cols),len(target_cols),[0.19856,0.23423,0.15234,0.18923,0.20213])
    model = model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(),lr=LR,weight_decay=WEIGHT_DECAY)
    # NOTE(review): CyclicLR is documented to be stepped after every batch and
    # its default step_size_up is 2000 steps. Stepping it once per epoch (below)
    # means the LR barely leaves base_lr within EPOCHS -- confirm this is intended.
    scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer,base_lr=LR,max_lr=3e-2,cycle_momentum=False)
    loss_fn = nn.MSELoss()

    model_weights = f'{CACHE_PATH}online_model_{_fold}_v2.pkl'
    es = EarlyStopping(patience=10,mode='min')

    # The splitter's indices are presumably positional (sklearn-style), so
    # select by position. On train_set's freshly reset RangeIndex this picks
    # exactly the same rows as label-based .loc, but does not silently break if
    # the index ever stops being 0..n-1.
    train_dataset = train_set.iloc[tr]
    valid_dataset = train_set.iloc[te]
    train_eras = train_dataset.era.unique()
    valid_eras = valid_dataset.era.unique()
    # Era visiting order is shuffled once per fold (not once per epoch).
    np.random.shuffle(train_eras)
    np.random.shuffle(valid_eras)

    progress = trange(EPOCHS)
    for epoch in progress:
        train_loss = train_fn(model,optimizer,train_eras,train_dataset,loss_fn,device)
        scheduler.step()
        valid_loss, valid_preds = inference(model,valid_eras,valid_dataset,device,loss_fn)

        # NOTE: the clip_grad_norm_(model.parameters(), 5) call that used to sit
        # here was removed. It ran after optimizer.step() and after validation,
        # and the gradients are zeroed before the next backward pass, so it never
        # influenced an update. To actually clip at norm 5, clip between
        # loss.backward() and optimizer.step() inside the training step.

        # EarlyStopping is called with the weights path, so it presumably
        # checkpoints the best model there -- confirm against earlystopping.py.
        es(valid_loss,model,model_path=model_weights)
        if es.early_stop:
            print('Early stopping')
            break
        progress.set_description('Train loss {} Valid loss {}'.format(train_loss,valid_loss))