## Imports

In [1]:
import warnings
warnings.filterwarnings('ignore')

import os, gc, random, time

import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
from sklearn.model_selection import TimeSeriesSplit

from scipy.special import expit

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from catalyst import dl
from catalyst.callbacks.metrics import AUCCallback
from catalyst.contrib.nn.schedulers import ReduceLROnPlateau
from catalyst.utils.misc import set_global_seed
from catalyst.utils.tracing import load_traced_model


## Init params

In [2]:
seed        = 42
device      = 'cuda:0'
num_epochs  = 20
lr          = 0.005
lr_patience = 5
lr_factor   = 0.4
es_patience = 11
logdir      = './logdir/'
batch_size  = 8192

set_global_seed(seed)

## Prepare data

In [3]:
data = pd.read_csv('data/train.csv')
data = data.sort_values('ts_id')

data = data.query('weight > 0')
data = data.reset_index(drop=True)

fillna = data.min() - 1
data   = data.fillna(fillna)

data['target'] = expit(data['resp']*500)

test = data.iloc[-len(data)//20:].reset_index(drop=True)
data = data.iloc[:-len(data)//20].reset_index(drop=True)

In [4]:
features = [f'feature_{i}' for i in range(130)]

## Dataset

In [5]:
class MarketDataset:
    def __init__(self, df, features, target=None):
        self.features = df[features].values
        self.target = target
        
        if self.target is not None:
            self.target = df[self.target].values

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        data = dict(
            features = torch.FloatTensor(self.features[idx])
        )
        if self.target is not None:
            data['targets'] = torch.FloatTensor([self.target[idx]])
            data['labels'] = torch.FloatTensor([self.target[idx] > 0.5])
            
        return data

## Loaders

In [6]:
def get_loaders(dataframes, batch_size=batch_size):
    datasets = {
        key: MarketDataset(df, features, target='target')
        for key, df in dataframes.items()
    }
    loaders = dict()
    for key, value in datasets.items():
        shuffle = False
        if 'train' == key:
            shuffle = True
            
        loaders[key] = DataLoader(value, batch_size=batch_size, shuffle=shuffle)
    
    return loaders

## Model

In [7]:
class ResLinear(nn.Module):
    def __init__(
        self,
        in_features,
        out_features,
        p = 0.3,
    ):
        super().__init__()
        self.linear1 = nn.Sequential(
            nn.BatchNorm1d(in_features),
            nn.Dropout(p),
            nn.Linear(in_features, in_features),
            nn.LeakyReLU(),
        )
        self.linear2 = nn.Sequential(
            nn.BatchNorm1d(in_features),
            nn.Dropout(p),
            nn.Linear(in_features, out_features),
            nn.LeakyReLU(),
        )
    
    def forward(self, x):
        return self.linear2(x + self.linear1(x))


class SubModel(nn.Module):
    def __init__(
        self,
        features_size,
        hidden_size  = 64,
        dropout_rate = 0.3,
        n_layers     = 4,
    ):
        super().__init__()
        layers = []
        
        now_features = features_size
        new_features = hidden_size
        
        for _ in range(n_layers):
            layers.append(ResLinear(now_features, new_features))
            
            now_features = new_features
            new_features = max(8, new_features // 2)
        
        layers.append(nn.Linear(now_features, 1))
            
        self.layers = nn.Sequential(*layers)


    def forward(self, x):
        return self.layers(x)
    
    
class Model(nn.Module):
    def __init__(
        self,
        params,
    ):
        super().__init__()
        submodels = []
        for p in params:
            submodels.append(SubModel(**p))
        
        self.submodels = nn.ModuleList(submodels)
        self.linear    = nn.Linear(len(params), 1)
        
    def forward(self, x):
        result = {
            f'logits_{i}': submodel(x)
            for i, submodel in enumerate(self.submodels)
        }
        x = torch.cat([v for v in result.values()], dim=1)
        result['logits'] = self.linear(x)
        return result

class ModelTraced(Model):
    def forward(self, x):
        result = [submodel(x) for submodel in self.submodels]
        x = torch.cat(result, dim=1)
        return self.linear(x)

## Train

In [8]:
def get_callbacks(params):
    metrics = {
        f'loss_{i}':1/(2*len(params))
        for i in range(len(params))
    }
    metrics.update({'loss_last':1/2})
    
    callbacks = [
        dl.EarlyStoppingCallback(
            patience = es_patience,
            metric   = 'auc',
            minimize = False,
        ),
        AUCCallback(input_key='labels'),
    ]

    callbacks.extend([
        dl.CriterionCallback(
            input_key     = 'targets',
            output_key    = f'logits_{i}',
            criterion_key = 'bce',
            prefix        = f'loss_{i}',
        )
        for i in range(len(params))
    ])

    callbacks.extend([
        dl.CriterionCallback(
            input_key     = 'targets',
            output_key    = 'logits',
            criterion_key = 'bce',
            prefix        = 'loss_last',
        ),
        dl.MetricAggregationCallback(
            prefix  = "loss",
            metrics = metrics,
            mode    = 'weighted_sum',
        ),
    ])
    return callbacks

def get_cosc(model, params):
    criterion = dict(
        bce = torch.nn.BCEWithLogitsLoss(),
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = ReduceLROnPlateau(
        optimizer = optimizer,
        mode      = 'max',
        patience  = lr_patience,
        factor    = lr_factor,
    )
    
    callbacks = get_callbacks(params)
    return criterion, optimizer, scheduler, callbacks

def train_model(params, dataframes, postfix_logdir):
    model        = Model(params)
    loaders      = get_loaders(dataframes)
    
    criterion, optimizer, scheduler, callbacks = get_cosc(model, params)
    
    runner = dl.SupervisedRunner(device='cuda:0', output_key=None)
    try:
        runner.train(
            model            = model,
            criterion        = criterion,
            optimizer        = optimizer,
            scheduler        = scheduler,
            loaders          = loaders,
            logdir           = logdir + postfix_logdir,
            num_epochs       = num_epochs,
            callbacks        = callbacks,
            main_metric      = 'auc',
            minimize_metric  = False,
            load_best_on_end = True,
            verbose          = False,
        )
    except KeyboardInterrupt:
        pass
    
    return model

## Score

In [9]:
def utility_score_bincount(date, weight, resp, action):
    count_i = len(np.unique(date))
    Pi = np.bincount(date, weight * resp * action)
    t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i)
    u = np.clip(t, 0, 6) * np.sum(Pi)
    return u

def scoring(dataframe, pred):
    date   = dataframe['date'].values
    weight = dataframe['weight'].values
    resp   = dataframe['resp'].values
    
    return utility_score_bincount(
            date,
            weight,
            resp,
            np.where(pred >= 0.5, 1, 0).astype(int)
        )

def score(model, dataframe):
    loader = get_loaders({'valid':dataframe})['valid']
    
    runner = dl.SupervisedRunner(device='cuda:0', output_key=None)
    predict = runner.predict_loader(
        model  = model,
        loader = loader,
    )
    
    pred = torch.cat([p['logits'].sigmoid().cpu() for p in predict])
    pred = pred.numpy().squeeze()
    
    return scoring(dataframe, pred)

## Train Models

In [10]:
!rm -rf {logdir}

In [11]:
params =  [
    {'features_size': 130, 'n_layers': 3},
    {'features_size': 130, 'n_layers': 4},
    {'features_size': 130, 'n_layers': 5},
]

In [12]:
models = []
scores = []

N_SPLITS = 6

tscv = TimeSeriesSplit(n_splits=N_SPLITS)
for i, (train_index, valid_index) in tqdm(enumerate(tscv.split(data.index)), total=N_SPLITS):
    
    dataframes = dict(
        train = data.loc[train_index],
        valid = data.loc[valid_index],
    )
    fold_model  = train_model(params, dataframes, f'fold_{i}')
    valid_score = score(fold_model, data.loc[valid_index])
    test_score  = score(fold_model, test)
    
    models.append(fold_model)
    scores.append((valid_score, test_score))

  0%|          | 0/6 [00:00<?, ?it/s]

[2021-02-22 18:29:44,049] 
1/20 * Epoch 1 (_base): lr=0.0050 | momentum=0.9000
1/20 * Epoch 1 (train): auc=0.5033 | loss=0.6965 | loss_0=0.6937 | loss_1=0.6955 | loss_2=0.7075 | loss_last=0.6941
1/20 * Epoch 1 (valid): auc=0.5066 | loss=0.6939 | loss_0=0.6934 | loss_1=0.6955 | loss_2=0.6940 | loss_last=0.6936
[2021-02-22 18:30:02,220] 
2/20 * Epoch 2 (_base): lr=0.0050 | momentum=0.9000
2/20 * Epoch 2 (train): auc=0.5138 | loss=0.6929 | loss_0=0.6925 | loss_1=0.6939 | loss_2=0.6935 | loss_last=0.6926
2/20 * Epoch 2 (valid): auc=0.5150 | loss=0.6931 | loss_0=0.6929 | loss_1=0.6938 | loss_2=0.6935 | loss_last=0.6928
[2021-02-22 18:30:20,030] 
3/20 * Epoch 3 (_base): lr=0.0050 | momentum=0.9000
3/20 * Epoch 3 (train): auc=0.5307 | loss=0.6919 | loss_0=0.6917 | loss_1=0.6934 | loss_2=0.6917 | loss_last=0.6915
3/20 * Epoch 3 (valid): auc=0.5199 | loss=0.6931 | loss_0=0.6926 | loss_1=0.6933 | loss_2=0.6944 | loss_last=0.6927
[2021-02-22 18:30:37,845] 
4/20 * Epoch 4 (_base): lr=0.0050 | mome

In [13]:
list(zip(*scores))[1]

(1397.549606904977,
 1262.8097925495044,
 1321.7008507083735,
 1129.4172823292492,
 1509.1101521204625,
 1497.360975625355)

In [14]:
class BlendModel(nn.Module):
    def __init__(self, models, weights):
        super().__init__()
        self.models = nn.ModuleList(models)
        self.weights = weights
    
    def forward(self, x):
        result = 0
        for model, w in zip(self.models, self.weights):
            result += w * model(x)['logits']
        
        return result

In [15]:
weights = np.array([i**2 for i in range(1, N_SPLITS)])
weights = weights/weights.sum()
weights

array([0.01818182, 0.07272727, 0.16363636, 0.29090909, 0.45454545])

In [16]:
full_model = BlendModel(models, weights)

In [17]:
def scoring(dataframe, pred):
    date   = dataframe['date'].values
    weight = dataframe['weight'].values
    resp   = dataframe['resp'].values
    
    best_treshold = None
    best_score    = -1e10
    for treshold in np.linspace(pred.min(), pred.max(), 100):
        score = utility_score_bincount(
            date,
            weight,
            resp,
            np.where(pred >= treshold, 1, 0).astype(int)
        )
        if score > best_score:
            best_treshold = treshold
            best_score    = score
            
    return best_treshold, best_score

def score(model, dataframe):
    loader = get_loaders({'valid':dataframe})['valid']
    
    runner = dl.SupervisedRunner(device='cuda:0', output_key=None)
    predict = runner.predict_loader(
        model  = model,
        loader = loader,
    )
    
    pred = torch.cat([p.sigmoid().cpu() for p in predict])
    pred = pred.numpy().squeeze()
    
    return scoring(dataframe, pred)

In [18]:
score(full_model, test)

(0.4950006562049942, 1514.1471737043144)

In [19]:
runner = dl.SupervisedRunner(device='cuda:0', output_key=None)
loaders = get_loaders({'train':data.loc[train_index]})
batch = next(iter(loaders["train"]))

runner.trace(
    model  = full_model,
    batch  = batch, 
    logdir = logdir,
)

model_traced = load_traced_model(
    f"{logdir}/trace/traced-forward.pth", 
    device=device,
)

In [21]:
score(full_model, data.query('date >= 450').reset_index(drop=True))

(0.527714569460262, 510.82989249216945)

In [22]:
score(full_model, pd.concat([data.query('date >= 450').reset_index(drop=True), test]).reset_index())

(0.5028380646248056, 1975.5903136660158)