In [1]:
from data_loading import load_data
from sklearn.model_selection import KFold
import neptune
import torch
import numpy as np
import random
import os
import pandas as pd
import math
from IPython.display import display, Javascript
import pytorch_lightning as pl
import torch.nn.functional as F
from pytorch_lightning import Trainer, seed_everything
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold



In [2]:
neptune.init('iliaavilov/MOA-prediction')

Project(iliaavilov/MOA-prediction)

In [3]:
random_state = 999
def set_seed(seed = 999) -> None:
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Загрузка данных

In [4]:
train_targets, test_features, train_features = load_data(pca_features = None)

In [5]:
y = train_targets.values[:10000]
X = train_features.values[:10000]

# Валидация

In [6]:
mlsk = MultilabelStratifiedKFold(n_splits = 5, shuffle = True, random_state = random_state)
cv = list(mlsk.split(X, y))



In [7]:
device = torch.device('cuda:0')

In [8]:
y = torch.from_numpy(y).float().to(device)
X = torch.from_numpy(X).float().to(device)

# Модель

## Model class

In [9]:
class simple_torchpl(pl.LightningModule):
    
    def __init__(self, n_in, n_h, n_out, fold_num):
        super(simple_torchpl, self).__init__()
        self.drop1 = torch.nn.Dropout(0.5)
        self.bn1 = torch.nn.BatchNorm1d(n_in)
        self.fc1 = torch.nn.Linear(n_in, n_h)
        self.hardswish = torch.nn.Hardswish()
        self.bn2 = torch.nn.BatchNorm1d(n_h)
        self.drop2 = torch.nn.Dropout(0.4)
        self.fc2 = torch.nn.Linear(n_h, n_h)
        self.fc3 = torch.nn.Linear(n_h, n_out)
        self.sigm = torch.nn.Sigmoid()
        
        self.fold_num = fold_num
        
        self.train_score = []
        self.val_score = []
        self.train_score_targets = []
        self.val_score_targets = []

    def forward(self, x):
        x = self.drop1(x)
        x = self.bn1(x)
        x = self.fc1(x)
        x = self.hardswish(x)
        x = self.bn2(x)
        x = self.drop2(x)
        x = self.fc2(x)
        x = self.hardswish(x)
        x = self.drop2(x)
        x = self.fc2(x)
        x = self.hardswish(x)
        x = self.fc3(x)
        out = self.sigm(x)
        return out
    
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self.forward(x)
        train_loss = F.binary_cross_entropy(y_pred, y)
        self.train_score.append(F.binary_cross_entropy(y_pred[:, :206], y[:, :206]).item())
        for i in range(0, 206):
            self.train_score_targets.append(F.binary_cross_entropy(y_pred[:, i], y[:, i]).item())
        return(train_loss)
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self.forward(x)
        test_loss = F.binary_cross_entropy(y_pred, y)
        self.val_score.append(F.binary_cross_entropy(y_pred[:, :206], y[:, :206]).item())
        for i in range(0, 206):
            self.train_score_targets.append(F.binary_cross_entropy(y_pred[:, i], y[:, i]).item())
        return(test_loss)
    
    
    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), weight_decay = 0.7)

## Training

In [10]:
class nn_training():
    
    def __init__(self, X, y, cv, train_batch_size, epochs, neptune_logging = True):
        
        self.train_loss_folds = np.array([])
        self.test_loss_folds = np.array([])
        
        self.epochs = epochs
        self.neptune_logging = neptune_logging
        self.cv = cv
        self.X = X
        self.y = y
        self.train_batch_size = train_batch_size
    
    def data_loaders(self, fold):
        
        X_train = X[fold[0]]
        y_train = y[fold[0]]
        X_test = X[fold[1]]
        y_test = y[fold[1]]
        set_seed(seed = random_state)
        train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
        train_loader = torch.utils.data.DataLoader(train_dataset, 
                                                   batch_size = self.train_batch_size, shuffle = False)

        val_dataset = torch.utils.data.TensorDataset(X_test, y_test)
        val_loader = torch.utils.data.DataLoader(val_dataset, batch_size = y_test.shape[0], shuffle = False)
        
        return(train_loader, val_loader)
    
    def train(self):
        for fold, fold_num in zip(self.cv, list(range(0, len(self.cv)))):
            self.fold_train(fold, fold_num)
            if (fold_num+1 == len(self.cv)) & (self.neptune_logging == True):
                self.logging_stats()
    
    def fold_train(self, fold, fold_num):
        train_loader, val_loader = self.data_loaders( fold)
        set_seed(seed = random_state)
        l_model = simple_torchpl(X.shape[1], 1024, y.shape[1], fold_num)
        trainer = pl.Trainer(min_epochs = self.epochs, 
                             max_epochs = self.epochs,
                             progress_bar_refresh_rate = 0,
                             early_stop_callback = False,
                             num_sanity_val_steps = 0, 
                             gpus = 1)
    
        ## Тренирум модель
        trainer.fit(l_model, train_loader, val_loader)
        if self.neptune_logging == True:
            self.logging(l_model.val_score, l_model.train_score, 
                         l_model.val_score_targets, l_model.train_score_targets, 
                         train_loader, fold_num)
            
            
    def logging(self, test_score, train_score, test_score_target, train_score_target, train_loader, fold_num):
        global tt, pp
        test_score = np.array(test_score)
        train_score = np.array(train_score)
        test_score_target = np.array(test_score_target)
        train_score_target = np.array(train_score_target)
        train_score_target = np.reshape(train_score_target, 
                                        (-1, 
                                         206, 
                                         math.ceil(np.array([batch[0].shape[0] for batch in train_loader]).sum()/
                                                   self.train_batch_size)))
        tt = train_score_target
        train_score = np.reshape(train_score, 
                                 (-1, 
                                  math.ceil(np.array([batch[0].shape[0] for batch in train_loader]).sum()/
                                            self.train_batch_size)))
        pp = train_score
        weights = [batch[0].shape[0] for batch in train_loader]
        train_score = np.average(train_score, weights = weights, axis = 1)
        train_score_target = np.average(train_score, weights = weights, axis = 2)
        
        neptune.log_metric('train loss target {} fold {}'.format(fold_num), loss_train)
        neptune.log_metric('test loss target {} fold {}'.format(fold_num), loss_test)

        for loss_train, loss_test in zip(train_score, test_score):
            neptune.log_metric('train loss fold {}'.format(fold_num), loss_train)
            neptune.log_metric('test loss fold {}'.format(fold_num), loss_test)

        self.train_loss_folds = np.append(self.train_loss_folds, train_score)
        self.test_loss_folds = np.append(self.test_loss_folds, test_score)
        
    def logging_stats(self):
        train_loss_folds_mean = np.mean(self.train_loss_folds.reshape(len(self.cv), -1), axis = 0)
        test_loss_folds_mean = np.mean(self.test_loss_folds.reshape(len(self.cv), -1), axis = 0)

        train_loss_folds_var = np.var(self.train_loss_folds.reshape(len(self.cv), -1), axis = 0)
        test_loss_folds_var = np.var(self.test_loss_folds.reshape(len(self.cv), -1), axis = 0)

        for train_loss_mean, test_loss_mean, train_loss_var, test_loss_var in zip(train_loss_folds_mean, test_loss_folds_mean,
                                                                                  train_loss_folds_var, test_loss_folds_var):
            neptune.log_metric('train loss mean', train_loss_mean)
            neptune.log_metric('test loss mean', test_loss_mean)
            neptune.log_metric('train loss var', train_loss_var)
            neptune.log_metric('test loss var', test_loss_var)

        neptune.log_metric('var at min mean train', train_loss_folds_var[np.argmin(train_loss_folds_mean)])
        neptune.log_metric('var at min mean test', train_loss_folds_var[np.argmin(test_loss_folds_mean)])

In [11]:
training_nn = nn_training(X, y, cv, train_batch_size = 5048, epochs = 1000, neptune_logging = True)

In [12]:
display(Javascript('IPython.notebook.save_checkpoint();'))
neptune.create_experiment(upload_source_files=['Start.ipynb', 'data_loading.py'], 
                          description = 'MOA 260. Batch size 5048. One more layer',
                          tags = ['Multitarget strat. kfold'])
training_nn.train()
neptune.stop()

<IPython.core.display.Javascript object>

https://ui.neptune.ai/iliaavilov/MOA-prediction/e/MOA-278


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type        | Params
------------------------------------------
0 | drop1     | Dropout     | 0     
1 | bn1       | BatchNorm1d | 1 K   
2 | fc1       | Linear      | 900 K 
3 | hardswish | Hardswish   | 0     
4 | bn2       | BatchNorm1d | 2 K   
5 | drop2     | Dropout     | 0     
6 | fc2       | Linear      | 1 M   
7 | fc3       | Linear      | 623 K 
8 | sigm      | Sigmoid     | 0     
Saving latest checkpoint..


IndexError: tuple index out of range

In [None]:
train_score_target1[0][0]

In [None]:
aa = np.average(train_score_target1, weights = [1], axis = 2)

In [None]:
aa.shape