### install packages

In [1]:
!pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html

Looking in links: https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html


In [3]:
!pip install transformers



In [5]:
import torch
import torch.nn as nn
import random
import time
import math

import pandas as pd
import numpy as np
import collections
from collections import Counter

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
import sklearn.metrics
from sklearn.metrics import f1_score

from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.optim import Optimizer
import torch.nn.functional as F

#from apex.fp16_utils import *
#from apex import amp, optimizers
#from apex.multi_tensor_apply import multi_tensor_applier

In [6]:
print(torch.__version__)

1.2.0.dev20190805


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


print(f'random seeds initialized, working on {device}')

random seeds initialized, working on cuda


In [8]:
#getting the clinical biobert tokenizer
tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [9]:
#loads result dataframe or initializes an empty one
try:
  results_df = pd.read_csv('/content/drive/My Drive/ML_data/results.csv', index_col = 0)
except:
  results_df = pd.DataFrame(columns = ['experiment description', 'num samples', 'weighting', 'f1w', 'acc', 'auroc', 'ppv', 'sens', 'batch size'])

results_df.index = list(range(len(results_df)))   #to clean the index 
results_df

Unnamed: 0,experiment description,num samples,weighting,f1w,acc,auroc,ppv,sens,batch size
0,tiny test dataset,1000,,0.831008,0.885,0.558339,0.0,,32
1,"larger test dataset, only subj notes",10000,,0.854283,0.8995,0.780036,0.01,0.4,32
2,"larger test dataset, only subj notes",50000,,0.876438,0.8969,0.795039,0.178138,0.44557,32
3,"larger test dataset, only subj notes",100000,,0.87697,0.901,0.807763,0.155172,0.493548,32
4,"50k repr samples, both nlp fields",50000,,0.881214,0.8954,0.798594,0.237219,0.43609,16
5,"weighted loss, both nlp fields",50000,"{'discharge': 8, 'admit': 1}",0.865623,0.9012,0.808986,0.070281,0.530303,16
6,"first try lamb, both nlp fields",10000,,0.876643,0.895,0.788044,0.2,0.444444,16
7,"mixed prec, adults only, lamb",100000,,0.877511,0.89,0.82424,0.27619,0.460317,32
8,"mixed prec, bch only, lamb",102128,,0.885827,0.900519,0.802072,0.23323,0.452906,64
9,"mixed prec, bch only, lamb",50000,,0.859869,0.905,0.5,0.0,,16


### define helper functions

In [10]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return sklearn.metrics.accuracy_score(labels_flat, preds_flat)

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [11]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [12]:
def train_model_bak(model, dataloader_train, dataloader_valid, save_name, loss_fn = 'default', optim = 'default', TPU = False, lr = 1e-3, eps = 1e-8, epochs = 3, fp_16 = False):
  
  if fp_16 == False:
    model.to(device)

  if optim == 'default':
    print ('optimizing with AdamW')
    optimizer = AdamW(model.parameters(), lr=lr, eps=eps, weight_decay=0.01)
  else:
    print ('optimizing with ', str(optim))
    optimizer = optim

  if fp_16 == True:
    model, optimizer = amp.initialize(model.to(device), optimizer)
  
  scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)
  
  for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        if loss_fn != 'default':
          loss = loss_fn(outputs[1], batch[2])
        else:
          loss = outputs[0]

        loss_train_total += loss.item()
        
        if fp_16:
          with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        else:
          loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        if TPU:
          xm.optimizer_step(optimizer, barrier=True)
        else:
          optimizer.step()

        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'/content/{save_name}_BERT_epoch_{epoch}.model')
    if epochs == epochs + 1:
      torch.save(model.state_dict(), f'/content/drive/My Drive/ML_data/models/{save_name}_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    val_acc = accuracy_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Validation accuracy: {val_acc}')


In [28]:
def train_model(model, dataloader_train, dataloader_valid, save_name, lr = 1e-5, eps = 1e-8, epochs = 3):
  
  model.to(device)

  optimizer = AdamW(model.parameters(), lr=lr, eps=eps, weight_decay=0.01)
    
  scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)
  
  for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       


        with autocast():
          outputs = model(**inputs)

        #loss = loss_fn(output, target)
        #outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        #optimizer.step()
        #with amp.scale_loss(loss, optimizer) as scaled_loss:
        #    scaled_loss.backward()
        
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'finetuned_BERT_{save_name}_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    val_acc = accuracy_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Validation accuracy: {val_acc}')


In [14]:
def get_metrics(dataloader): #(predictions, true_vals, dataloader):

  _, predictions, true_vals = evaluate(dataloader)

  preds = [np.argmax(pred) for pred in predictions]
  preds_flat = np.argmax(preds).flatten()
  true_vals = true_vals.flatten()

  f1_w = sklearn.metrics.f1_score(true_vals, preds, average='weighted')
  f1 = sklearn.metrics.f1_score(true_vals, preds, average=None)
  acc = sklearn.metrics.accuracy_score(true_vals, preds)
  prec = sklearn.metrics.precision_score(true_vals,preds, average=None) 
  rec = sklearn.metrics.recall_score(true_vals,preds, average=None)
  auroc = sklearn.metrics.roc_auc_score(true_vals,predictions[:,1], average=None)
  confusion = sklearn.metrics.confusion_matrix(true_vals, preds)

  tn, fn, fp, tp = confusion[0,0], confusion[0,1], confusion[1,0], confusion[1,1]

  sens = tp/(tp + fn)
  spec = tn/(tn + fp)
  ppv = tp/(tp + fp)
  npv = tn/(tn + fn)

  print ('Metrics Report:')
  print ('---------------')
  print ('weighted f1: ', f1_w)
  print ('AUROC:       ',auroc)
  print ('accuracy:    ', acc)
  print ('precision:   ', prec)
  print ('recall:      ', rec)
  print ('sensitivity: ', sens)
  print ('specificity: ', spec)
  print ('PPV:         ', ppv)
  print ('NPV:         ', npv)
  print ()
  print ('confusion matrix')
  print (confusion)

  results_df.loc[len(results_df)] = [desc,num_samples, weights, f1_w, acc, auroc, ppv, sens, batch_size]

In [15]:
def encode_data(df, text_field):
  encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'][text_field].values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
    )

  encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'][text_field].values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
    )


  input_ids_train = encoded_data_train['input_ids']
  attention_masks_train = encoded_data_train['attention_mask']
  labels_train = torch.tensor(df[df.data_type=='train'].label.values)

  input_ids_val = encoded_data_val['input_ids']
  attention_masks_val = encoded_data_val['attention_mask']
  labels_val = torch.tensor(df[df.data_type=='val'].label.values)

  return input_ids_train, attention_masks_train, labels_train, input_ids_val, attention_masks_val, labels_val



def create_dataloaders(input_ids_train, attention_masks_train, labels_train, input_ids_val, attention_masks_val, labels_val, batch_size = 32):
  dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
  dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

  dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

  dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)
  
  return dataloader_train, dataloader_validation

def process_data(data):
  df = data.sample(num_samples)
  df.discharge.value_counts()
  label_dict = {'discharge':0, 'admit':1}; label_dict
  df['label'] = df.discharge.replace(label_dict)
  df['text'] = df['CleanSubjectiveNotes'].map(str) + ', ' + df['pmhx'].map(str)
  df = df[['discharge', 'label', 'text']]
  display(df.head())
  return df, label_dict


def split_data(df, test_size=0.1):
  X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=test_size, 
                                                  random_state=17, 
                                                  stratify=df.label.values)
  df['data_type'] = ['not_set']*df.shape[0]
  df.loc[X_train, 'data_type'] = 'train'
  df.loc[X_val, 'data_type'] = 'val'
  display(df.groupby(['discharge', 'label', 'data_type']).count())
  return X_train, X_val, y_train, y_val

In [16]:
class Lamb(Optimizer):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6,
                 weight_decay=0.01, adam=False):   # I changed wd default from 0
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay)
        self.adam = adam
        super(Lamb, self).__init__(params, defaults)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decay the first and second moment running average coefficient
                # m_t
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                # v_t
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)

                # Paper v3 does not use debiasing.
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                # Apply bias to lr to avoid broadcast.
                step_size = group['lr']  * math.sqrt(bias_correction2) / bias_correction1

                weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10)

                adam_step = exp_avg / exp_avg_sq.sqrt().add(group['eps'])
                if group['weight_decay'] != 0:
                    adam_step.add_(group['weight_decay'], p.data)

                adam_norm = adam_step.pow(2).sum().sqrt()
                if weight_norm == 0 or adam_norm == 0:
                    trust_ratio = 1
                else:
                    trust_ratio = weight_norm / adam_norm
                state['weight_norm'] = weight_norm
                state['adam_norm'] = adam_norm
                state['trust_ratio'] = trust_ratio
                if self.adam:
                    trust_ratio = 1

                p.data.add_(-step_size * trust_ratio, adam_step)

        return loss


In [17]:

ALPHA = 0.8
GAMMA = 2

class FocalLoss(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(FocalLoss, self).__init__()

    def forward(self, inputs, targets, alpha=ALPHA, gamma=GAMMA, smooth=1):
        
        #comment out if your model contains a sigmoid or equivalent activation layer
        inputs = torch.sigmoid(inputs)       
        
        #flatten label and prediction tensors
        inputs = inputs.view(-1)
        targets = targets.view(-1)
        
        #first compute binary cross-entropy 
        BCE = F.binary_cross_entropy(inputs, targets, reduction='mean')
        BCE_EXP = torch.exp(-BCE)
        focal_loss = alpha * (1-BCE_EXP)**gamma * BCE
                       
        return focal_loss

In [18]:

ALPHA = 0.8
GAMMA = 2

class FocalLoss2(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(FocalLoss2, self).__init__()

    def forward(self, inputs, targets, alpha=ALPHA, gamma=GAMMA, smooth=1):
        
        #comment out if your model contains a sigmoid or equivalent activation layer
        #inputs = torch.sigmoid(inputs)       
        
        #flatten label and prediction tensors
        #inputs = inputs.view(-1)
        #targets = targets.view(-1)
        
        #first compute binary cross-entropy 
        BCE = F.binary_cross_entropy_with_logits(inputs, targets, reduction='mean')
        BCE_EXP = torch.exp(-BCE)
        focal_loss = alpha * (1-BCE_EXP)**gamma * BCE
                       
        return focal_loss



In [19]:
class JJFocalLoss(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(JJFocalLoss, self).__init__()

    def forward(self, inputs, targets, alpha=0.8, gamma=2, smooth=1):
        
        ce_loss = torch.nn.functional.cross_entropy(inputs, targets, reduction='none') # important to add reduction='none' to keep per-batch-item loss
        pt = torch.exp(-ce_loss)
        focal_loss = (alpha * (1-pt)**gamma * ce_loss).mean()
                       
        return focal_loss

# import the dataframe

In [20]:
data = pd.read_csv  ('/content/drive/My Drive/ML_data/complete_clean_combo_data.csv', index_col = None, low_memory = False)
#data = pd.read_csv('/content/drive/My Drive/ML_data/data_1000.csv', index_col = None)

In [21]:
weights = None
num_samples = 10000
desc = 'restart adding back mixed prec'
batch_size = 16

### Exploratory Data Analysis and Preprocessing

In [22]:
df, label_dict = process_data(data)

Unnamed: 0,discharge,label,text
102395,discharge,0,rectal bleeding x 4 days. bowel movement x 5 t...
112374,discharge,0,unable to void x 4hours and with lower abdo pa...
111136,discharge,0,"right lower leg pain,calf pain and knee pain f..."
140645,discharge,0,states felt itchy in the left eye this afterno...
117690,discharge,0,mom states that patient started summer camp on...


### Training/Validation Split

In [23]:
X_train, X_val, y_train, y_val = split_data(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
discharge,label,data_type,Unnamed: 3_level_1
admit,1,train,900
admit,1,val,100
discharge,0,train,8100
discharge,0,val,900


### Loading Tokenizer and Encoding our Data

In [24]:
input_ids_train, attention_masks_train, labels_train, input_ids_val, attention_masks_val, labels_val = encode_data(df, 'text')

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.


In [25]:
dataloader_train, dataloader_validation = create_dataloaders(input_ids_train, attention_masks_train, labels_train, input_ids_val, attention_masks_val, labels_val,batch_size)

In [26]:
model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)



Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model

In [73]:
train_model(model, dataloader_train, dataloader_validation, 'restart with fp16')

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=563.0, style=ProgressStyle(description_widt…

RuntimeError: ignored

In [None]:
get_metrics(dataloader_validation)

Metrics Report:
---------------
weighted f1:  0.8667957310722557
AUROC:        0.783511111111111
accuracy:     0.893
precision:    [0.90918473 0.38709677]
recall:       [0.97888889 0.12      ]
sensitivity:  0.3870967741935484
specificity:  0.9091847265221878
PPV:          0.12
NPV:          0.9788888888888889

confusion matrix
[[881  19]
 [ 88  12]]


In [None]:
results_df.to_csv('/content/drive/My Drive/ML_data/results.csv')

In [29]:
#import torch.cuda.amp
#from torch.cuda.amp import autocast

train_model(model, dataloader_train, dataloader_validation, 'restart with torch.amp')

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=563.0, style=ProgressStyle(description_widt…




NameError: ignored

In [None]:
weights = None
num_samples = 50000
desc = 'clean start, mixed prec, labelsmooth, wd, TPU'
batch_size = 16

In [None]:
train_model(model, dataloader_train, dataloader_validation, 'clean_start_3', fp_16 = True, loss_fn=loss_fn)

optimizing with AdamW
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=2813.0, style=ProgressStyle(description_wid…




RuntimeError: ignored

## Notes to self:

- first step - adamw optimizer, standard precision
- second step - adamw optimizer, mixed precision
- third step - adamw optimizer, label smoothing
- 4th step - adamw optimzer, mixup?
- 5th step - try lamb
- 6th step - try weighted loss
- 7th step - better cleaned data
- 8th step - agumentation of minor class