<a href="https://colab.research.google.com/github/NazarioR9/BNBR_Challenge/blob/master/notebooks/MLMRobertaBaseGenericModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Specs

In [None]:
!nvidia-smi

Sun Jul  5 13:21:58 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8    28W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Install

In [None]:
!pip install transformers catalyst --quiet

In [None]:
!pip install torchcontrib --quiet

# Imports

In [None]:
import warnings
warnings.simplefilter('ignore')

In [None]:
import os, sys, gc
import time
import string
import subprocess
import numpy as np
import pandas as pd

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import log_loss, f1_score, accuracy_score
from sklearn.metrics import classification_report

In [None]:
import torch
import torch.nn as nn
from torch.optim import Adam
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [None]:
import transformers
from transformers import AutoConfig, AutoTokenizer, BartConfig, RobertaTokenizer

In [None]:
! cp -r ./../bnbr/ bnbr

In [None]:
sys.path.append('./../bnbr')

In [None]:
import bnbr
from bnbr.activation import *
from bnbr.data import *
from bnbr.models import *
from bnbr.loss import *
from bnbr.utils import *

Using TensorFlow backend.


# Envs

In [None]:
!unzip ./../mlm_finetuned_models/mlm_roberta_base_

In [None]:
seed = 2020
seeds = [42, 5, 12]
n_folds = 10
model_name = 'roberta-base'
model_path = 'fms/'
dirs = ['models/', f'evals/{model_name}']
exts = ['.csv', '.pkl']

In [None]:
wm = WorkplaceManager(seed, dirs, exts)

Set seed to 2020.
Deleted models/ evals/roberta-base
Deleted .csv .pkl
Created models/ evals/roberta-base


# Tokenizer

In [None]:
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Data

In [None]:
path = './../data/'

In [None]:
os.listdir(path)

In [None]:
train = pd.read_csv(path+'final_train.csv')
test = pd.read_csv(path+'final_test.csv')
sample = pd.read_csv(path+'SampleSubmission.csv')

In [None]:
train.shape

(597, 7)

In [None]:
train.head()

Unnamed: 0,ID,text,label,Depression,Alcohol,Suicide,Drugs
0,SUAVK39Z,i feel that it was better i die am happy,0,1,0,0,0
1,9JDAGUV3,why do i get hallucinations ?,3,0,0,0,1
2,419WR1LQ,i am stressed due to lack of financial suppor...,0,1,0,0,0
3,6UY7DX6Q,why is life important ?,2,0,0,1,0
4,FYC0FTFB,how could i be helped to go through the depre...,0,1,0,0,0


In [None]:
train = train[['text', 'Depression', 'Alcohol', 'Suicide', 'Drugs', 'label']]

In [None]:
train['length'] = train['text'].apply(lambda x: len(tokenizer.encode(x)))
test['length'] = test['text'].apply(lambda x: len(tokenizer.encode(x)))

In [None]:
train.head()

Unnamed: 0,text,Depression,Alcohol,Suicide,Drugs,label,length
0,i feel that it was better i die am happy,1,0,0,0,0,12
1,why do i get hallucinations ?,0,0,0,1,3,9
2,i am stressed due to lack of financial suppor...,1,0,0,0,0,13
3,why is life important ?,0,0,1,0,2,8
4,how could i be helped to go through the depre...,1,0,0,0,0,14


In [None]:
train.label.value_counts()

0    344
1    132
2     65
3     56
Name: label, dtype: int64

# Split in folds

In [None]:
train.head()

Unnamed: 0,text,Depression,Alcohol,Suicide,Drugs,label,length
0,i feel that it was better i die am happy,1,0,0,0,0,12
1,why do i get hallucinations ?,0,0,0,1,3,9
2,i am stressed due to lack of financial suppor...,1,0,0,0,0,13
3,why is life important ?,0,0,1,0,2,8
4,how could i be helped to go through the depre...,1,0,0,0,0,14


In [None]:
# Assign every training row to one of `n_folds` stratified validation folds.
# The 'fold' column is created by the first assignment below, so pandas stores
# it as float (0.0, 1.0, ...); that dtype is kept on purpose because later
# cells format the fold value into file names.
kf = StratifiedKFold(n_folds)
for fold, (tr_index, val_index) in enumerate(kf.split(train.index, train.label)):
  # split() yields *positional* indices while .loc is label-based: translate
  # positions to labels so the assignment stays correct even if `train` does
  # not carry a default RangeIndex.
  train.loc[train.index[val_index], 'fold'] = fold

In [None]:
train.head()

Unnamed: 0,text,Depression,Alcohol,Suicide,Drugs,label,length,fold
0,i feel that it was better i die am happy,1,0,0,0,0,12,0.0
1,why do i get hallucinations ?,0,0,0,1,3,9,0.0
2,i am stressed due to lack of financial suppor...,1,0,0,0,0,13,0.0
3,why is life important ?,0,0,1,0,2,8,0.0
4,how could i be helped to go through the depre...,1,0,0,0,0,14,0.0


# Utilities

In [None]:
class GlobalConfig:
  """Container for the experiment's hyper-parameters and per-fold run state.

  Hyper-parameters are class-level defaults. Per-run state (`fold`,
  `train_df`, `val_df`, `test_df`) lives on the instance and is reset by
  `_lightweight()`. Values are changed through `update()`.

  NOTE(review): how each hyper-parameter is consumed depends on the `bnbr`
  Trainer / model code, which is not visible in this notebook.
  """

  # --- class-level defaults ---------------------------------------------
  accumulate_grad_batches = 1
  activation = torch.softmax
  batch_size = 32
  clip_grad = False
  epochs = 10
  folds = n_folds
  hidden_multiplier = 1
  high_dropout = 0.5 #.3
  id = 0
  L = 1
  loss = MHMLoss('ce', 10)
  loss_name = 'ce'
  low_dropout = 0.3
  lr = 4.5e-5
  max_grad_norm = 1.0
  max_tokens = None #
  mhm_activation = Mish()
  model_name = model_name
  model_path = model_path
  multi_samp_nb = 3
  n = 11
  output_hidden_states = True
  reinit = False
  scheduler = False
  swa = False
  swa_start = 4
  swa_freq = 1
  swa_lr = 3e-5
  task = 'train'
  warmup_steps = 0


  def __init__(self, xp_nb=0):
    """Create a config for experiment number `xp_nb` and give it a file name.

    Args:
      xp_nb: experiment number; it controls how many random suffixes are
        drawn before one is kept as the config file name (see `_name`).
    """
    self.xp_nb = xp_nb
    self._name()
    self._lightweight()

  def update(self, **kwargs):
    """Overwrite existing settings from keyword arguments.

    Only names that already exist on the config (class default or instance
    attribute) are set; unknown names are silently ignored.
    """
    for name, value in kwargs.items():
      if hasattr(self, name):
        setattr(self, name, value)

  def _lightweight(self):
    """Reset the per-run state so the instance holds no data frames."""
    self.fold = 0
    self.train_df = None
    self.test_df = None
    self.val_df = None

  def save(self, path=''):
    """Pickle this config to `path + self.name`.

    The per-run state is dropped first (via `_lightweight`) so no data frame
    ends up in the file.

    NOTE: pickle stores instance attributes only. Class-level defaults that
    were never overridden through `update()` are taken from the class
    definition again when the file is loaded.

    Args:
      path: prefix concatenated as-is with the file name (include a trailing
        separator when passing a directory).
    """
    import pickle  # local import: the notebook does not import pickle explicitly

    self._lightweight()

    with open(path+self.name, 'wb') as f:
      pickle.dump(self, f)

  def _name(self):
    """Set `self.name` to 'config_<10 random chars>.pkl' unless already set.

    One suffix is drawn per experiment number and the last one is kept, so
    for `xp_nb >= 1` the draws match the previous implementation. At least
    one suffix is always drawn: previously `xp_nb == 0` left the suffix
    unbound and raised UnboundLocalError.
    """
    if hasattr(self, 'name'):
      return

    import random  # local import: the notebook does not import random explicitly

    vocab = string.ascii_letters + string.digits
    suffix = ''
    for _ in range(max(1, self.xp_nb)):
      suffix = ''.join(vocab[random.randint(0, len(vocab)-1)] for _ in range(10))

    self.name = 'config_'+suffix+'.pkl'

In [None]:
global global_config
global_config = GlobalConfig(0)

In [None]:
def run_fold(score_pkl='cv_score.pkl'):
  """Train every cross-validation fold that has not been trained yet.

  The running sum of `trainer.best_log` over the folds is kept in
  `score_pkl` and rewritten after each fold, so an interrupted run can be
  resumed: folds whose number is below the count of entries in 'models/'
  are skipped.

  Args:
    score_pkl: path of the pickle file holding the accumulated score.

  NOTE(review): `score_pkl` is read with pickle; only point it at files
  written by this notebook.
  """
  import pickle  # local import: the notebook does not import pickle explicitly

  try:
    with open(score_pkl, 'rb') as f:
      cv_score = pickle.load(f)
  except FileNotFoundError:
    # First run: nothing accumulated yet.
    cv_score = 0

  # Resume support: the number of entries in models/ is taken as the number
  # of folds already completed.
  completed = len(os.listdir('models/'))

  for fold in train.fold.unique():
    if fold < completed: continue

    df_train = train[train.fold != fold].copy()
    df_val = train[train.fold == fold].copy()

    global_config.update(
      fold = fold,
      train_df=df_train,
      val_df=df_val
    )

    trainer = Trainer(global_config)

    # Early stopping is currently disabled: every fold runs all configured epochs.
    for epoch in range(global_config.epochs):
      trainer.fit_one_epoch(epoch)

    trainer.save_best_eval()
    trainer.evaluate_post_processing()

    cv_score += trainer.best_log

    # Mode 'wb' truncates an existing file, so no separate delete step
    # (previously a shell `rm`) is needed.
    with open(score_pkl, 'wb') as f:
      pickle.dump(cv_score, f)

    # Drop this fold's objects before the next fold allocates its own.
    del trainer, df_train, df_val
    torch.cuda.empty_cache()
    gc.collect()

In [None]:
def run_prediction(path='models/'):
  """Run test-set inference once per fold and collect the outputs.

  The shared config is switched to the 'test' task with `test` as its test
  frame. For each fold value in `train.fold`, the checkpoint
  `path + 'model_<fold>.bin'` is loaded into a fresh module and used for
  prediction.

  Args:
    path: prefix concatenated as-is with the checkpoint file name.

  Returns:
    A list with one entry per fold, each being the value returned by
    `trainer.get_preds()`.
  """
  global_config.update(task='test', test_df=test)

  fold_predictions = []
  for fold in train.fold.unique():
    weights_file = path+f'model_{fold}.bin'

    module = LightTrainingModule(global_config)
    module.load_state_dict(torch.load(weights_file))

    trainer = Trainer(global_config, module=module)
    trainer.predict()
    fold_predictions.append(trainer.get_preds())

    # Release this fold's model before the next checkpoint is loaded.
    del trainer, module
    torch.cuda.empty_cache()
    gc.collect()

  return fold_predictions

# The Torch Way

## Run folds

In [None]:
gc.collect()

817

In [None]:
run_fold()

____________________________________________________________________________________________________ 
Fold  9.0
	____________________________________________________________________________________________________
	| Epoch: 0 - Loss: 1.15813 - ValLoss: 1.07970 - Log: 1.07971 - F1: 0.41684 - LogF1: 0.22295
	____________________________________________________________________________________________________
	| Epoch: 1 - Loss: 0.93292 - ValLoss: 0.77587 - Log: 0.77587 - F1: 0.65357 - LogF1: 0.43885
	____________________________________________________________________________________________________
	| Epoch: 2 - Loss: 0.59933 - ValLoss: 0.62572 - Log: 0.62572 - F1: 0.73155 - LogF1: 0.55292
	____________________________________________________________________________________________________
	| Epoch: 3 - Loss: 0.38360 - ValLoss: 0.52326 - Log: 0.52326 - F1: 0.87168 - LogF1: 0.67421
	____________________________________________________________________________________________________
	| Epo

In [None]:
# wm.clear(['evals/', 'models/'])

# CrossVal Score

In [None]:
cvl = CrossValLogger(train, n_folds, path=f'evals/{model_name}/')

In [None]:
score1, score2 = cvl.show_results(True)

OOF_CV_SCORE: 0.39070 | OVR_SCORE: 0.38655


In [None]:
gc.collect()

30

# Save config

In [None]:
global_config.save()

## Prediction

In [None]:
%%time
preds = run_prediction()

CPU times: user 1min 1s, sys: 6.81 s, total: 1min 8s
Wall time: 3min 34s


In [None]:
raw_outputs = np.mean(preds, axis=0)

# Submission

In [None]:
# Create the output directory; exist_ok keeps the cell re-runnable when the
# directory is already there (a plain `mkdir` errors out in that case).
os.makedirs('./../submissions', exist_ok=True)

In [None]:
spath = './../submissions/'

In [None]:
sample[["Depression","Alcohol","Suicide","Drugs"]] = raw_outputs

In [None]:
sample.to_csv(spath + f'{model_name}__.csv', index=False)

In [None]:
sample.head()

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0.726017,0.025086,0.227651,0.021246
1,03BMGTOK,0.990635,0.001721,0.006303,0.001341
2,03LZVFM6,0.991956,0.001654,0.005111,0.00128
3,0EPULUM5,0.991569,0.001625,0.00558,0.001226
4,0GM4C5GD,0.006329,0.232706,0.016765,0.744201


In [None]:
sample.describe()

Unnamed: 0,Depression,Alcohol,Suicide,Drugs
count,309.0,309.0,309.0,309.0
mean,0.544079,0.215507,0.146051,0.094363
std,0.446554,0.36899,0.258686,0.233527
min,0.004002,0.001548,0.004506,0.001207
25%,0.009855,0.0018,0.005514,0.00142
50%,0.765412,0.016647,0.008498,0.012684
75%,0.99044,0.122572,0.124758,0.035496
max,0.992491,0.967638,0.903291,0.898146


In [None]:
sample[["Depression","Alcohol","Suicide","Drugs"]].apply(np.argmax, axis=1).value_counts()

0    171
1     64
2     48
3     26
dtype: int64