# Specs

In [None]:
!nvidia-smi

Thu Dec 24 19:49:18 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8     9W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Install

In [None]:
!pip install transformers torchcontrib catalyst -q

[K     |████████████████████████████████| 1.5MB 8.0MB/s 
[K     |████████████████████████████████| 491kB 30.3MB/s 
[K     |████████████████████████████████| 2.9MB 41.8MB/s 
[K     |████████████████████████████████| 890kB 62.3MB/s 
[K     |████████████████████████████████| 317kB 57.1MB/s 
[?25h  Building wheel for torchcontrib (setup.py) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


# Imports

In [None]:
import warnings
warnings.simplefilter('ignore')

In [None]:
import os, sys, gc
import pickle
import random
import time
import string
import subprocess
import numpy as np
import pandas as pd

In [None]:
from tqdm.notebook import tqdm

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import log_loss, f1_score, accuracy_score
from sklearn.metrics import classification_report

In [None]:
import torch
import torch.nn as nn
from torch.optim import Adam
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [None]:
import transformers
from transformers import AutoConfig, AutoTokenizer

In [None]:
from nlp.all import *

# Envs

In [None]:
# Reproducibility: `seed` is handed to WorkplaceManager below.
# NOTE(review): `seeds` is not referenced by any cell visible in this notebook.
seed, seeds = 1999, [42, 5, 2021]

# Number of cross-validation folds
n_folds = 5

# One checkpoint identifier, reused for the model config and the tokenizer
model_name = 'roberta-base'
config_name = tok_name = model_name

# Directory and extension lists passed to WorkplaceManager
dirs = ['models/', f'evals/{model_name}']
exts = ['.csv', '.pkl']

In [None]:
wm = WorkplaceManager(seed, dirs, exts, n_folds)

Set seed to 1999.
Created models/ evals/roberta-base submissions/


# Tokenizer

In [None]:
config = AutoConfig.from_pretrained(config_name)
tokenizer = AutoTokenizer.from_pretrained(tok_name, config=config)

# Data

In [None]:
path = 'data/'

In [None]:
train = pd.read_csv(path+'Train.csv')
test = pd.read_csv(path+'Test.csv')
sample = pd.read_csv(path+'SampleSubmission.csv')

In [None]:
train.shape

(70000, 3)

In [None]:
train.head()

Unnamed: 0,ID,text,label
0,13P0QT0,3sbaaaaaaaaaaaaaaaaaaaa lek ou le seim riahi o...,-1
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,-1
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,-1
3,U0TTYY8,ak slouma,1
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,-1


In [None]:
train['length'] = train['text'].apply(lambda x: len(tokenizer.encode(x)))
test['length'] = test['text'].apply(lambda x: len(tokenizer.encode(x)))

In [None]:
train.head()

Unnamed: 0,ID,text,label,length
0,13P0QT0,3sbaaaaaaaaaaaaaaaaaaaa lek ou le seim riahi o...,-1,34
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,-1,24
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,-1,164
3,U0TTYY8,ak slouma,1,6
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,-1,59


In [None]:
# Keep the labels exactly as read from Train.csv in `target` and store the
# same values shifted by +1 in `label`.
# `target` is captured only once and `label` is always derived from it, so
# re-running this cell does not shift the labels a second time.
if 'target' not in train.columns:
  train['target'] = train['label'].values
train['label'] = train['target'] + 1

In [None]:
train.label.value_counts(normalize=True)

2    0.546271
0    0.418500
1    0.035229
Name: label, dtype: float64

# Split in folds

In [None]:
train.head()

Unnamed: 0,ID,text,label,length,target
0,13P0QT0,3sbaaaaaaaaaaaaaaaaaaaa lek ou le seim riahi o...,0,34,-1
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,0,24,-1
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,0,164,-1
3,U0TTYY8,ak slouma,2,6,1
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,0,59,-1


In [None]:
# Assign every row to one of `n_folds` validation folds, stratified on `label`.
# No shuffling is requested, so the assignment is deterministic.
kf = StratifiedKFold(n_folds)
train['fold'] = 0
for fold, (tr_index, val_index) in enumerate(kf.split(train.index, train.label)):
  # `split` yields row positions, not index labels, so write by position;
  # this stays correct even if `train` does not carry a default RangeIndex.
  train.iloc[val_index, train.columns.get_loc('fold')] = fold

In [None]:
train.head()

Unnamed: 0,ID,text,label,length,target,fold
0,13P0QT0,3sbaaaaaaaaaaaaaaaaaaaa lek ou le seim riahi o...,0,34,-1,0
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,0,24,-1,0
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,0,164,-1,0
3,U0TTYY8,ak slouma,2,6,1,0
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,0,59,-1,0


# Utilities

In [None]:
class GlobalConfig:
  """Hyper-parameters and per-run state shared by the training helpers.

  Defaults are class attributes; `update` overrides them on the instance.
  The instance additionally carries the current fold number and the train /
  validation / test frames, all of which `_reset` clears.
  """

  # --- run identity ---
  id = 0
  task = 'train'

  # --- batching and output activation ---
  accumulate_grad_batches = 1
  activation = torch.softmax
  batch_size = 64
  warmup_steps = 0

  # --- training length ---
  finetune_epochs = 2
  epochs = 3
  folds = n_folds

  # --- dropout ---
  low_dropout = high_dropout = 0.1

  # --- objective and reported metric ---
  loss_name = 'ce'
  loss = Loss(loss_name)
  metric_name = 'Acc'

  # --- learning rates and scheduling ---
  lr = 5e-5
  head_lr = 1e-3
  scheduler = False

  # --- swa_* options: presumably stochastic weight averaging — TODO confirm ---
  swa = False
  swa_start = 4
  swa_freq = 1
  swa_lr = 3e-5

  # --- gradient clipping and input length ---
  clip_grad = False
  max_grad_norm = 1.0
  max_tokens = 50
  on_batch = True

  # --- backbone (values taken from the notebook-level settings) ---
  model_name = model_name
  config_name = config_name
  pretrained = True

  n_classes = 3

  def __init__(self, xp_nb=1):
    """Remember the experiment number and start from a cleared run state."""
    self.xp_nb = xp_nb
    self._reset()

  def update(self, **kwargs):
    """Override existing attributes; keys the config does not have are ignored."""
    for key, value in kwargs.items():
      if not hasattr(self, key):
        continue
      setattr(self, key, value)

  def _reset(self):
    """Clear the per-fold state: fold number back to 0, data frames to None."""
    self.fold = 0
    self.train_df = self.test_df = self.val_df = None

  def save(self, path=''):
    """Pickle this config to `path + self.name`.

    The per-fold state is cleared first, so the data frames are not part of
    the pickle (and are also dropped from the live object).
    """
    self._reset()
    self._name()

    destination = path + self.name
    with open(destination, 'wb') as handle:
      pickle.dump(self, handle)

  def _name(self):
    """Give the config a file name on first use, then print it.

    The name is a 10-character alphanumeric stem followed by '.pkl'. The stem
    is drawn `xp_nb` times and only the last draw is kept; with `xp_nb == 0`
    the stem stays empty. An existing `name` is never replaced.
    """
    if not hasattr(self, 'name'):
      alphabet = string.ascii_letters + string.digits
      last = len(alphabet) - 1
      stem = ''
      for _ in range(self.xp_nb):
        stem = ''.join([alphabet[random.randint(0, last)] for _ in range(10)])

      self.name = stem + '.pkl'
    print(self.name)

In [None]:
def run_fold(score_pkl='cv_score.pkl'):
  """Train one model per cross-validation fold, skipping folds already done.

  The running sum of each fold's `trainer.best_metric` is pickled to
  `score_pkl` after every fold, so an interrupted run can be resumed by
  calling the function again.

  Args:
    score_pkl: path of the pickle holding the accumulated score. A missing
      file means "start from 0".

  Reads the notebook-level `train` frame and `global_config`; returns nothing.
  """
  torch.cuda.empty_cache()

  # Pick up the score accumulated by a previous, interrupted call (if any).
  try:
    with open(score_pkl, 'rb') as f:
      cv_score = pickle.load(f)
  except FileNotFoundError:
    cv_score = 0

  # The number of entries in models/ is taken as the number of finished folds.
  # NOTE(review): presumably each finished fold leaves exactly one file there
  # (run_prediction loads models/model_{fold}.bin) — confirm against Trainer.
  completed = len(os.listdir('models/'))

  for fold in train.fold.unique():
    if fold < completed: continue

    print('Fold', fold)

    df_train = train[train.fold != fold].copy()
    df_val = train[train.fold == fold].copy()

    global_config.update(
      fold = fold,
      train_df = df_train,
      val_df = df_val,
      test_df = pd.DataFrame()
    )

    trainer = Trainer(global_config)
    # NOTE(review): `es` is never used below; the call is kept only in case
    # constructing EarlyStopping has side effects — confirm and drop.
    es = EarlyStopping()

    trainer.finetune()
    trainer.fit()

    trainer.save_best_eval()

    cv_score += trainer.best_metric

    # Mode 'wb' truncates any existing file, so no separate delete is needed.
    with open(score_pkl, 'wb') as f:
      pickle.dump(cv_score, f)

    # Drop the references before collecting so the fold's memory can be freed.
    del trainer, df_train, df_val
    gc.collect()

In [None]:
def run_prediction(path='models/'):
  """Predict on the test frame with every fold's saved weights.

  Args:
    path: prefix (directory including trailing separator) under which the
      per-fold files `model_{fold}.bin` are looked up.

  Returns:
    A list with one `trainer.get_preds()` result per fold, in the order of
    `train.fold.unique()`.
  """
  torch.cuda.empty_cache()

  # Switch the shared config to inference; every frame slot gets the test frame.
  global_config.update(task='test', train_df=test, val_df=test, test_df=test)

  fold_preds = []
  for fold in tqdm(train.fold.unique()):
    module = LightTrainingModule(global_config)
    state = torch.load(path + f'model_{fold}.bin')
    module.load_state_dict(state)
    trainer = Trainer(global_config, module=module)

    trainer.predict()
    fold_preds.append(trainer.get_preds())

    # Release this fold's model before the next one is loaded.
    module = trainer = state = None
    del trainer, module, state
    gc.collect()

  return fold_preds

In [None]:
def submit(preds_dict, test_df=None, out_dir=None):
  """Write one submission CSV per entry of `preds_dict`.

  Each file is built from that entry's own predictions. Previously every file
  received the notebook-level `outputs`, whatever the entry held.

  Args:
    preds_dict: maps an output file name to its predictions. A 1-D array is
      read as class indices. A 2-D array is read as one score per class in
      each row.
    test_df: frame providing the `ID` column; defaults to the notebook-level
      `test` frame.
    out_dir: directory to write into; defaults to the notebook-level `spath`.

  Output columns:
    `ID`, then `label` (class index minus 1, undoing the +1 shift applied to
    the training labels). For 2-D input `label` is the arg-max class and the
    per-class scores follow as `class_0`, `class_1`, ...

  Raises:
    ValueError: if a prediction array is neither 1-D nor 2-D.
  """
  ids = test if test_df is None else test_df
  target_dir = spath if out_dir is None else out_dir

  for name, preds in preds_dict.items():
    preds = np.asarray(preds)
    # Work on a copy so the assignments below never touch the source frame.
    submission = ids[['ID']].copy()

    if preds.ndim == 1:
      submission['label'] = preds - 1
    elif preds.ndim == 2:
      submission['label'] = preds.argmax(axis=1) - 1
      for k in range(preds.shape[1]):
        submission[f'class_{k}'] = preds[:, k]
    else:
      raise ValueError(f'{name}: expected 1-D or 2-D predictions, got {preds.ndim}-D')

    submission.to_csv(os.path.join(target_dir, name), index=False)

    print(f'[DONE] {name}')

# The Torch Way

In [None]:
# Shared configuration object read by run_fold and run_prediction.
# A cell-level assignment already creates a notebook global, so no `global`
# statement is needed. The argument is stored as `xp_nb`.
global_config = GlobalConfig(3)

## Run folds

In [None]:
gc.collect()

196

In [None]:
%%time
run_fold()

# CrossVal Score

In [None]:
cvl = CrossValLogger(train, 'Acc', n_folds, path=f'evals/{model_name}/')

In [None]:
score1, score2 = cvl.show_results(True)

OOF_CV_SCORE: 0.81748 | OVR_SCORE: 0.81754


In [None]:
gc.collect()

15

# Prediction

In [None]:
%%time
preds = run_prediction()

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


CPU times: user 4min 43s, sys: 2min 46s, total: 7min 29s
Wall time: 8min 41s


In [None]:
# Average the per-fold predictions element-wise, then take the index of the
# largest value along axis 1 of the averaged array.
raw_outputs = np.asanyarray(preds).mean(axis=0)
outputs = raw_outputs.argmax(axis=1)

# Submission

In [None]:
spath = 'submissions/'
wm.create_dir(spath)

In [None]:
# File stem: last '/'-separated part of the checkpoint name, an underscore,
# then a fixed 10-character identifier.
# NOTE(review): the recorded output of this cell shows file names containing
# '.pkl_0.81748-0.81754', which this code cannot produce — the saved output
# looks stale.
csv = '_'.join([model_name.split('/')[-1], '10WwJdQcXs'])

preds_dict = dict([
    (csv + '.csv', outputs),
    (csv + '_raw_outputs.csv', raw_outputs),
])

submit(preds_dict)

[DONE] roberta-base_10WwJdQcXs.pkl_0.81748-0.81754.csv
[DONE] roberta-base_10WwJdQcXs.pkl_0.81748-0.81754_raw_outputs.csv


In [None]:
submission = pd.read_csv(spath+csv+'.csv')

In [None]:
submission.head()

Unnamed: 0,ID,label
0,2DDHQW9,-1
1,5HY6UEY,-1
2,ATNVUJX,1
3,Q9XYVOQ,-1
4,TOAHLRH,1


In [None]:
submission.label.value_counts(normalize=True)

-1    0.502333
 1    0.489000
 0    0.008667
Name: label, dtype: float64

In [None]:
wm.clear(['evals/', 'models/'])