# Specs

In [1]:
!nvidia-smi

Tue Mar 30 19:47:22 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Install

In [2]:
!pip install transformers torchcontrib catalyst -q

[K     |████████████████████████████████| 2.0MB 7.7MB/s 
[K     |████████████████████████████████| 471kB 36.3MB/s 
[K     |████████████████████████████████| 3.2MB 51.7MB/s 
[K     |████████████████████████████████| 890kB 51.8MB/s 
[K     |████████████████████████████████| 317kB 55.5MB/s 
[K     |████████████████████████████████| 645kB 48.8MB/s 
[?25h  Building wheel for torchcontrib (setup.py) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


# Imports

In [4]:
import warnings
warnings.simplefilter('ignore')

In [5]:
import os, sys, gc
import time
import string
import subprocess
import numpy as np
import pandas as pd

In [6]:
from tqdm.notebook import tqdm

In [7]:
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import log_loss, f1_score, accuracy_score
from sklearn.metrics import classification_report

In [8]:
import torch
import torch.nn as nn
from torch.optim import Adam
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [9]:
import transformers
from transformers import AutoConfig, AutoTokenizer

In [10]:
from nlp.all import *

# Envs

In [11]:
# Experiment-wide settings shared by the cells below.
seed, n_folds = 1999, 5

# The same pretrained checkpoint name is reused for the model, its config and its tokenizer.
model_name = 'bert-base-multilingual-cased'
config_name = tok_name = model_name

# Working directories and file extensions handed to WorkplaceManager.
dirs = ['models/', 'evals/' + model_name]
exts = ['.csv', '.pkl']

In [12]:
# Build the notebook's workplace helper from the seed, working directories, file
# extensions and fold count. Per the recorded output it sets the seed and creates
# the listed directories (WorkplaceManager itself is defined outside this notebook).
wm = WorkplaceManager(seed, dirs, exts, n_folds)

Set seed to 1999.
Created models/ evals/bert-base-multilingual-cased


# Tokenizer

In [15]:
# Load the pretrained model configuration, then build the tokenizer through the
# project's `getTokenizer` helper (defined outside this notebook).
config = AutoConfig.from_pretrained(config_name)
tokenizer = getTokenizer(config, tok_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1961828.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




In [16]:
tokenizer.pad_token

'[PAD]'

# Data

In [None]:
path = 'data/'

In [None]:
# Read the normalized train/test sets and the sample submission from `path`.
train, test, sample = (
    pd.read_csv(path + file_name)
    for file_name in ('TrainNormalized.csv', 'TestNormalized.csv', 'SampleSubmission.csv')
)

In [None]:
train.shape

(70000, 4)

In [None]:
train.head()

Unnamed: 0,ID,text,length,label
0,13P0QT0,3sbaa lek ou le seim riahi ou 3sbaa le ca,23,-1
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,24,-1
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,100,-1
3,U0TTYY8,ak slouma,6,1
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,63,-1


In [None]:
# Set `length` to the number of token ids the tokenizer emits for each text
# (for `train` this overwrites the `length` column that came with the CSV).
train['length'] = [len(tokenizer.encode(text)) for text in train['text']]
test['length'] = [len(tokenizer.encode(text)) for text in test['text']]

In [None]:
train.head()

Unnamed: 0,ID,text,length,label
0,13P0QT0,3sbaa lek ou le seim riahi ou 3sbaa le ca,21,-1
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,21,-1
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,81,-1
3,U0TTYY8,ak slouma,5,1
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,52,-1


In [None]:
# Keep the raw labels (-1/0/1) in `target` and store the shifted class index
# (0/1/2) in `label`. `label` is always derived from `target`, and `target` is
# only captured once, so re-running this cell cannot shift the labels twice.
if 'target' not in train.columns:
  train['target'] = train['label'].values
train['label'] = train['target'] + 1

In [None]:
train.label.value_counts(normalize=True)

2    0.546271
0    0.418500
1    0.035229
Name: label, dtype: float64

In [None]:
train.length.describe([.25, .5, .75, .8, .85, .9, .95])

count    70000.000000
mean        21.798729
std         18.086394
min          4.000000
25%         10.000000
50%         15.000000
75%         26.000000
80%         31.000000
85%         37.000000
90%         47.000000
95%         68.000000
max        106.000000
Name: length, dtype: float64

# Split in folds

In [None]:
train.head()

Unnamed: 0,ID,text,length,label,target
0,13P0QT0,3sbaa lek ou le seim riahi ou 3sbaa le ca,21,0,-1
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,21,0,-1
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,81,0,-1
3,U0TTYY8,ak slouma,5,2,1
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,52,0,-1


In [None]:
# Assign every row to one of `n_folds` validation folds, stratified on `label`.
# StratifiedKFold.split yields *positional* indices, so they are written into a
# plain array first; this stays correct even when `train` does not carry the
# default 0..n-1 index (label-based `.loc` would silently hit the wrong rows).
# NOTE: shuffling is not requested, so `seed` has no influence on the split.
kf = StratifiedKFold(n_folds)
fold_ids = np.zeros(len(train), dtype='int64')
for fold, (tr_index, val_index) in enumerate(kf.split(train.index, train.label)):
  fold_ids[val_index] = fold
train['fold'] = fold_ids

In [None]:
train.head()

Unnamed: 0,ID,text,length,label,target,fold
0,13P0QT0,3sbaa lek ou le seim riahi ou 3sbaa le ca,21,0,-1,0
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,21,0,-1,0
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,81,0,-1,0
3,U0TTYY8,ak slouma,5,2,1,0
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,52,0,-1,0


# Utilities

In [None]:
class GlobalConfig:
  """Bag of experiment settings handed to `Trainer` and `LightTrainingModule`.

  Defaults are class attributes; `update()` overrides them on the instance.
  The per-fold slots (`fold`, `train_df`, `val_df`, `test_df`) are created by
  `_reset()` so that `update()` accepts them as known attributes.
  """

  # --- run identity -------------------------------------------------------
  id = 0
  task = 'train'

  # --- batching / output activation ---------------------------------------
  accumulate_grad_batches = 1
  activation = torch.softmax
  batch_size = 32
  warmup_steps = 0

  # --- training length ----------------------------------------------------
  finetune_epochs = 0
  epochs = 5
  folds = n_folds

  # --- regularisation / batching strategy ---------------------------------
  low_dropout = 0.1
  high_dropout = 0.1
  use_bucketing = True

  # --- objective and reported metric --------------------------------------
  loss = Loss('ce')
  loss_name = 'ce'
  metric_name = 'Acc'

  # --- learning rates, scheduler and SWA switches -------------------------
  lr = 4e-5
  head_lr = 1e-3
  scheduler = False
  swa = False
  swa_start = 4
  swa_freq = 1
  swa_lr = 3e-5

  # --- gradient clipping / sequence limits --------------------------------
  clip_grad = False
  max_grad_norm = 1.0
  max_tokens = 100
  on_batch = True

  # --- pretrained checkpoint (taken from the notebook-level settings) -----
  model_name = model_name
  config_name = config_name
  pretrained = True

  n_classes = 3

  def __init__(self, xp_nb=1):
    """Remember the experiment number and create the empty per-fold slots."""
    self.xp_nb = xp_nb
    self._reset()

  def update(self, **kwargs):
    """Override existing settings; keys that are not already attributes are ignored."""
    for key, value in kwargs.items():
      if not hasattr(self, key):
        continue
      setattr(self, key, value)

  def _reset(self):
    """Clear the per-fold state: fold index back to 0, dataframes back to None."""
    self.fold = 0
    self.train_df = self.test_df = self.val_df = None

  def save(self, path=''):
    """Pickle this config to `path + self.name`.

    The per-fold state is reset first, so the dataframes are not written to
    disk, and a file name is generated if the instance does not have one yet.
    """
    self._reset()
    self._name()

    destination = path + self.name
    with open(destination, 'wb') as f:
      pickle.dump(self, f)

  def _name(self):
    """Ensure `self.name` exists (10 random alphanumerics + '.pkl') and print it.

    The 10-character draw is repeated `xp_nb` times and only the last result is
    kept -- presumably so that, under a fixed seed, each experiment number maps
    to its own name (TODO confirm).
    """
    if not hasattr(self, 'name'):
      vocab = string.ascii_letters + string.digits
      n = ''
      for i in range(self.xp_nb):
        n = ''.join(vocab[random.randint(0, len(vocab)-1)] for _ in range(10))

      self.name = n+'.pkl'
    print(self.name)

In [None]:
def run_fold(score_pkl='cv_score.pkl'):
  """Train one model per remaining fold and accumulate the best metric on disk.

  Args:
    score_pkl: path of a pickle file holding the running sum of per-fold best
      metrics. It is read on entry (0 if the file does not exist) and rewritten
      after every completed fold, so an interrupted run can be resumed.

  Reads the notebook-level `train` dataframe and mutates the shared
  `global_config` (fold index and train/val/test dataframes). Returns None.
  """
  # Local import: no explicit `import pickle` is visible in the notebook.
  import pickle

  torch.cuda.empty_cache()

  # The score file is written by this function itself; do not point it at
  # untrusted data, since unpickling can execute arbitrary code.
  try:
    with open(score_pkl, 'rb') as f:
      cv_score = pickle.load(f)
  except FileNotFoundError:
    cv_score = 0

  # Resume heuristic: the number of entries in models/ is taken as the number of
  # folds already finished. NOTE(review): this assumes each completed fold leaves
  # exactly one file there and nothing else lives in that directory -- confirm.
  completed = len(os.listdir('models/'))

  for fold in train.fold.unique():
    if fold < completed: continue

    print('Fold', fold)

    df_train = train[train.fold != fold].copy()
    df_val = train[train.fold == fold].copy()

    global_config.update(
      fold = fold,
      train_df = df_train,
      val_df = df_val,
      test_df = pd.DataFrame(),
    )

    trainer = Trainer(global_config)
    trainer.fit()
    trainer.save_best_eval()

    cv_score += trainer.best_metric

    # Opening with 'wb' truncates any previous file, so no separate delete
    # (and no shell call) is needed before rewriting the running score.
    with open(score_pkl, 'wb') as f:
      pickle.dump(cv_score, f)

    # Drop the references before the next fold so memory can be reclaimed.
    del trainer, df_train, df_val
    gc.collect()

In [None]:
def run_prediction(path='models/'):
  """Predict the test set once per fold checkpoint.

  Args:
    path: prefix the checkpoint files `model_{fold}.bin` are loaded from.

  Returns:
    A list with one `trainer.get_preds()` result per fold, in the order of
    `train.fold.unique()`.

  Switches the shared `global_config` to task 'test' and points all of its
  dataframe slots at the notebook-level `test` frame.
  """
  torch.cuda.empty_cache()

  global_config.update(task='test', train_df=test, val_df=test, test_df=test)

  fold_preds = []
  for fold in tqdm(train.fold.unique()):
    # Rebuild the module and load this fold's saved weights into it.
    module = LightTrainingModule(global_config)
    weights = torch.load(path+f'model_{fold}.bin')
    module.load_state_dict(weights)

    trainer = Trainer(global_config, module=module)
    trainer.predict()
    fold_preds.append(trainer.get_preds())

    # Release the model before moving on to the next fold.
    del weights, trainer, module
    gc.collect()

  return fold_preds

In [None]:
def submit(preds_dict):
  """Write one submission CSV per entry of `preds_dict`.

  Args:
    preds_dict: mapping of output file name -> predictions, one prediction per
      row of the notebook-level `test` dataframe.

  Each file is written to `spath + name` with the columns `ID` and `label`
  (no index column); a '[DONE] <name>' line is printed per file.
  """
  for name, preds in preds_dict.items():
    # Take an explicit copy of the ID column: adding `label` to a bare
    # `test[['ID']]` selection is the chained-assignment pattern that pandas
    # flags with SettingWithCopyWarning.
    submission = test[['ID']].copy()
    submission['label'] = preds

    submission.to_csv(spath+name, index=False)

    print(f'[DONE] {name}')

# The Torch Way

In [None]:
# Shared configuration object that run_fold() and run_prediction() read and update.
# It is a plain module-level name; a `global` statement has no effect at this
# level, so none is used.
global_config = GlobalConfig(2)

## Run folds

In [None]:
gc.collect()

241

In [None]:
%%time
run_fold()

____________________________________________________________________________________________________ 
Fold  4
____________________________________________________________________________________________________
| ⏰ 05:53 | Epoch: 0 - Loss: 0.57901 - ValLoss: 0.52805 - Logloss: 0.52805 - F1: 0.77249 - Acc: 0.77890
____________________________________________________________________________________________________
| ⏰ 05:52 | Epoch: 1 - Loss: 0.47852 - ValLoss: 0.47448 - Logloss: 0.47448 - F1: 0.80241 - Acc: 0.80487
____________________________________________________________________________________________________
| ⏰ 05:52 | Epoch: 2 - Loss: 0.41336 - ValLoss: 0.46508 - Logloss: 0.46508 - F1: 0.80578 - Acc: 0.81172
____________________________________________________________________________________________________
| ⏰ 05:53 | Epoch: 3 - Loss: 0.35117 - ValLoss: 0.44887 - Logloss: 0.44887 - F1: 0.81417 - Acc: 0.81728
______________________________________________________________________

# CrossVal Score

In [None]:
# Build the cross-validation logger over `train` for the 'Acc' metric and
# `n_folds` folds, pointing it at this model's evals/ directory.
# (CrossValLogger is defined outside this notebook.)
cvl = CrossValLogger(train, 'Acc', n_folds, path=f'evals/{model_name}/')

In [None]:
# Print the cross-validation summary and keep the two returned scores.
# NOTE(review): the recorded output labels them OOF_CV_SCORE and OVR_SCORE;
# that score1/score2 follow that order is assumed -- confirm.
score1, score2 = cvl.show_results(True)

OOF_CV_SCORE: 0.81729 | OVR_SCORE: 0.48003


In [None]:
gc.collect()

721

# Prediction

In [None]:
%%time
preds = run_prediction()

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


CPU times: user 6min 48s, sys: 5min 41s, total: 12min 30s
Wall time: 14min


In [None]:
# Average the per-fold predictions, then pick the highest-scoring class per row.
# Subtracting 1 maps the class index (0/1/2) back to the raw labels (-1/0/1).
raw_outputs = np.asarray(preds).mean(axis=0)
outputs = raw_outputs.argmax(axis=1) - 1

# Submission

In [13]:
# Prefix that submit() prepends to every submission file name; the directory is
# created through the workplace helper.
spath = 'submissions/'
wm.create_dir(spath)

In [None]:
# Base file name: last path component of the model name plus a fixed experiment tag.
csv = f"{model_name.split('/')[-1]}_AH7LwUXCvT"

# One file with the hard labels, one with the stringified averaged outputs per row.
preds_dict = {
    f'{csv}.csv': outputs,
    f'{csv}_raw_outputs.csv': [str(row) for row in raw_outputs],
}

submit(preds_dict)

[DONE] bert-base-multilingual-cased_AH7LwUXCvT.pkl_0.81729-0.48003.csv
[DONE] bert-base-multilingual-cased_AH7LwUXCvT.pkl_0.81729-0.48003_raw_outputs.csv


In [None]:
submission = pd.read_csv(spath+csv+'.csv')

In [None]:
submission.head()

Unnamed: 0,ID,label
0,2DDHQW9,-1
1,5HY6UEY,-1
2,ATNVUJX,1
3,Q9XYVOQ,-1
4,TOAHLRH,1


In [None]:
submission.label.value_counts(normalize=True)

-1    0.506167
 1    0.486333
 0    0.007500
Name: label, dtype: float64

In [14]:
# NOTE(review): presumably removes the contents of evals/ and models/ -- confirm
# what `clear` does before running. run_prediction() loads its checkpoints from
# models/, and this cell's execution count shows it was run out of notebook order.
wm.clear(['evals/', 'models/'])