In [1]:
!pip install transformers



In [0]:
import torch
import numpy as np
import pandas as pd
from tqdm import trange
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from transformers.tokenization_bert import BertTokenizer
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
from transformers.modeling_bert import BertForSequenceClassification

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_labels_to_classify = 2
max_seq_length = 256
batch_size = 16
RuBERT_MODEL = "DeepPavlov/rubert-base-cased"

In [4]:
device

device(type='cuda')

In [5]:
tokenizer = BertTokenizer.from_pretrained(RuBERT_MODEL)

model = BertForSequenceClassification.from_pretrained(
    RuBERT_MODEL, 
    num_labels = num_labels_to_classify
    )
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [0]:
class BertInputItem(object):
  def __init__(self, text, input_ids, input_mask, segment_ids, label_ids):
    self.text = text
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.segment_ids = segment_ids
    self.label_ids = label_ids

def convert_examples_to_inputs(example_premise, example_hypothesis, 
                               example_label, max_seq_length, 
                               tokenizer, verbose=0):
  input_items = []
  examples = zip(example_premise, example_hypothesis, example_label)
  for ex_idx, (text_p, text_h, label) in enumerate(tqdm(examples)):
    # create list of token ids
    input_ids = tokenizer.encode(f'[CLS] {text_p} [SEP] {text_h} [SEP]')
    if len(input_ids) > max_seq_length:
      input_ids = input_ids[:max_seq_length]
    
    segment_ids = [0] * len(input_ids)

    input_mask = [1] * len(input_ids)

    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    text = text_p + ' [SEP] ' + text_h

    input_items.append(
        BertInputItem(text=text,
                      input_ids=input_ids,
                      input_mask=input_mask,
                      segment_ids=segment_ids,
                      label_ids=label
                      )
    )
  return input_items

In [0]:
def get_data_loader(features, max_seq_length, batch_size=batch_size, shuffle=True):
  all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
  all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
  all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
  all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
  data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

  data_loader = DataLoader(data, shuffle=shuffle, batch_size=batch_size)
  return data_loader

In [8]:
from google.colab import drive 
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
train_path = 'gdrive/My Drive/data/csv/ru_nli_test.csv'
val_path = 'gdrive/My Drive/data/csv/ru_nli_train.csv'
test_path = 'gdrive/My Drive/data/csv/test_boolQ.csv'

train = pd.read_csv(train_path)
val = pd.read_csv(val_path)
test = pd.read_csv(test_path)

In [0]:
#mapping = {'contradiction': 0, 'entailment': 1, 'neutral': 2}
mapping = {'contradiction': 0, 'entailment': 1, 'neutral': 0}
train['gold_label'].replace(mapping, inplace=True)
val['gold_label'].replace(mapping, inplace=True)

test_mapping = {False: 0, True: 1}
test['answer'].replace(test_mapping, inplace=True)

In [11]:
train.shape

(5010, 19)

In [12]:
train.head(3)

Unnamed: 0,language,gold_label,sentence1_binary_parse,sentence2_binary_parse,sentence1_parse,sentence2_parse,sentence1,sentence2,promptID,pairID,genre,label1,label2,label3,label4,label5,sentence1_tokenized,sentence2_tokenized,match
0,ru,0,,,,,"Ну, я даже не думал об этом, но я был так разо...",Я больше с ним не разговаривал.,2,4,facetoface,contradiction,contradiction,contradiction,contradiction,contradiction,"Ну , я даже не думал об этом , но я был так ра...",Я больше с ним не разговаривал .,True
1,ru,1,,,,,"Ну, я даже не думал об этом, но я был так разо...","Я был так расстроен, что даже снова начал с ни...",2,5,facetoface,entailment,entailment,entailment,entailment,entailment,"Ну , я даже не думал об этом , но я был так ра...","Я был так расстроен , что даже снова начал с н...",True
2,ru,0,,,,,"Ну, я даже не думал об этом, но я был так разо...",Мы прекрасно поговорили.,2,6,facetoface,neutral,neutral,neutral,neutral,neutral,"Ну , я даже не думал об этом , но я был так ра...",Мы прекрасно поговорили .,True


In [13]:
test.head(3)

Unnamed: 0,answer,answer_or,passage,question
0,0,относительно медленно,Способность к половому размножению некоторые в...,Быстро ли растут губки?
1,0,не решает всех проблем с сосудами,Существует также иной взгляд на проблему холес...,Решает ли снижение уровня холестерина все проб...
2,1,Он с необычайной лёгкостью усваивал иностранны...,Лейбниц считается одним из самых всеобъемлющих...,Был ли Лейбниц полиглотом?


In [0]:
n_samples_train = train.shape[0]

In [15]:
train_features = convert_examples_to_inputs(train['sentence1'].values, 
                                            train['sentence2'].values,
                                            train['gold_label'].values,
                                            max_seq_length,
                                            tokenizer)

val_features = convert_examples_to_inputs(val['sentence1'].values, 
                                          val['sentence2'].values,
                                          val['gold_label'].values,
                                          max_seq_length,
                                          tokenizer)

test_features = convert_examples_to_inputs(test['passage'].values, 
                                          test['question'].values,
                                          test['answer'].values,
                                          max_seq_length,
                                          tokenizer)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [0]:
train_ru_dataloader = get_data_loader(train_features, max_seq_length, shuffle=True)
val_ru_dataloader = get_data_loader(val_features, max_seq_length, shuffle=True)
test_ru_dataloader = get_data_loader(test_features, max_seq_length, shuffle=True)

In [0]:
def evaluate(model, dataloader, device='cpu'):
  model.eval()

  eval_loss = 0
  number_eval_steps = 0
  pred_labels, true_labels = [], []

  model.to(device)
  for step, batch in enumerate(tqdm(dataloader, desc='Eval')):
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, segment_ids, label_ids = batch

    with torch.no_grad():
      tmp_eval_loss, logits = model(input_ids, attention_mask=input_mask,
                                token_type_ids=segment_ids, labels=label_ids)
    
    outputs = np.argmax(logits.to('cpu'), axis=1)
    label_ids = label_ids.to('cpu').numpy()

    pred_labels += list(outputs)
    true_labels += list(label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    number_eval_steps += 1

  eval_loss /= number_eval_steps

  true_labels = np.array(true_labels)
  pred_labels = np.array(pred_labels)

  return eval_loss, true_labels, pred_labels

In [0]:
def metrics(y_true, y_preds):
  acc = accuracy_score(y_true, y_preds)
  precision = precision_score(y_true, y_preds)
  recall = recall_score(y_true, y_preds)
  f1 = f1_score(y_true, y_preds)
  return acc, precision, recall, f1

In [0]:
def train(model, train_dataloader, val_dataloader, 
          output_model_file = '/tmp/mbert.bin', num_train_epochs=20, 
          patience=4, gradient_accumulation_steps=1, max_grad_norm=5, 
          warmup_proportion=0.1, batch_size=batch_size, learning_rate=5e-5):
  
  num_train_steps = int(n_samples_train / batch_size / gradient_accumulation_steps * num_train_epochs)
  num_warmup_steps = int(warmup_proportion * num_train_steps)
  
  param_optim = list(model.named_parameters())
  no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
  optim_grouped_params = [
    {'params': [p for n, p in param_optim if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optim if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0} 
  ]

  optimizer = AdamW(optim_grouped_params, lr=learning_rate, correct_bias=False)
  scheduler = get_linear_schedule_with_warmup(optimizer, 
                                              num_warmup_steps=num_warmup_steps, 
                                              num_training_steps=num_train_steps
                                              )
  
  loss_history = []
  no_improvement = 0
  for epoch in range(num_train_epochs):
    print(f'===== Epoch: {epoch} =====')

    model.train()
    train_loss = 0
    num_train_examples, num_train_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc='Train iter')):
      batch = tuple(t.to(device) for t in batch)
      input_ids, input_mask, segment_ids, label_ids = batch

      outputs = model(input_ids, attention_mask=input_mask, 
                      token_type_ids=segment_ids, labels=label_ids)
      
      loss = outputs[0]
      if gradient_accumulation_steps > 1:
        loss /= gradient_accumulation_steps
      loss.backward()
      train_loss += loss.item() 

      if (step + 1) % gradient_accumulation_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()

    val_loss, true, pred = evaluate(model, val_dataloader, device=device)
    print("loss history: ", loss_history)
    acc, precision, recall, f1 = metrics(true, pred)
    print(f'Val loss: {loss};  Acc: {acc};  Precision: {precision};  Recall: {recall};  F1: {f1}' )

    if len(loss_history) == 0 or val_loss < min(loss_history):
      no_improvement = 0
      model_to_save = model.module if hasattr(model, 'module') else model
      torch.save(model_to_save.state_dict(), output_model_file)
    else:
      no_improvement += 1
    
    if no_improvement >= patience:
      print(f'No improvement of val set for {patience} epochs')
      break
    loss_history.append(val_loss)

  return output_model_file, model

In [20]:
torch.cuda.empty_cache()
model_ru = BertForSequenceClassification.from_pretrained(
    RuBERT_MODEL, 
    num_labels=num_labels_to_classify
    )
model_ru.to(device)
model_file_name, model_ru = train(model_ru, 
                                  train_ru_dataloader, 
                                  val_ru_dataloader, 
                                  gradient_accumulation_steps=4,
                                  num_train_epochs=10
                                  )

===== Epoch: 0 =====


HBox(children=(FloatProgress(value=0.0, description='Train iter', max=314.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Eval', max=156.0, style=ProgressStyle(description_width='…


loss history:  []
Val loss: 0.16820742189884186;  Acc: 0.7269076305220884;  Precision: 0.6404494382022472;  Recall: 0.41204819277108434;  F1: 0.501466275659824
===== Epoch: 1 =====


HBox(children=(FloatProgress(value=0.0, description='Train iter', max=314.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Eval', max=156.0, style=ProgressStyle(description_width='…


loss history:  [0.6059691888781694]
Val loss: 0.08017263561487198;  Acc: 0.6763052208835342;  Precision: 0.5833333333333334;  Recall: 0.10120481927710843;  F1: 0.17248459958932238
===== Epoch: 2 =====


HBox(children=(FloatProgress(value=0.0, description='Train iter', max=314.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Eval', max=156.0, style=ProgressStyle(description_width='…


loss history:  [0.6059691888781694, 0.6169521182011335]
Val loss: 0.08755391836166382;  Acc: 0.6819277108433734;  Precision: 0.5736434108527132;  Recall: 0.1783132530120482;  F1: 0.27205882352941174
===== Epoch: 3 =====


HBox(children=(FloatProgress(value=0.0, description='Train iter', max=314.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Eval', max=156.0, style=ProgressStyle(description_width='…


loss history:  [0.6059691888781694, 0.6169521182011335, 0.6037711694072454]
Val loss: 0.14648455381393433;  Acc: 0.7048192771084337;  Precision: 0.5648021828103683;  Recall: 0.4987951807228916;  F1: 0.5297504798464492
===== Epoch: 4 =====


HBox(children=(FloatProgress(value=0.0, description='Train iter', max=314.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Eval', max=156.0, style=ProgressStyle(description_width='…


loss history:  [0.6059691888781694, 0.6169521182011335, 0.6037711694072454, 0.5797487019728391]
Val loss: 0.12377022951841354;  Acc: 0.7321285140562249;  Precision: 0.6452762923351159;  Recall: 0.43614457831325304;  F1: 0.5204888569374551
===== Epoch: 5 =====


HBox(children=(FloatProgress(value=0.0, description='Train iter', max=314.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Eval', max=156.0, style=ProgressStyle(description_width='…


loss history:  [0.6059691888781694, 0.6169521182011335, 0.6037711694072454, 0.5797487019728391, 0.5365200732380916]
Val loss: 0.03312281519174576;  Acc: 0.7365461847389558;  Precision: 0.5891393442622951;  Recall: 0.6927710843373494;  F1: 0.636766334440753
===== Epoch: 6 =====


HBox(children=(FloatProgress(value=0.0, description='Train iter', max=314.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Eval', max=156.0, style=ProgressStyle(description_width='…


loss history:  [0.6059691888781694, 0.6169521182011335, 0.6037711694072454, 0.5797487019728391, 0.5365200732380916, 0.6277515532878729]
Val loss: 0.007234439253807068;  Acc: 0.7485943775100402;  Precision: 0.628463476070529;  Recall: 0.6012048192771084;  F1: 0.6145320197044335
===== Epoch: 7 =====


HBox(children=(FloatProgress(value=0.0, description='Train iter', max=314.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Eval', max=156.0, style=ProgressStyle(description_width='…


loss history:  [0.6059691888781694, 0.6169521182011335, 0.6037711694072454, 0.5797487019728391, 0.5365200732380916, 0.6277515532878729, 0.6856980640918781]
Val loss: 0.001304924488067627;  Acc: 0.7285140562248996;  Precision: 0.5815677966101694;  Recall: 0.6614457831325301;  F1: 0.6189402480270575
===== Epoch: 8 =====


HBox(children=(FloatProgress(value=0.0, description='Train iter', max=314.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Eval', max=156.0, style=ProgressStyle(description_width='…


loss history:  [0.6059691888781694, 0.6169521182011335, 0.6037711694072454, 0.5797487019728391, 0.5365200732380916, 0.6277515532878729, 0.6856980640918781, 1.0516189121378539]
Val loss: 0.0009728372097015381;  Acc: 0.7421686746987952;  Precision: 0.6073059360730594;  Recall: 0.6409638554216868;  F1: 0.6236811254396248
No improvement of val set for 4 epochs


In [21]:
loss, true, pred = evaluate(model_ru, test_ru_dataloader, device)
acc, precision, recall, f1 = metrics(true, pred)
print(f'Loss: {loss};  Acc: {acc};  Precision: {precision};  Recall: {recall};  F1: {f1}' )

HBox(children=(FloatProgress(value=0.0, description='Eval', max=23.0, style=ProgressStyle(description_width='i…


Loss: 3.040614905564681;  Acc: 0.3926553672316384;  Precision: 0.32653061224489793;  Recall: 0.8495575221238938;  F1: 0.47174447174447176
