In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/cd/38/c9527aa055241c66c4d785381eaf6f80a28c224cae97daa1f8b183b5fabb/transformers-2.9.0-py3-none-any.whl (635kB)
[K     |████████████████████████████████| 645kB 2.7MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 13.0MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 16.1MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/3b/88/49e772d686088e1278766ad68a463513642a2a877487decbd691dec02955/sentencepiece-0.1.90-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████

In [0]:
import torch
import numpy as np
import pandas as pd
from tqdm import trange
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from transformers.tokenization_bert import BertTokenizer
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
from transformers.modeling_bert import BertForSequenceClassification

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_labels_to_classify = 2
max_seq_length = 128
RuBERT_MODEL = "DeepPavlov/rubert-base-cased"


In [4]:
device

device(type='cuda')

In [5]:
tokenizer = BertTokenizer.from_pretrained(RuBERT_MODEL)

model = BertForSequenceClassification.from_pretrained(
    RuBERT_MODEL, 
    num_labels = num_labels_to_classify
    )
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1649718.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=642.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=711456796.0, style=ProgressStyle(descri…




BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [0]:
class BertInputItem(object):
  def __init__(self, text, input_ids, input_mask, segment_ids, label_ids):
    self.text = text
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.segment_ids = segment_ids
    self.label_ids = label_ids

def convert_examples_to_inputs(example_text, example_label, max_seq_length, 
                               tokenizer, verbose=0):
  input_items = []
  examples = zip(example_text, example_label)
  for ex_idx, (text, label) in enumerate(tqdm(examples)):
    # create list of token ids
    input_ids = tokenizer.encode(f'[CLS] {text} [SEP]')
    if len(input_ids) > max_seq_length:
      input_ids = input_ids[:max_seq_length]
    
    segment_ids = [0] * len(input_ids)

    input_mask = [1] * len(input_ids)

    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    input_items.append(
        BertInputItem(text=text,
                      input_ids=input_ids,
                      input_mask=input_mask,
                      segment_ids=segment_ids,
                      label_ids=label
                      )
    )
  return input_items

In [0]:
def get_data_loader(features, max_seq_length, batch_size=64, shuffle=True):
  all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
  all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
  all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
  all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
  data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

  data_loader = DataLoader(data, shuffle=shuffle, batch_size=batch_size)
  return data_loader

In [8]:
from google.colab import drive 
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
train_path = 'gdrive/My Drive/data/csv/train.csv'
val_path = 'gdrive/My Drive/data/csv/val.csv'
test_path = 'gdrive/My Drive/data/csv/test.csv'

train_en_path = 'gdrive/My Drive/data/csv/train_en_tweet.csv'

train = pd.read_csv(train_path)
train_en = pd.read_csv(train_en_path)
val = pd.read_csv(val_path)
test = pd.read_csv(test_path)

In [10]:
train.head()

Unnamed: 0,text,label
0,RT @lizochka_8882: Ребят я влюбилась в него(((...,1
1,"RT @maximus880: @koteikina_me ай ,кошечка ,Арр...",0
2,"@VodVodyasova мы скучашки всеее)давай,борись,в...",0
3,"@AppleJesus исправьте цену на iPhone 5C 32Gb, ...",0
4,RT @tezyjojykyc: ну и кто сколько пожертвовал ...,0


In [11]:
train_en.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,is so sad for my APL frie...,1
1,1,I missed the New Moon trail...,1
2,2,omg its already 7:30 :O,0
3,3,.. Omgaga. Im sooo im gunna CRy. I'...,1
4,4,i think mi bf is cheating on me!!! ...,1


In [0]:
n_samples_train = train.shape[0]
n_samples_train_en = train_en.shape[0]

In [15]:
train_features = convert_examples_to_inputs(train['text'].values, 
                                            train['label'].values,
                                            max_seq_length,
                                            tokenizer)

val_features = convert_examples_to_inputs(val['text'].values, 
                                          val['label'].values,
                                          max_seq_length,
                                          tokenizer)

test_features = convert_examples_to_inputs(test['text'].values, 
                                          test['label'].values,
                                          max_seq_length,
                                          tokenizer)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [0]:
train_ru_dataloader = get_data_loader(train_features, max_seq_length, shuffle=True)
val_ru_dataloader = get_data_loader(val_features, max_seq_length, shuffle=True)
test_ru_dataloader = get_data_loader(test_features, max_seq_length, shuffle=True)

In [0]:
def evaluate(model, dataloader, device='cpu'):
  model.eval()

  eval_loss = 0
  number_eval_steps = 0
  pred_labels, true_labels = [], []

  model.to(device)
  for step, batch in enumerate(tqdm(dataloader, desc='Eval')):
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, segment_ids, label_ids = batch

    with torch.no_grad():
      tmp_eval_loss, logits = model(input_ids, attention_mask=input_mask,
                                token_type_ids=segment_ids, labels=label_ids)
    
    outputs = np.argmax(logits.to('cpu'), axis=1)
    label_ids = label_ids.to('cpu').numpy()

    pred_labels += list(outputs)
    true_labels += list(label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    number_eval_steps += 1

  eval_loss /= number_eval_steps

  true_labels = np.array(true_labels)
  pred_labels = np.array(pred_labels)

  return eval_loss, true_labels, pred_labels

In [0]:
def metrics(y_true, y_preds):
  acc = accuracy_score(y_true, y_preds)
  precision = precision_score(y_true, y_preds)
  recall = recall_score(y_true, y_preds)
  f1 = f1_score(y_true, y_preds)
  return acc, precision, recall, f1

## pretrained M-BERT  Rus Classification


In [20]:
loss, true, pred = evaluate(model, test_ru_dataloader, device)
acc, precision, recall, f1 = metrics(true, pred)
print(f'Loss: {loss};  Acc: {acc};  Precision: {precision};  Recall: {recall};  F1: {f1}' )

HBox(children=(FloatProgress(value=0.0, description='Eval', max=355.0, style=ProgressStyle(description_width='…


Loss: 0.6929283221003035;  Acc: 0.5097866337506612;  Precision: 0.7086330935251799;  Recall: 0.017532929868280527;  F1: 0.034219211394823694


In [0]:
def train(model, train_dataloader, val_dataloader, 
          output_model_file = '/tmp/mbert.bin', num_train_epochs=20, 
          patience=4, gradient_accumulation_steps=1, max_grad_norm=5, 
          warmup_proportion=0.1, batch_size=64, learning_rate=5e-5):
  
  num_train_steps = int(n_samples_train_en / batch_size / gradient_accumulation_steps * num_train_epochs)
  num_warmup_steps = int(warmup_proportion * num_train_steps)
  
  param_optim = list(model.named_parameters())
  no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
  optim_grouped_params = [
    {'params': [p for n, p in param_optim if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optim if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0} 
  ]

  optimizer = AdamW(optim_grouped_params, lr=learning_rate, correct_bias=False)
  scheduler = get_linear_schedule_with_warmup(optimizer, 
                                              num_warmup_steps=num_warmup_steps, 
                                              num_training_steps=num_train_steps
                                              )
  
  loss_history = []
  no_improvement = 0
  for epoch in range(num_train_epochs):
    print(f'===== Epoch: {epoch} =====')

    model.train()
    train_loss = 0
    num_train_examples, num_train_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc='Train iter')):
      batch = tuple(t.to(device) for t in batch)
      input_ids, input_mask, segment_ids, label_ids = batch

      outputs = model(input_ids, attention_mask=input_mask, 
                      token_type_ids=segment_ids, labels=label_ids)
      
      loss = outputs[0]
      if gradient_accumulation_steps > 1:
        loss /= gradient_accumulation_steps
      loss.backward()
      train_loss += loss.item() 

      if (step + 1) % gradient_accumulation_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()

    val_loss, true, pred = evaluate(model, val_dataloader, device=device)
    print("loss history: ", loss_history)
    acc, precision, recall, f1 = metrics(true, pred)
    print(f'Val loss: {loss};  Acc: {acc};  Precision: {precision};  Recall: {recall};  F1: {f1}' )

    if len(loss_history) == 0 or val_loss < min(loss_history):
      no_improvement = 0
      model_to_save = model.module if hasattr(model, 'module') else model
      torch.save(model_to_save.state_dict(), output_model_file)
    else:
      no_improvement += 1
    
    if no_improvement >= patience:
      print(f'No improvement of val set for {patience} epochs')
      break
    loss_history.append(val_loss)

  return output_model_file, model

# RuBERT fine-tuned on russian data to russian

In [22]:
torch.cuda.empty_cache()
model_ru = BertForSequenceClassification.from_pretrained(
    RuBERT_MODEL, 
    num_labels=num_labels_to_classify
    )
model_ru.to(device)
model_file_name, model_ru = train(model_ru, 
                                  train_ru_dataloader, 
                                  val_ru_dataloader, 
                                  gradient_accumulation_steps=4,
                                  num_train_epochs=1
                                  )

===== Epoch: 0 =====


HBox(children=(FloatProgress(value=0.0, description='Train iter', max=2836.0, style=ProgressStyle(description_…

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)





HBox(children=(FloatProgress(value=0.0, description='Eval', max=355.0, style=ProgressStyle(description_width='…


loss history:  []
Val loss: 9.965454955818132e-05;  Acc: 0.999030110655557;  Precision: 1.0;  Recall: 0.998017839444995;  F1: 0.9990079365079365


In [23]:
 loss, true, pred = evaluate(model_ru, test_ru_dataloader, device)
acc, precision, recall, f1 = metrics(true, pred)
print(f'Loss: {loss};  Acc: {acc};  Precision: {precision};  Recall: {recall};  F1: {f1}' )

HBox(children=(FloatProgress(value=0.0, description='Eval', max=355.0, style=ProgressStyle(description_width='…


Loss: 0.0036176228258778457;  Acc: 0.9992946570269794;  Precision: 1.0;  Recall: 0.9985760056959773;  F1: 0.9992874955468471
