In [1]:
!pip install transformers



In [2]:
import torch
import numpy as np
import pandas as pd
from tqdm import trange
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from transformers.tokenization_bert import BertTokenizer
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
from transformers.modeling_bert import BertForSequenceClassification

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_labels_to_classify = 2
max_seq_length = 256
batch_size = 16
MBERT_MODEL = "bert-base-multilingual-uncased"

In [4]:
device

device(type='cuda')

In [5]:
tokenizer = BertTokenizer.from_pretrained(MBERT_MODEL)

model = BertForSequenceClassification.from_pretrained(
    MBERT_MODEL, 
    num_labels = num_labels_to_classify
    ).to(device)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

In [6]:
class BertInputItem(object):
  def __init__(self, text, input_ids, input_mask, segment_ids, label_ids):
    self.text = text
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.segment_ids = segment_ids
    self.label_ids = label_ids

def convert_examples_to_inputs(example_premise, example_hypothesis, 
                               example_label, max_seq_length, 
                               tokenizer, verbose=0):
  input_items = []
  examples = zip(example_premise, example_hypothesis, example_label)
  for ex_idx, (text_p, text_h, label) in enumerate(tqdm(examples)):
    # create list of token ids
    #input_ids = tokenizer.encode(f'[CLS] {text_p} [SEP] {text_h} [SEP]')

    encoded_cls = tokenizer.encode('[CLS]')
    encoded_sep = tokenizer.encode('[SEP]')

    input_ids_p = encoded_cls + tokenizer.encode(f'{text_p}') + encoded_sep
    input_ids_h = tokenizer.encode(f'{text_h}') + encoded_sep

    input_ids = input_ids_p + input_ids_h
    segment_ids = [0] * len(input_ids_p) + [1] * len(input_ids_h)
    input_mask = [1] * len(input_ids)

    if len(input_ids) > max_seq_length:
      input_ids = input_ids[:max_seq_length]
      segment_ids = segment_ids[:max_seq_length]
      input_mask = input_mask[:max_seq_length]


    #segment_ids = [0] * len(input_ids)

    #input_mask = [1] * len(input_ids)

    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    text = text_p + ' [SEP] ' + text_h

    input_items.append(
        BertInputItem(text=text,
                      input_ids=input_ids,
                      input_mask=input_mask,
                      segment_ids=segment_ids,
                      label_ids=label
                      )
    )
  return input_items

In [7]:
def get_data_loader(features, max_seq_length, batch_size=batch_size, shuffle=True):
  all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
  all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
  all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
  all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
  data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

  data_loader = DataLoader(data, shuffle=shuffle, batch_size=batch_size)
  return data_loader

In [8]:
from google.colab import drive 
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [9]:
train_path = 'gdrive/My Drive/data/csv/ru_nli_train.csv'
val_path = 'gdrive/My Drive/data/csv/New_ru_bool/train.csv'
test_path = 'gdrive/My Drive/data/csv/New_ru_bool/val.csv'

train = pd.read_csv(train_path)
val = pd.read_csv(val_path, )
test = pd.read_csv(test_path)

In [10]:
train.head()

Unnamed: 0,language,gold_label,sentence1_binary_parse,sentence2_binary_parse,sentence1_parse,sentence2_parse,sentence1,sentence2,promptID,pairID,genre,label1,label2,label3,label4,label5,sentence1_tokenized,sentence2_tokenized,match
0,ru,neutral,,,,,"И он сказал: Мама, я дома.","Он позвал маму, как только вышел из школьного ...",1,1,facetoface,neutral,contradiction,neutral,neutral,neutral,"И он сказал : Мама , я дома .","Он позвал маму , как только вышел из школьного...",True
1,ru,contradiction,,,,,"И он сказал: Мама, я дома.",Он не произнес ни слова.,1,2,facetoface,contradiction,contradiction,contradiction,contradiction,contradiction,"И он сказал : Мама , я дома .",Он не произнес ни слова .,True
2,ru,entailment,,,,,"И он сказал: Мама, я дома.","Он сказал матери, что уже добрался домой.",1,3,facetoface,entailment,entailment,neutral,entailment,entailment,"И он сказал : Мама , я дома .","Он сказал матери , что уже добрался домой .",True
3,ru,neutral,,,,,"Я не знал, что мне предстояло сделать и все та...","Я раньше не был в Вашингтоне, поэтому, получив...",6,16,facetoface,neutral,neutral,neutral,neutral,neutral,"Я не знал , что мне предстояло сделать и все т...","Я раньше не был в Вашингтоне , поэтому , получ...",True
4,ru,contradiction,,,,,"Я не знал, что мне предстояло сделать и все та...","Я точно знал, что мне нужно сделать, когда вхо...",6,17,facetoface,contradiction,contradiction,contradiction,contradiction,contradiction,"Я не знал , что мне предстояло сделать и все т...","Я точно знал , что мне нужно сделать , когда в...",True


In [11]:
val.head()

Unnamed: 0,passage,question,answer,idx
0,Успешное выполнение программы полёта «Аполлона...,была ли высадка на луну,True,0
1,Ива́н Алекса́ндрович Хлестако́в — главный геро...,был ли хлестаков ревизором,False,1
2,Украинское законодательство допускает владение...,разрешено ли автоматическое оружие в украине,True,2
3,(1) Мальчишка разбил стекло. – (2)Кто это сдел...,"Всегда ли мальчишка, разбивший стекло, труслив?",False,3
4,«День Конституции» — празднование принятия Кон...,был ли 12 декабря выходным днем,True,4


In [31]:
test.head()

Unnamed: 0,passage,question,answer,idx
0,В его основе — всё те же легенды о святом Нико...,есть ли дед мороз,0,0
1,"Троекуров подкупает губернский суд и, пользуяс...",был ли дубровский разбойником,1,1
2,"Репарация — особая функция клеток, заключающа...",был исследован участок цепи молекулы днк,1,2
3,.su — национальный домен верхнего уровня для ...,был ли интернет в ссср,1,3
4,"(1) Оставшись в одиночестве, считая своё полож...",Приняли ли Тома в деревне туземцев?,1,4


In [12]:
test_mapping = {False: 0, True: 1}
train_map = {'neutral':1, 'contradiction': 0, 'entailment':1}
train['gold_label'].replace(train_map, inplace=True)
test['answer'].replace(test_mapping, inplace=True)
val['answer'].replace(test_mapping, inplace=True)

In [13]:
train.shape, val.shape, test.shape

((2490, 19), (392, 4), (295, 4))

In [14]:
train.head(3)

Unnamed: 0,language,gold_label,sentence1_binary_parse,sentence2_binary_parse,sentence1_parse,sentence2_parse,sentence1,sentence2,promptID,pairID,genre,label1,label2,label3,label4,label5,sentence1_tokenized,sentence2_tokenized,match
0,ru,1,,,,,"И он сказал: Мама, я дома.","Он позвал маму, как только вышел из школьного ...",1,1,facetoface,neutral,contradiction,neutral,neutral,neutral,"И он сказал : Мама , я дома .","Он позвал маму , как только вышел из школьного...",True
1,ru,0,,,,,"И он сказал: Мама, я дома.",Он не произнес ни слова.,1,2,facetoface,contradiction,contradiction,contradiction,contradiction,contradiction,"И он сказал : Мама , я дома .",Он не произнес ни слова .,True
2,ru,1,,,,,"И он сказал: Мама, я дома.","Он сказал матери, что уже добрался домой.",1,3,facetoface,entailment,entailment,neutral,entailment,entailment,"И он сказал : Мама , я дома .","Он сказал матери , что уже добрался домой .",True


In [15]:
train.tail(3)

Unnamed: 0,language,gold_label,sentence1_binary_parse,sentence2_binary_parse,sentence1_parse,sentence2_parse,sentence1,sentence2,promptID,pairID,genre,label1,label2,label3,label4,label5,sentence1_tokenized,sentence2_tokenized,match
2487,ru,1,,,,,При том что утверждение является лучшим вариан...,Заявление дает более подробную информацию.,2498,7492,verbatim,neutral,entailment,neutral,neutral,contradiction,При том что утверждение является лучшим вариан...,Заявление дает более подробную информацию .,True
2488,ru,0,,,,,При том что утверждение является лучшим вариан...,Заявление не лучше.,2498,7493,verbatim,contradiction,contradiction,contradiction,contradiction,contradiction,При том что утверждение является лучшим вариан...,Заявление не лучше .,True
2489,ru,1,,,,,При том что утверждение является лучшим вариан...,Желательно обозначить утверждение.,2498,7494,verbatim,entailment,entailment,entailment,entailment,entailment,При том что утверждение является лучшим вариан...,Желательно обозначить утверждение .,True


In [16]:
test.head(3)

Unnamed: 0,passage,question,answer,idx
0,В его основе — всё те же легенды о святом Нико...,есть ли дед мороз,0,0
1,"Троекуров подкупает губернский суд и, пользуяс...",был ли дубровский разбойником,1,1
2,"Репарация — особая функция клеток, заключающа...",был исследован участок цепи молекулы днк,1,2


In [17]:
n_samples_train = train.shape[0]

In [18]:
train_features = convert_examples_to_inputs(train['sentence1'].values, 
                                            train['sentence2'].values,
                                            train['gold_label'].values,
                                            max_seq_length,
                                            tokenizer)

val_features = convert_examples_to_inputs(val['passage'].values, 
                                          val['question'].values,
                                          val['answer'].values,
                                          max_seq_length,
                                          tokenizer)

test_features = convert_examples_to_inputs(test['passage'].values, 
                                          test['question'].values,
                                          test['answer'].values,
                                          max_seq_length,
                                          tokenizer)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Token indices sequence length is longer than the specified maximum sequence length for this model (539 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (537 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (773 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (847 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (664 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Token indices sequence length is longer than the specified maximum sequence length for this model (701 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (623 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (617 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (592 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (629 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi




In [19]:
train_dataloader = get_data_loader(train_features, max_seq_length, shuffle=True)
val_dataloader = get_data_loader(val_features, max_seq_length, shuffle=True)
test_dataloader = get_data_loader(test_features, max_seq_length, shuffle=True)

In [20]:
def evaluate(model, dataloader, device='cpu'):
  model.eval()

  eval_loss = 0
  number_eval_steps = 0
  pred_labels, true_labels = [], []

  model.to(device)
  for step, batch in enumerate(tqdm(dataloader, desc='Eval')):
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, segment_ids, label_ids = batch

    with torch.no_grad():
      tmp_eval_loss, logits = model(input_ids, attention_mask=input_mask,
                                token_type_ids=segment_ids, labels=label_ids)
    
    outputs = np.argmax(logits.to('cpu'), axis=1)
    label_ids = label_ids.to('cpu').numpy()

    pred_labels += list(outputs)
    true_labels += list(label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    number_eval_steps += 1

  eval_loss /= number_eval_steps

  true_labels = np.array(true_labels)
  pred_labels = np.array(pred_labels)

  return eval_loss, true_labels, pred_labels

In [21]:
def metrics(y_true, y_preds):
  acc = accuracy_score(y_true, y_preds)
  precision = precision_score(y_true, y_preds)
  recall = recall_score(y_true, y_preds)
  f1 = f1_score(y_true, y_preds)
  return acc, precision, recall, f1

In [22]:
loss, true, pred = evaluate(model, test_dataloader, device)
acc, precision, recall, f1 = metrics(true, pred)
print(f'Loss: {loss};  Acc: {acc};  Precision: {precision};  Recall: {recall};  F1: {f1}' )

HBox(children=(FloatProgress(value=0.0, description='Eval', max=19.0, style=ProgressStyle(description_width='i…


Loss: 0.6850934279592413;  Acc: 0.6915254237288135;  Precision: 0.7529411764705882;  Recall: 0.8727272727272727;  F1: 0.808421052631579


In [23]:
print('model predictions: ', pred)

model predictions:  [1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1
 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1
 0 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 0 0 1
 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1]


In [24]:
def train(model, train_dataloader, val_dataloader, 
          output_model_file = '/tmp/mbert.bin', num_train_epochs=1, 
          patience=2, gradient_accumulation_steps=1, max_grad_norm=5, 
          warmup_proportion=0.1, batch_size=batch_size, learning_rate=5e-5):
  
  num_train_steps = int(n_samples_train / batch_size / gradient_accumulation_steps * num_train_epochs)
  num_warmup_steps = int(warmup_proportion * num_train_steps)
  
  param_optim = list(model.named_parameters())
  no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
  optim_grouped_params = [
    {'params': [p for n, p in param_optim if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optim if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0} 
  ]

  optimizer = AdamW(optim_grouped_params, lr=learning_rate, correct_bias=False)
  scheduler = get_linear_schedule_with_warmup(optimizer, 
                                              num_warmup_steps=num_warmup_steps, 
                                              num_training_steps=num_train_steps
                                              )
  
  loss_history = []
  no_improvement = 0
  for epoch in range(num_train_epochs):
    print(f'===== Epoch: {epoch} =====')

    model.train()
    train_loss = 0
    num_train_examples, num_train_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc='Train iter')):
      batch = tuple(t.to(device) for t in batch)
      input_ids, input_mask, segment_ids, label_ids = batch

      outputs = model(input_ids, attention_mask=input_mask, 
                      token_type_ids=segment_ids, labels=label_ids)
      
      loss = outputs[0]
      if gradient_accumulation_steps > 1:
        loss /= gradient_accumulation_steps
      loss.backward()
      train_loss += loss.item() 

      if (step + 1) % gradient_accumulation_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()

    val_loss, true, pred = evaluate(model, val_dataloader, device=device)
    print("loss history: ", loss_history)
    acc, precision, recall, f1 = metrics(true, pred)
    print(f'Val loss: {loss};  Acc: {acc};  Precision: {precision};  Recall: {recall};  F1: {f1}' )

    if len(loss_history) == 0 or val_loss < min(loss_history):
      no_improvement = 0
      model_to_save = model.module if hasattr(model, 'module') else model
      torch.save(model_to_save.state_dict(), output_model_file)
    else:
      no_improvement += 1
    
    if no_improvement >= patience:
      print(f'No improvement of val set for {patience} epochs')
      break
    loss_history.append(val_loss)

  return output_model_file, model

In [25]:
torch.cuda.empty_cache()
model_mbert = BertForSequenceClassification.from_pretrained(
    MBERT_MODEL, 
    num_labels=num_labels_to_classify
    )
model_mbert.to(device)
model_file_name, model_ru = train(model_mbert, 
                                  train_dataloader, 
                                  val_dataloader, 
                                  num_train_epochs=2,
                                  gradient_accumulation_steps=4,
                                  )

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

===== Epoch: 0 =====


HBox(children=(FloatProgress(value=0.0, description='Train iter', max=156.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Eval', max=25.0, style=ProgressStyle(description_width='i…


loss history:  []
Val loss: 0.20265641808509827;  Acc: 0.7755102040816326;  Precision: 0.7755102040816326;  Recall: 1.0;  F1: 0.8735632183908045
===== Epoch: 1 =====


HBox(children=(FloatProgress(value=0.0, description='Train iter', max=156.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Eval', max=25.0, style=ProgressStyle(description_width='i…


loss history:  [0.5838831639289856]
Val loss: 0.1734691709280014;  Acc: 0.7755102040816326;  Precision: 0.7755102040816326;  Recall: 1.0;  F1: 0.8735632183908045


In [26]:
loss, true, pred = evaluate(model_mbert, test_dataloader, device)
acc, precision, recall, f1 = metrics(true, pred)
print(f'Loss: {loss};  Acc: {acc};  Precision: {precision};  Recall: {recall};  F1: {f1}' )

HBox(children=(FloatProgress(value=0.0, description='Eval', max=19.0, style=ProgressStyle(description_width='i…


Loss: 0.5755522894231897;  Acc: 0.7457627118644068;  Precision: 0.7457627118644068;  Recall: 1.0;  F1: 0.8543689320388349


In [27]:
print('model predictions: ', pred)

model predictions:  [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [28]:
torch.cuda.empty_cache()
model_mbert = BertForSequenceClassification.from_pretrained(
    MBERT_MODEL, 
    num_labels=num_labels_to_classify
    )
model_mbert.to(device)
model_file_name, model_ru = train(model_mbert, 
                                  val_dataloader, 
                                  test_dataloader, 
                                  num_train_epochs=3,
                                  gradient_accumulation_steps=4,
                                  )

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

===== Epoch: 0 =====


HBox(children=(FloatProgress(value=0.0, description='Train iter', max=25.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Eval', max=19.0, style=ProgressStyle(description_width='i…


loss history:  []
Val loss: 0.14460758864879608;  Acc: 0.7457627118644068;  Precision: 0.7457627118644068;  Recall: 1.0;  F1: 0.8543689320388349
===== Epoch: 1 =====


HBox(children=(FloatProgress(value=0.0, description='Train iter', max=25.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Eval', max=19.0, style=ProgressStyle(description_width='i…


loss history:  [0.5890337793450606]
Val loss: 0.06663606315851212;  Acc: 0.7457627118644068;  Precision: 0.7457627118644068;  Recall: 1.0;  F1: 0.8543689320388349
===== Epoch: 2 =====


HBox(children=(FloatProgress(value=0.0, description='Train iter', max=25.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Eval', max=19.0, style=ProgressStyle(description_width='i…


loss history:  [0.5890337793450606, 0.5582394897937775]
Val loss: 0.14581742882728577;  Acc: 0.7457627118644068;  Precision: 0.7457627118644068;  Recall: 1.0;  F1: 0.8543689320388349


In [29]:
loss, true, pred = evaluate(model_mbert, test_dataloader, device)
acc, precision, recall, f1 = metrics(true, pred)
print(f'Loss: {loss};  Acc: {acc};  Precision: {precision};  Recall: {recall};  F1: {f1}' )

HBox(children=(FloatProgress(value=0.0, description='Eval', max=19.0, style=ProgressStyle(description_width='i…


Loss: 0.5762660315162257;  Acc: 0.7457627118644068;  Precision: 0.7457627118644068;  Recall: 1.0;  F1: 0.8543689320388349


In [30]:
print('model predictions: ', pred)

model predictions:  [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
