# This notebook evaluates the HerBERT sentence classifier without any fine-tuning.

In [1]:
# Necessary imports and installations
import pandas as pd
import numpy as np
!pip install transformers
import torch
from transformers import HerbertTokenizerFast, BertForMaskedLM, BertForSequenceClassification, pipeline
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

Before running the next cell, upload the data file named `data.csv` (CSV format).





In [2]:
# Load the dataset; later cells read its 'Treść' (text) and 'Kategoria' (label) columns.
df = pd.read_csv('data.csv')

In [3]:
# Display the loaded frame as a quick sanity check of its columns and row count.
df

Unnamed: 0,Treść,Kategoria
0,Księżna Diana i książę Karol pobrali się 39 la...,1
1,"Wielokrotnie zdradzana i niekochana Diana, któ...",1
2,Księżną Dianę po raz pierwszy zobaczył jesieni...,2
3,Ukończył Królewską Akademię Wojskową i został ...,2
4,"4 maja 2017 roku brytyjskie media donosiły, że...",2
...,...,...
2011,Jaguar wciąż nie atakował; cofnął się jeszcze ...,3
2012,Obaj przeciwnicy — król lasów i król mokradeł ...,3
2013,"Jaguar zamruczał zniecierpliwiony, skulił się ...",3
2014,"Kajman z kolei, zupełnie niefrasobliwy, świado...",3


In [4]:
# Randomly shuffles the rows; the shuffled frame is used later for testing.
# A fixed random_state makes the row order identical on every Restart & Run All.
df_ran = df.sample(frac=1, random_state=42)

In [8]:
# This class loads and prepares data to fit criteria necessary to use BERT
class ARCorpus(Dataset):
  """Tokenizes a labelled text frame and serves it through a ``DataLoader``.

  The frame must provide a 'Treść' column (text) and a 'Kategoria' column
  (category, one of the keys of ``label_dict``).

  Args:
    test_df: DataFrame holding the samples to encode.
    tokenizer: Optional tokenizer object exposing ``encode``, ``cls_token_id``
      and ``sep_token_id``. When omitted, the 'allegro/herbert-base-cased'
      tokenizer is loaded via ``HerbertTokenizerFast.from_pretrained``.
    max_text_tokens: Maximum number of text tokens kept per sample before the
      two special tokens are added (default 510 — presumably chosen to stay
      within a 512-position model limit; confirm against the model config).
  """

  def __init__(self, test_df, tokenizer=None, max_text_tokens=510):
    # Maps the 1-based categories found in the data onto 0-based class indices.
    self.label_dict = {1: 0, 2: 1, 3: 2, 4: 3}

    # Use the frame passed by the caller rather than a notebook-level global,
    # so the class works for any frame and on a fresh kernel.
    self.test_df = test_df
    self.max_text_tokens = max_text_tokens

    if tokenizer is None:
      tokenizer = HerbertTokenizerFast.from_pretrained('allegro/herbert-base-cased')
    self.tokenizer = tokenizer
    self.test_data = None
    self.init_data()

  def init_data(self):
    """Encode ``self.test_df`` and store the result in ``self.test_data``."""
    self.test_data = self.load_data(self.test_df)

  def load_data(self, df):
    """Encode every row of ``df`` into padded tensors.

    Args:
      df: DataFrame with 'Treść' and 'Kategoria' columns.

    Returns:
      A ``TensorDataset`` of (token ids, attention mask, segment ids, labels).

    Raises:
      ValueError: if ``df`` has no rows.
      KeyError: if a category is not a key of ``label_dict``.
    """
    token_ids = []
    mask_ids = []
    seg_ids = []
    y = []

    text_list = df['Treść'].to_list()
    label_list = df['Kategoria'].to_list()

    for (text, label) in zip(text_list, label_list):
      text_id = self.tokenizer.encode(
        text,
        add_special_tokens=False,
        max_length=self.max_text_tokens,
        truncation=True,
      )
      # Special tokens are added by hand: [CLS] text [SEP].
      pair_token_ids = [self.tokenizer.cls_token_id] + text_id + [self.tokenizer.sep_token_id]
      seq_len = len(pair_token_ids)

      token_ids.append(torch.tensor(pair_token_ids))
      # Single-sentence input: every position belongs to segment 0.
      seg_ids.append(torch.zeros(seq_len, dtype=torch.long))
      # Real tokens get mask 1; padding added below gets 0.
      mask_ids.append(torch.ones(seq_len, dtype=torch.long))
      y.append(self.label_dict[label])

    if not token_ids:
      raise ValueError("Cannot build a dataset from an empty DataFrame.")

    # Right-pad every sequence with zeros up to the longest sequence in the frame.
    token_ids = pad_sequence(token_ids, batch_first=True)
    mask_ids = pad_sequence(mask_ids, batch_first=True)
    seg_ids = pad_sequence(seg_ids, batch_first=True)
    y = torch.tensor(y)
    dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
    return dataset

  def get_data_loaders(self, batch_size = 32, shuffle=True):
    """Return a ``DataLoader`` over the encoded test data.

    Args:
      batch_size: Number of samples per batch.
      shuffle: Whether the loader reshuffles the samples on each pass.
    """
    # Data loader for test data
    test_loader = DataLoader(
      self.test_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    return test_loader

In [9]:
# Creates the corpus from the shuffled frame and gets a loader over the data prepared for classification
test_loader = ARCorpus(test_df=df_ran).get_data_loaders(batch_size=32, shuffle=True)

Downloading:   0%|          | 0.00/886k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/543k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/129 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/472 [00:00<?, ?B/s]

In [10]:
# Downloads a HerBERT model used for classification
from transformers import AutoModelForSequenceClassification

# Pick the GPU when one is available and fall back to the CPU otherwise,
# instead of hard-coding 'cuda' (which fails on CPU-only machines).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# num_labels=4 matches the four categories in the data.
model = AutoModelForSequenceClassification.from_pretrained(
    "allegro/herbert-base-cased", num_labels=4
).to(device)


Downloading:   0%|          | 0.00/624M [00:00<?, ?B/s]

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.sso.sso_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.sso.sso_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification 

In [11]:
# Function for computing the accuracy of predictions given labels
def multi_acc(y_pred, y_test):
  """Return the fraction of samples whose highest-scoring class equals the label.

  Args:
    y_pred: Tensor of class scores with the classes along dim 1.
    y_test: Tensor of true class indices, one per row of ``y_pred``.

  Returns:
    A scalar float tensor: correct predictions divided by ``y_test.size(0)``.
  """
  # log_softmax is monotonic, so arg-max over the raw scores selects the same class.
  predicted = y_pred.argmax(dim=1)
  n_correct = (predicted == y_test).sum().float()
  return n_correct / float(y_test.size(0))

In [12]:
# Alias the loaded model under the name used by the evaluation loop.
entailment_model = model

In [13]:
# Evaluation on test set

# Running sums, weighted by batch size so a smaller final batch does not skew the averages.
total_test_loss = 0
total_test_acc  = 0
total_test_samples = 0

labels_all = []
predictions_all = []
target_names = ["1", "2", "3", "4"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Make sure the model lives on the same device as the batches, and disable dropout.
entailment_model.to(device)
entailment_model.eval()

# Inference only: no_grad avoids building autograd graphs, saving memory and time.
with torch.no_grad():
  for pair_token_ids, mask_ids, seg_ids, y in test_loader:
    pair_token_ids = pair_token_ids.to(device)
    mask_ids = mask_ids.to(device)
    seg_ids = seg_ids.to(device)
    labels = y.to(device)

    outputs = entailment_model(pair_token_ids,
                               token_type_ids=seg_ids,
                               attention_mask=mask_ids,
                               labels=labels)
    # Read the fields by name instead of relying on the order of .values().
    loss = outputs.loss
    prediction = outputs.logits

    batch_size = labels.size(0)
    acc = multi_acc(prediction, labels)

    total_test_loss += loss.item() * batch_size
    total_test_acc  += acc.item() * batch_size
    total_test_samples += batch_size

    labels_all += labels.tolist()
    predictions_all += prediction.argmax(dim=1).tolist()

test_acc  = total_test_acc / total_test_samples
test_loss = total_test_loss / total_test_samples

print(f'Test_loss: {test_loss:.4f} test_acc: {test_acc:.4f}')

Test_loss: 1.3895 test_acc: 0.2545


In [14]:
# Results

from sklearn.metrics import classification_report

# `0 or 1` is a Python expression that always evaluates to 1, so the value is written
# out explicitly: metrics that would divide by zero are reported as 1.
# Passing `labels` keeps target_names aligned even if a class never occurs.
print(classification_report(
    labels_all,
    predictions_all,
    labels=list(range(len(target_names))),
    target_names=target_names,
    zero_division=1,
))

              precision    recall  f1-score   support

           1       0.27      0.43      0.33       501
           2       0.28      0.33      0.30       509
           3       0.11      0.00      0.00       502
           4       0.21      0.26      0.23       504

    accuracy                           0.25      2016
   macro avg       0.22      0.25      0.22      2016
weighted avg       0.22      0.25      0.22      2016

