<a href="https://colab.research.google.com/github/KosukhaOlexandr/reactions_prediction/blob/main/rp_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import packages**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import defaultdict
from torch.optim import Adam
from tqdm import tqdm

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

!pip install transformers
!pip install sentencepiece
import transformers
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
from transformers import BertModel, BertTokenizer, get_polynomial_decay_schedule_with_warmup
from transformers import RobertaModel, RobertaTokenizer
from transformers import AutoModel, AutoTokenizer
from transformers import DebertaV2Tokenizer, DebertaV2Model




In [None]:
!ls

concatenated_pos_neg_bert.csv  sample_data


# **Data Preprocessing and visualizations**


## Load the data

In [None]:
#from google.colab import drive
#drive.mount('/content/drive/')

In [None]:
#!pip install gdown
import gdown

# Google Drive links to the available dataset files. Only `all_bert_url` is
# downloaded below; `all_url` and `tsn_url` are defined but not used in this cell.
all_url = 'https://drive.google.com/uc?id=1Q3ghJ2b4gCtcVULpNIGMyuQhZn9rwMX8'
tsn_url = 'https://drive.google.com/uc?id=1jGdkM0fJ9t64GNGIvQ9iydTfZxAc8QY8'
all_bert_url = 'https://drive.google.com/uc?id=1EtYz5GcnunPQw3pUFQVlBuloxbazYrfH'
# Local file name the CSV is saved under; the loading cell reads this same path.
output = "concatenated_pos_neg_bert.csv"
gdown.download(all_bert_url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1EtYz5GcnunPQw3pUFQVlBuloxbazYrfH
To: /content/concatenated_pos_neg_bert.csv
100%|██████████| 41.1M/41.1M [00:00<00:00, 226MB/s]


'concatenated_pos_neg_bert.csv'

In [None]:
import pandas as pd

# Read the downloaded CSV, using its first column as the index.
data = pd.read_csv('concatenated_pos_neg_bert.csv', index_col = 0)
# NOTE(review): this is not the last expression of the cell, so the preview is not displayed.
data.head(5)

# Keep only the first 1000 rows; `data` is rebound, so the full frame is no longer reachable.
data = data.iloc[:1000,:]

In [None]:
data.shape

(1000, 2)

## Config


In [None]:
class Config:
    """Central place for every tunable value used by the notebook.

    All settings are plain class attributes, so they are read as
    ``Config.<name>`` without instantiating the class.
    """

    # ---- Model ----
    # Supported values:
    #   "openai-gpt"
    #   "roberta-base"
    #   "model/"                        (lang-uk BERT)
    #   "microsoft/deberta-v2-xlarge"   (DeBERTa)
    model_name = "microsoft/deberta-v2-xlarge"

    # DeBERTa gets its own sequence length, layer sizes and batch size;
    # every other model uses the generic defaults.
    if model_name == "microsoft/deberta-v2-xlarge":
        max_len, hidden_size, hidden_size2, batch_size = 128, 1536, 2, 1
    else:
        max_len, hidden_size, hidden_size2, batch_size = 512, 768, 512, 16

    # ---- Data preparation ----
    test_fraction = 0.1
    validation_fraction = 0.1
    num_workers = 0
    classes = (1, 0)
    # Maps each raw class value to its position in `classes`.
    tags_map = dict(zip(classes, range(len(classes))))
    logdir = 'logdir'

    # ---- Training ----
    seed = 21
    epochs = 10
    learning_rate = 1e-5
    num_classes = len(classes)
    # Use the GPU when one is visible, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def set_seed(seed):
    """Seed the random number generators so that runs are repeatable.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch.

    Args:
        seed: Integer seed applied to all three generators.
    """
    import random  # local import: only this helper needs the module

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

set_seed(Config.seed)

## Data Preprocessing

In [None]:
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel

# Pick the tokenizer / encoder pair that matches Config.model_name.
# Any name that is not handled explicitly falls through to the BERT classes.
# `tokenizer` and `base_model` are notebook-level names used by later cells.
if Config.model_name == 'openai-gpt':
  tokenizer = OpenAIGPTTokenizer.from_pretrained(Config.model_name, max_length=Config.max_len)
  base_model = OpenAIGPTModel.from_pretrained(Config.model_name)
elif Config.model_name == 'roberta-base':
  tokenizer = RobertaTokenizer.from_pretrained(Config.model_name, truncation=True)
  base_model = RobertaModel.from_pretrained(Config.model_name)
elif Config.model_name == 'microsoft/deberta-v2-xlarge':
  tokenizer = DebertaV2Tokenizer.from_pretrained(Config.model_name)
  base_model = DebertaV2Model.from_pretrained(Config.model_name)
else:
  tokenizer = BertTokenizer.from_pretrained(Config.model_name)
  base_model = BertModel.from_pretrained(Config.model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/deberta-v2-xlarge were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Tokenize one sample sentence to inspect the sub-word pieces and the ids they map to.
sample_text = 'оголошена повітряна тривога будь ласка перейдіть до укриттів'
tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f'Original sentence: {sample_text}')
print(f'Tokenized sentence: {tokens}')
print(f'Token IDs: {token_ids}')

Original sentence: оголошена повітряна тривога будь ласка перейдіть до укриттів
Tokenized sentence: ['▁', 'ого', 'ло', 'ше', 'на', '▁по', 'в', 'і', 'тр', 'я', 'на', '▁', 'три', 'во', 'га', '▁буд', 'ь', '▁', 'ла', 'ска', '▁пере', 'йд', 'і', 'ть', '▁до', '▁у', 'кр', 'ит', 'т', 'ів']
Token IDs: [250, 59058, 43908, 71786, 30460, 14826, 19464, 9395, 87730, 21570, 30460, 250, 113739, 51502, 52279, 118832, 35394, 250, 26565, 92592, 71312, 118697, 9395, 50556, 38933, 19798, 86114, 40396, 23183, 119356]


In [None]:
# The openai-gpt branch registers an explicit '[PAD]' token before padding is requested below.
if Config.model_name == 'openai-gpt':
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Encode the sample sentence into PyTorch tensors padded to 32 positions.
# NOTE(review): no `truncation` argument is passed, so a longer sentence would
# presumably not be cut to 32 tokens — confirm against the tokenizer docs.
encoding = tokenizer.encode_plus(
  sample_text,
  max_length=32,
  add_special_tokens=True,
  return_token_type_ids=False,
  padding='max_length',
  return_attention_mask=True,
  return_tensors='pt',
)

# `input_ids` and `attn_mask` are reused by the model smoke-test cell further down.
input_ids = encoding['input_ids']
attn_mask = encoding['attention_mask']
ids_to_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

print(f'Input IDs: {input_ids}')
print(f'Attention mask: {attn_mask}')
print(f'Padded text length: {len(input_ids[0])}')

Input IDs: tensor([[     1,    250,  59058,  43908,  71786,  30460,  14826,  19464,   9395,
          87730,  21570,  30460,    250, 113739,  51502,  52279, 118832,  35394,
            250,  26565,  92592,  71312, 118697,   9395,  50556,  38933,  19798,
          86114,  40396,  23183, 119356,      2]])
Attention mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])
Padded text length: 32


# **Choosing Sequence Length**

BERT-style models work with fixed-length input sequences, so we need to choose the maximum sequence length that every message will be padded or truncated to.





In [None]:
# VALIDATION_FIELD[func] get_token_lens

def get_token_lens(df):
    """Return the encoded length of every message in ``df``.

    Each ``msg_text`` value is encoded with the notebook-level ``tokenizer``
    and truncated to ``Config.max_len``, so no returned length exceeds
    ``Config.max_len``.

    Mapping over the ``msg_text`` column (instead of applying a function
    row by row over the whole frame) keeps the result one value per row,
    including when ``df`` has no rows.

    Args:
        df: DataFrame with a ``msg_text`` column of strings.

    Returns:
        numpy.ndarray: one token count per row, in row order.
    """
    return df.msg_text.map(
        lambda text: len(tokenizer.encode(text, max_length=Config.max_len, truncation=True))
    ).to_numpy()

In [None]:
# Work on a copy of the first five rows so the experiments below do not touch `data`.
sample_df = data.iloc[:5].copy()
sample_df_token_lens = get_token_lens(sample_df)
print('Token lens: ', sample_df_token_lens)

Token lens:  [ 87  92 128 128 128]


In [None]:
def select_rows_with_required_token_lens(df, max_len=Config.max_len):
    """Keep only the rows whose encoded length is strictly below ``max_len``.

    The lengths come from ``get_token_lens``, which truncates at
    ``Config.max_len``; with the default ``max_len`` the strict ``<``
    therefore drops exactly the rows that reached that cap.

    The input frame is left untouched: the ``token_len`` column is added to
    a new frame rather than written into ``df``.

    Args:
        df: DataFrame with a ``msg_text`` column.
        max_len: Exclusive upper bound on the token count. The default is
            read from ``Config.max_len`` once, when the function is defined.

    Returns:
        pandas.DataFrame: the surviving rows with an extra ``token_len``
        column and a fresh 0..n-1 index.
    """
    with_lens = df.assign(token_len=get_token_lens(df))
    return with_lens.loc[with_lens.token_len < max_len].reset_index(drop=True)

In [None]:
# Filter the five-row sample with the default length bound and display the result.
new_sample_df = select_rows_with_required_token_lens(sample_df)
new_sample_df

Unnamed: 0,msg_text,reaction_type,token_len
0,"Чий борщ? У п’ятницю, липня, ЮНЕСКО вирішить д...",1,87
1,Зруйнований міст у Демидові на Київщині планую...,1,92


# **Train/Test/Val Split**

In [None]:
# Filter a copy of the data so `data` itself is not modified by the length filter.
new_data = select_rows_with_required_token_lens(data.copy())

# `train_to_rest` is the combined validation + test share held out of training;
# `test_to_valid` is the share of that held-out part that becomes validation data.
train_to_rest = Config.validation_fraction + Config.test_fraction
test_to_valid = Config.validation_fraction / train_to_rest

train_df, rest = train_test_split(new_data, random_state=Config.seed, test_size=train_to_rest)

# Second split: the first returned frame is kept as test data, the `test_size` part as validation data.
test_df, valid_df = train_test_split(rest, random_state=Config.seed,test_size=test_to_valid)

print('Train data:', train_df.shape[0])
print('Valid data:', valid_df.shape[0])
print('Test data:', test_df.shape[0])

Train data: 392
Valid data: 49
Test data: 49


# **Dataset class and Dataloaders**
We define a custom PyTorch `Dataset` that returns each tokenized news message together with its encoded reaction label as one data sample.


In [None]:
class NewsDataset(torch.utils.data.Dataset):
    """Pairs every tokenized message with its encoded reaction label.

    All messages are encoded once, in the constructor, with the
    notebook-level ``tokenizer``; indexing only looks the results up.
    """

    def __init__(self, df, max_len=Config.max_len):
        """Encode the labels and messages of ``df``.

        Args:
            df: DataFrame with ``msg_text`` and ``reaction_type`` columns.
            max_len: Length every message is padded / truncated to.
        """
        encode_options = dict(
            padding='max_length',
            max_length=max_len,
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True,
        )
        # Raw reaction values are translated to class indices through Config.tags_map.
        self.tags = df.reaction_type.map(Config.tags_map).to_numpy()
        # One tokenizer output per message, stored in a NumPy array.
        self.quotes = df.msg_text.apply(tokenizer.encode_plus, **encode_options).to_numpy()

    def __len__(self):
        """Number of samples held by the dataset."""
        return len(self.quotes)

    def get_batch_tags(self, idx):
        """Return the encoded label stored at ``idx``."""
        return self.tags[idx]

    def get_batch_quotes(self, idx):
        """Return the tokenizer output stored at ``idx``."""
        return self.quotes[idx]

    def __getitem__(self, idx):
        """Return ``(tokenizer output, encoded label)`` for position ``idx``."""
        return self.get_batch_quotes(idx), self.get_batch_tags(idx)

In [None]:
# Build a tiny dataset from the five-row sample (sequences cut to 10 tokens) and inspect one item.
news_ds = NewsDataset(sample_df, max_len=10)
news_data, tag = news_ds[3]
print(news_data)
print("Input_ids:", news_data['input_ids'])
#print("Token_type_ids:", quote_data['token_type_ids'])
print("Attention_mask:", news_data['attention_mask'])
print("Edcoded tag:", tag)

{'input_ids': tensor([[     1,  66974,  86114,  31290,  23384, 119356,  62182,  67937, 116002,
              2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Input_ids: tensor([[     1,  66974,  86114,  31290,  23384, 119356,  62182,  67937, 116002,
              2]])
Attention_mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Edcoded tag: 0


In [None]:
# Tokenize each split once up front.
train_data = NewsDataset(train_df)
valid_data = NewsDataset(valid_df)
test_data = NewsDataset(test_df)

# Batch size and worker count both come from Config, so the loaders follow the
# configured `num_workers` instead of a separate hard-coded value.
loader_options = dict(batch_size=Config.batch_size, num_workers=Config.num_workers)

# Only the training loader shuffles; validation and test keep their order.
train_dataloader = DataLoader(train_data, shuffle=True, **loader_options)
valid_dataloader = DataLoader(valid_data, shuffle=False, **loader_options)
test_dataloader = DataLoader(test_data, shuffle=False, **loader_options)

# **Creating the model class**

In [None]:
class BertClassifier(nn.Module):
    """Transformer encoder followed by a small two-layer classification head.

    The token embeddings produced by the encoder are mean-pooled over the
    non-padded positions, passed through ``fc1 -> ReLU -> Dropout -> fc2``
    and turned into log-probabilities with ``LogSoftmax``.
    """

    def __init__(self, hidden_size1, hidden_size2, num_classes, encoder=None):
        """Build the classifier.

        Args:
            hidden_size1: Width of the encoder's token embeddings (input of ``fc1``).
            hidden_size2: Width of the hidden layer between ``fc1`` and ``fc2``.
            num_classes: Number of output classes.
            encoder: Optional encoder module. When omitted, the notebook-level
                ``base_model`` is used. That object is NOT copied, so every
                classifier built without an explicit ``encoder`` shares the
                same encoder weights (and any fine-tuning already applied).
        """
        super().__init__()
        self.base_m = base_model if encoder is None else encoder
        self.dropout = nn.Dropout(0.35)
        self.fc1 = nn.Linear(hidden_size1, hidden_size2)
        self.fc2 = nn.Linear(hidden_size2, num_classes)
        self.relu = nn.ReLU()
        # Named `softmax` for state/attribute compatibility, but it is a LogSoftmax.
        self.softmax = nn.LogSoftmax(dim=1)

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings over the positions where the mask is set.

        Args:
            model_output: Encoder output; its first element holds the token embeddings.
            attention_mask: Mask with one entry per token position.

        Returns:
            torch.Tensor: one pooled embedding per sequence.
        """
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

        # The clamp keeps the division finite when a mask contains no set positions.
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def forward(self, input_id, mask):
        """Return per-class log-probabilities for a batch of encoded texts.

        Args:
            input_id: Token ids passed to the encoder.
            mask: Attention mask matching ``input_id``.

        Returns:
            torch.Tensor: log-probabilities, one row per sequence.
        """
        seq_output = self.base_m(input_id, attention_mask=mask, return_dict=False)
        text_emb = self.mean_pooling(seq_output, mask)

        hidden = self.dropout(self.relu(self.fc1(text_emb)))
        return self.softmax(self.fc2(hidden))

In [None]:
# Smoke test: push the single encoded sample sentence through a freshly built classifier.
# NOTE(review): the model is never switched to eval mode here and it contains a Dropout
# layer, so the two forward calls below may not return identical values — confirm.
set_seed(Config.seed)
bert_Classifier = BertClassifier(Config.hidden_size, Config.hidden_size2, Config.num_classes)
print(input_ids.shape, attn_mask.shape)
print('\nOutput:', bert_Classifier(input_ids, attn_mask)) # input_ids: [1,32], attn_mask:[1,32]
print('Output shape:', bert_Classifier(input_ids, attn_mask).shape)

torch.Size([1, 32]) torch.Size([1, 32])

Output: tensor([[-0.3710, -1.1712]], grad_fn=<LogSoftmaxBackward0>)
Output shape: torch.Size([1, 2])


In [None]:
bert_Classifier

BertClassifier(
  (base_m): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1536, padding_idx=0)
      (LayerNorm): LayerNorm((1536,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1536, out_features=1536, bias=True)
              (key_proj): Linear(in_features=1536, out_features=1536, bias=True)
              (value_proj): Linear(in_features=1536, out_features=1536, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1536, out_features=1536, bias=True)
              (LayerNorm): LayerNorm((1536,), eps=1e-07, elementwise_affine=True)
   

In [None]:
!pip install torchmetrics



In [None]:
from torchmetrics.classification import BinaryF1Score
from sklearn.metrics import f1_score

# **Training the model**


In [None]:
def train(model, optimizer, scheduler, criterion, train_loader, device=Config.device):
    """Run one training epoch over ``train_loader``.

    Args:
        model: Classifier called as ``model(input_id, mask)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        criterion: Loss called as ``criterion(output, labels)``.
        train_loader: Iterable of ``(encoded inputs, labels)`` batches, where the
            encoded inputs expose ``'input_ids'`` and ``'attention_mask'``.
        device: Device the batches are moved to.

    Returns:
        tuple: ``(loss per sample, accuracy, F1 score)`` for the epoch.
    """
    model.train()
    pbar = tqdm(train_loader, desc='Iterating over train data')

    # A mean-reduced criterion returns one average per batch; it has to be
    # weighted by the batch size before the epoch total is divided by the
    # number of samples, otherwise the loss shrinks with the batch size.
    loss_is_batch_mean = getattr(criterion, 'reduction', 'mean') == 'mean'

    total_loss_train = 0.0
    total_acc_train = 0
    num_samples = 0
    f1_labels = []
    f1_outputs = []
    for train_input, train_label in pbar:
        train_label = train_label.to(device)
        # Drop the axis at position 1 when it has size 1 (no-op otherwise).
        mask = torch.squeeze(train_input['attention_mask'], 1).to(device)
        input_id = torch.squeeze(train_input['input_ids'], 1).to(device)

        # forward + backward
        model.zero_grad()
        output = model(input_id, mask)
        batch_loss = criterion(output, train_label)
        batch_loss.backward()

        batch_size = train_label.size(0)
        loss_value = batch_loss.item()
        if loss_is_batch_mean:
            loss_value *= batch_size
        total_loss_train += loss_value
        num_samples += batch_size

        labels_np = train_label.cpu().detach().numpy()
        out_max = torch.argmax(output, dim=1).cpu().detach().numpy()
        total_acc_train += (out_max == labels_np).sum()

        f1_labels.append(labels_np)
        f1_outputs.append(out_max)

        # optimize
        optimizer.step()
        scheduler.step()

    f1_labels = np.concatenate(f1_labels)
    f1_outputs = np.concatenate(f1_outputs)
    # NOTE(review): F1 is computed on the class indices produced by Config.tags_map
    # (raw class 1 -> index 0, raw class 0 -> index 1) with f1_score's default
    # positive label — confirm that this is the class you want to score.
    f1 = f1_score(f1_labels, f1_outputs)

    return total_loss_train / num_samples, total_acc_train / num_samples, f1

In [None]:
set_seed(Config.seed)
model = BertClassifier(Config.hidden_size, Config.hidden_size2, Config.num_classes).to(Config.device)
optimizer = Adam(model.parameters(), lr=Config.learning_rate)
# try stochastic
scheduler = get_polynomial_decay_schedule_with_warmup(optimizer, num_warmup_steps=4,
              num_training_steps=int(len(train_dataloader) * Config.epochs) , power=2)

criterion = nn.CrossEntropyLoss()

# Trial epoch on the TRAINING split. The held-out test loader must never be
# passed to train(): train() steps the optimizer, so doing so would fit the
# model on the data later used for the final evaluation.
loss, acc, f1 = train(model, optimizer, scheduler ,criterion, train_dataloader)
print(f'\nTrain Loss: {loss: .3f} | Train Accuracy: {acc: .3f} | Train F1-score: {f1: .3f}')

OutOfMemoryError: ignored

## Evaluation loop

In [None]:
def evaluate(model, criterion, eval_loader, device=Config.device):
    """Score ``model`` on ``eval_loader`` without updating any weights.

    Args:
        model: Classifier called as ``model(input_id, mask)``.
        criterion: Loss called as ``criterion(output, labels)``.
        eval_loader: Iterable of ``(encoded inputs, labels)`` batches, where the
            encoded inputs expose ``'input_ids'`` and ``'attention_mask'``.
        device: Device the batches are moved to.

    Returns:
        tuple: ``(loss per sample, accuracy, F1 score)`` over the loader.
    """
    model.eval()

    # A mean-reduced criterion returns one average per batch; weight it by the
    # batch size so that dividing by the sample count gives a per-sample loss.
    loss_is_batch_mean = getattr(criterion, 'reduction', 'mean') == 'mean'

    total_acc_val = 0
    total_loss_val = 0.0
    num_samples = 0

    f1_labels = []
    f1_outputs = []

    with torch.no_grad():
        pbar = tqdm(eval_loader, desc='Iterating over evaluation data')
        for val_input, val_label in pbar:
            # pass to device
            val_label = val_label.to(device)
            # Drop the axis at position 1 when it has size 1 (no-op otherwise).
            mask = torch.squeeze(val_input['attention_mask'], 1).to(device)
            input_id = torch.squeeze(val_input['input_ids'], 1).to(device)

            # forward
            output = model(input_id, mask)

            batch_size = val_label.size(0)
            loss_value = criterion(output, val_label).item()
            if loss_is_batch_mean:
                loss_value *= batch_size
            total_loss_val += loss_value
            num_samples += batch_size

            labels_np = val_label.cpu().detach().numpy()
            out_max = torch.argmax(output, dim=1).cpu().detach().numpy()
            total_acc_val += (out_max == labels_np).sum()

            f1_labels.append(labels_np)
            f1_outputs.append(out_max)

    f1_labels = np.concatenate(f1_labels)
    f1_outputs = np.concatenate(f1_outputs)

    f1 = f1_score(f1_labels, f1_outputs)
    return total_loss_val / num_samples, total_acc_val / num_samples, f1

In [None]:
"""set_seed(Config.seed)
model = BertClassifier(Config.hidden_size, Config.num_classes).to(Config.device)
criterion = nn.CrossEntropyLoss()

#loss, acc = evaluate(model, criterion, valid_dataloader)
#print(f'\nEval Loss: {loss: .3f} | Eval Accuracy: {acc: .3f}')"""

In [None]:
def train_loop(model, optimizer, scheduler, criterion, train_loader, valid_loader, device=Config.device, num_epochs=Config.epochs, logdir=Config.logdir):
    """Train for ``num_epochs`` epochs, validating and logging after each one.

    After every epoch the metrics are printed, appended to the returned
    history and written to TensorBoard. The model's ``state_dict`` is saved
    to ``'best_model_state.bin'`` whenever validation accuracy improves.

    Args:
        model: Classifier to train.
        optimizer: Optimizer forwarded to ``train``.
        scheduler: Learning-rate scheduler forwarded to ``train``.
        criterion: Loss forwarded to ``train`` and ``evaluate``.
        train_loader: Loader for the training split.
        valid_loader: Loader for the validation split.
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``train_loader``.
        logdir: Directory the TensorBoard event files are written to.

    Returns:
        collections.defaultdict: lists keyed by ``train_acc``, ``train_loss``,
        ``train_f1``, ``val_acc``, ``val_loss`` and ``val_f1``, one entry per epoch.
    """
    history = defaultdict(list)
    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint, even when validation accuracy is 0.
    best_accuracy = float('-inf')

    tb_writer = SummaryWriter(log_dir=logdir)
    try:
        for e in range(num_epochs):

            print(f'Epoch {e + 1}/{num_epochs}')

            # train on training set
            train_loss, train_acc, train_f1 = train(model, optimizer, scheduler, criterion, train_loader, device=device)
            # evaluate on validation set
            val_loss, val_acc, val_f1 = evaluate(model, criterion, valid_loader, device=device)

            print(f'\nTrain Loss {train_loss: .3f} | Val Loss {val_loss: .3f}')
            print(f'Train Accuracy {train_acc: .3f} | Val Accuracy {val_acc: .3f}')
            print(f'Train F1-score {train_f1: .3f} | Val F1-Score {val_f1: .3f}')
            print()

            history['train_acc'].append(train_acc)
            history['train_loss'].append(train_loss)
            history['train_f1'].append(train_f1)
            history['val_acc'].append(val_acc)
            history['val_loss'].append(val_loss)
            history['val_f1'].append(val_f1)

            # Tensorboards Logging
            tb_writer.add_scalar('Bert/Train Loss', train_loss, e)
            tb_writer.add_scalar('Bert/Valid Loss', val_loss, e)
            tb_writer.add_scalar('Bert/Train Accuracy', train_acc, e)
            tb_writer.add_scalar('Bert/Valid Accuracy', val_acc, e)
            tb_writer.add_scalar('Bert/Train F1-score', train_f1, e)
            tb_writer.add_scalar('Bert/Val F1-score', val_f1, e)

            if val_acc > best_accuracy:
                torch.save(model.state_dict(), 'best_model_state.bin')
                best_accuracy = val_acc
    finally:
        # Close the writer even if an epoch fails (e.g. out of memory), so
        # the scalars logged so far are written out and the file is released.
        tb_writer.close()

    return history

In [None]:
# Full training run. `model`, `criterion` and `history` are read by later cells.
# NOTE(review): BertClassifier wraps the notebook-level `base_model`, so this "new"
# model starts from whatever state earlier cells left that encoder in — confirm intended.
set_seed(Config.seed)
model = BertClassifier(Config.hidden_size, Config.hidden_size2, Config.num_classes).to(Config.device)
optimizer = Adam(model.parameters(), lr=Config.learning_rate)

# The scheduler is stepped once per batch in train(), hence batches-per-epoch * epochs total steps.
scheduler = get_polynomial_decay_schedule_with_warmup(optimizer,
                                  num_warmup_steps=4,
                                  num_training_steps=int(len(train_dataloader) * Config.epochs) , power=2)

criterion = nn.CrossEntropyLoss()
history = train_loop(model, optimizer, scheduler, criterion, train_dataloader, valid_dataloader)

In [None]:
%load_ext tensorboard
%tensorboard --logdir=logdir

In [None]:
# Two stacked panels: accuracy on top, loss below, each with train and validation curves.
fig, ax = plt.subplots(2, 1, figsize=(10, 6))
panels = (
    (ax[0], 'acc', 'accuracy', 'Accuracy'),
    (ax[1], 'loss', 'loss', 'Losses'),
)
for axis, metric_key, metric_word, panel_title in panels:
    axis.plot(history[f'train_{metric_key}'], label=f'train {metric_word}')
    axis.plot(history[f'val_{metric_key}'], label=f'validation {metric_word}')
    axis.set_title(panel_title)
    axis.legend()
plt.tight_layout()
plt.show()

In [None]:
# Restore the best checkpoint written by train_loop (highest validation accuracy,
# not necessarily the last epoch). `map_location` lets a checkpoint saved on one
# device be loaded on whichever device this session is using.
best_state = torch.load('best_model_state.bin', map_location=Config.device)
model.load_state_dict(best_state)
loss, acc, f1 = evaluate(model, criterion, test_dataloader)

print(f'\nTest Loss: {loss : .3f} | Test Accuracy {acc : .3f} | Test F1-Score {f1 : .3f}')

# **Predicting on Raw Text**


Use the tokenizer to encode the text:

In [None]:
# The text to classify. It has to be defined here: no earlier cell creates
# `quote_text`, so the cell would otherwise fail on a freshly started kernel.
quote_text = 'оголошена повітряна тривога будь ласка перейдіть до укриттів'

# Encode exactly like the training data: pad AND truncate to Config.max_len.
encoded_quote = tokenizer.encode_plus(
  quote_text,
  max_length=Config.max_len,
  add_special_tokens=True,
  truncation=True,
  return_token_type_ids=False,
  padding='max_length',
  return_attention_mask=True,
  return_tensors='pt')

Get the predictions from our model:

In [None]:
input_ids = encoded_quote['input_ids'].to(Config.device)
attention_mask = encoded_quote['attention_mask'].to(Config.device)

# Inference only: switch dropout off and skip gradient tracking.
model.eval()
with torch.no_grad():
    output = model(input_ids, attention_mask)
# A single text was encoded, so there is exactly one predicted class index;
# .item() turns it into a plain int for indexing the classes tuple.
prediction = torch.argmax(output, dim=1).item()

print(f'Review text: {quote_text}')
print(f'Quotes tag: {Config.classes[prediction]}')