# Finetune BERT for BNPP

# Configuration

Only the two code cells below need to be edited before running the notebook.

In [None]:
# Hugging Face model identifier; used for both the tokenizer and the encoder.
PRETRAINED_MODEL_NAME = "bert-base-multilingual-cased"
# PRETRAINED_MODEL_NAME = "csebuetnlp/banglabert"
# Number of examples per batch for every data loader.
BATCH_SIZE = 8
# Passed to the tokenizer as max_length; inputs are padded up to this length.
MAX_LEN = 512
# Upper bound on training epochs (the training loop may stop earlier).
EPOCHS = 20
# Learning rate handed to the optimizer.
LEARNING_RATE = 1e-5
# File the best state_dict is saved to. "/" in the model name is replaced by
# "-" so the name stays a single path component.
MODEL_PATH = f'{PRETRAINED_MODEL_NAME.replace("/", "-")}_{BATCH_SIZE}_{LEARNING_RATE}.bin'

# The bare string below is a no-op expression kept as a note of alternative
# model identifiers that can be assigned to PRETRAINED_MODEL_NAME.
"""
Other Huggingface Models
csebuetnlp/banglabert
sagorsarker/bangla-bert-base
bert-base-multilingual-uncased
bert-base-multilingual-cased
distilbert-base-multilingual-cased
neuropark/sahajBERT
"""

In [None]:
# Label for this run; used as a prefix of the report/pickle file names written
# by later cells.
training_dataset = "name_of_the_dataset_to_be_trained_on"
# Directory that contains every CSV file referenced below.
path_to_dataset = "/path/to/dataset"
train_file_path = f"{path_to_dataset}/training_dataset.csv"
test_file_path = f"{path_to_dataset}/testing_dataset.csv"
# NOTE(review): the validation file name is fixed to TaPaCo_val.csv regardless
# of `training_dataset` - confirm this is intended.
val_file_path = f"{path_to_dataset}/TaPaCo_val.csv"
# Base names (without ".csv") of additional evaluation files; each one is read
# from `path_to_dataset` by the final evaluation cell.
testing_files = [
    "BnPC_test_2_label",
    "TaPaCo_test",
    "indic_test_bnpc_1",
    "indic_test_bnpc_2",
    "indic_test_bnpc_3",    
    "BUET_test_bnpc_1",
    "BUET_test_bnpc_2",
    "BUET_test_bnpc_3"
]

## Installing Necessary Packages

In [2]:
# Notebook shell escape (not plain Python): installs the `transformers` package.
! pip install transformers



## Downloading Dataset

## Importing Necessary Packages and Switching to GPU if Available

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import transformers
import torch
import re
import pickle

from torch.utils.data import Dataset, DataLoader
from torch import nn

from transformers import AutoTokenizer, AutoModel 

from collections import defaultdict

from sklearn.metrics import classification_report, f1_score, accuracy_score


# Prefer CUDA when a GPU is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device  # bare expression so the notebook echoes the selected device

device(type='cpu')

## Data Loader

In [4]:
# Keep only Bengali-range characters and spaces in both sentence columns
def hide_numbers(df, token=' [CC] '):
    """Delete every character outside the kept Bengali range from both sentences.

    Despite its name, this function does not substitute a placeholder for
    numbers. Each character that is neither a space nor inside the ``ঀ``-``ৣ``
    range is removed (this drops Latin text, punctuation and digits, including
    Bengali digits, which lie outside that range), and the result is stripped
    of leading/trailing whitespace.

    Args:
        df: DataFrame with ``sentence1`` and ``sentence2`` columns. It is
            modified in place.
        token: Unused. Kept only so existing callers that pass it keep working.

    Returns:
        The same ``df`` object, with both sentence columns cleaned.
    """
    # Compile once instead of re-parsing the pattern for every cell.
    non_bengali = re.compile(r"[^ঀ-ৣ ]")

    def _clean(value):
        if not isinstance(value, str):
            # Missing cells (None/NaN) would make re.sub raise TypeError;
            # treat them as empty sentences instead of crashing the load.
            if pd.isna(value):
                return ''
            value = str(value)
        return non_bengali.sub('', value).strip()

    df.sentence1 = [_clean(s) for s in df.sentence1]
    df.sentence2 = [_clean(s) for s in df.sentence2]
    return df

# Read the three CSV splits and clean their sentence columns
def load_data(train_path, test_path, val_path):
    """Load the train, test and validation CSV files as cleaned DataFrames.

    All three files are read first; each frame is then passed through
    ``hide_numbers``.

    Returns:
        A ``(train_df, test_df, val_df)`` tuple.
    """
    raw_frames = [pd.read_csv(csv_path)
                  for csv_path in (train_path, test_path, val_path)]
    return tuple(hide_numbers(frame) for frame in raw_frames)

# Dataset helper class
class BNPPDataset(Dataset):
    """Map-style dataset that tokenizes one sentence pair per item.

    Args:
        sentence1: Indexable collection of first sentences.
        sentence2: Indexable collection of second sentences (same length).
        labels: Indexable collection of integer class labels.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: Fixed sequence length every encoded pair is padded/cut to.
    """

    def __init__(self, sentence1, sentence2, labels, tokenizer, max_len):
        self.sentence1 = sentence1
        self.sentence2 = sentence2
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, item):
        """Return the raw sentences, token ids, attention mask and label."""
        s1 = str(self.sentence1[item])
        s2 = str(self.sentence2[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            s1, s2,
            add_special_tokens=True,
            max_length=self.max_len,
            # Without truncation, pairs longer than max_len keep their full
            # length, so items differ in size and cannot be stacked into a
            # batch (and may exceed the encoder's position limit).
            truncation=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'sentence1': s1,
            'sentence2': s2,
            # return_tensors='pt' adds a leading batch axis of size 1; drop it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Creating data loaders
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=2):
    """Wrap a DataFrame's sentence pairs and labels in a ``DataLoader``.

    Args:
        df: DataFrame with ``sentence1``, ``sentence2`` and ``label`` columns.
        tokenizer: Tokenizer handed to ``BNPPDataset``.
        max_len: Fixed token length handed to ``BNPPDataset``.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the data every epoch.
        num_workers: Number of loader worker processes. Defaults to 2 (the
            previously hard-coded value); pass 0 to load in the main process.

    Returns:
        A ``DataLoader`` over a ``BNPPDataset`` built from ``df``.
    """
    ds = BNPPDataset(
        sentence1=df.sentence1.to_numpy(),
        sentence2=df.sentence2.to_numpy(),
        labels=df.label.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle
    )

# Build the train / test / validation loaders in one call
def get_dataloader(train_path, test_path, val_path, tokenizer, MAX_LEN, BATCH_SIZE):
    """Load the three CSV splits and return a data loader for each.

    Only the training loader shuffles its data; the test and validation
    loaders keep the file order.

    Returns:
        A ``(train_loader, test_loader, val_loader)`` tuple.
    """
    frames = load_data(train_path, test_path, val_path)
    # One flag per split, in the same order load_data returns the frames.
    shuffle_flags = (True, False, False)
    return tuple(
        create_data_loader(frame, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=flag)
        for frame, flag in zip(frames, shuffle_flags)
    )

## Building the Model

In [5]:
class ParaphraseIdentifier(nn.Module):
    """Sentence-pair classifier: pretrained encoder, dropout, 2-way linear head.

    ``forward`` returns log-probabilities (``LogSoftmax`` over the last axis),
    so the output is meant to be paired with ``nn.NLLLoss``.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super(ParaphraseIdentifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(0.3)
        self.linear = nn.Linear(self.bert.config.hidden_size, 2)
        self.activation = nn.LogSoftmax(dim=-1)

    def forward(self, input_ids, attention_mask):
        """Return per-class log-probabilities for a batch of encoded pairs."""
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        # Tuple-unpacking the encoder output only works for models that return
        # a pooled vector. Encoders without a pooler (e.g. DistilBERT) expose
        # only hidden states, so fall back to the first-token representation.
        pooled_output = getattr(encoded, "pooler_output", None)
        if pooled_output is None:
            pooled_output = encoded.last_hidden_state[:, 0]

        drop_output = self.drop(pooled_output)
        linear_output = self.linear(drop_output)
        out = self.activation(linear_output)

        return out

## Training and Testing Helper Functions

In [6]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one optimisation pass over ``data_loader``.

    For every batch the model output is scored with ``loss_fn``, the loss is
    back-propagated, and both the optimizer and the scheduler are stepped.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``.
        data_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        loss_fn: Callable taking ``(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)`` where accuracy is correct predictions
        divided by ``n_examples`` and mean_loss averages the batch losses.
    """
    model = model.train()

    hits = 0
    batch_losses = []

    for batch in data_loader:
        optimizer.zero_grad()
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        targets = batch["labels"].to(device)

        log_probs = model(input_ids=ids, attention_mask=mask)
        batch_loss = loss_fn(log_probs, targets)

        # Predicted class is the arg-max over the last axis of the output.
        hits += (log_probs.argmax(-1) == targets).sum().item()
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return hits / n_examples, np.mean(batch_losses)


def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Score ``model`` on ``data_loader`` without updating any weights.

    The model is switched to eval mode and every batch is processed under
    ``torch.no_grad()``.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``.
        data_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        loss_fn: Callable taking ``(outputs, labels)``.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)`` where accuracy is correct predictions
        divided by ``n_examples`` and mean_loss averages the batch losses.
    """
    model = model.eval()

    hits = 0
    batch_losses = []

    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            log_probs = model(input_ids=ids, attention_mask=mask)
            batch_loss = loss_fn(log_probs, targets)

            # Predicted class is the arg-max over the last axis of the output.
            hits += (log_probs.argmax(-1) == targets).sum().item()
            batch_losses.append(batch_loss.item())

    return hits / n_examples, np.mean(batch_losses)

## Variables to Configure the Dataset

## Training

In [None]:
# --- Setup: tokenizer, data, model -----------------------------------------
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
train_data_loader, test_data_loader, val_data_loader = get_dataloader(
    train_file_path, test_file_path, val_file_path, tokenizer, MAX_LEN, BATCH_SIZE)
# The frames are loaded again here because their lengths are used as accuracy
# denominators below and `test_df` is read by a later cell.
train_df, test_df, val_df = load_data(train_file_path, test_file_path, val_file_path)
model = nn.DataParallel(ParaphraseIdentifier(PRETRAINED_MODEL_NAME)).to(device)

# --- Optimisation ----------------------------------------------------------
# transformers.AdamW is deprecated in favour of torch.optim.AdamW and may be
# missing from newer transformers releases. eps=1e-6 and weight_decay=0.0
# reproduce the defaults of the class this replaces.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)
total_steps = len(train_data_loader) * EPOCHS
# Linear decay to zero over all steps, with no warm-up steps.
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer, 0, total_steps)
# The model emits log-probabilities, hence NLLLoss.
loss_fn = nn.NLLLoss().to(device)

history = defaultdict(list)
best_accuracy = 0
best_epoch = 0
# Training stops once more than this many epochs pass without a new best
# validation accuracy.
EARLY_STOPPING_PATIENCE = 5

# --- Training loop with checkpointing and early stopping --------------------
for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model, train_data_loader, loss_fn, optimizer, device, scheduler, len(train_df))

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model, val_data_loader, loss_fn, device, len(val_df))

    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # Keep only the checkpoint with the best validation accuracy so far.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), MODEL_PATH)
        print(f'Saved model with accuracy {val_acc} on: {MODEL_PATH}')
        best_accuracy = val_acc
        best_epoch = epoch

    if epoch - best_epoch > EARLY_STOPPING_PATIENCE:
        # The old message claimed "last 5 epochs", but this condition only
        # fires after more than 5 epochs without improvement; the stopping
        # rule is unchanged and the message now describes it.
        print(f'No better validation accuracy for more than '
              f'{EARLY_STOPPING_PATIENCE} epochs. Cancelling...')
        break

In [None]:
# Output name: dataset label, model name ("/" replaced by "-"), batch size and
# learning rate, joined by underscores.
Pkl_Filename = f'{training_dataset}_{PRETRAINED_MODEL_NAME.replace("/", "-")}_{BATCH_SIZE}_{LEARNING_RATE}.pkl'  

# Serialises the whole `model` object (the DataParallel wrapper in its state
# after the last epoch run), not only its weights.
# NOTE(review): pickle stores classes by reference, so loading this file needs
# the same class definitions importable; only unpickle files you trust.
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(model, file)

## Plotting Training History

In [None]:
# Accuracy curves, one point per completed epoch.
plt.plot(history['train_acc'], label='train accuracy')
plt.plot(history['val_acc'], label='validation accuracy')

# Axes: accuracy is a fraction, so pin the y-range to [0, 1].
plt.ylim([0, 1])
plt.xlabel('Epoch')
plt.ylabel('Accuracy')

plt.title('Training history')
# Trailing semicolon keeps the notebook from echoing the return value.
plt.legend();

## Running test on test dataset

In [None]:
# Rebuild all three loaders (this re-reads and re-cleans the CSV files), so
# the evaluation cells below can run without re-running the training cell's
# loader setup. `tokenizer` must already be defined.
train_data_loader, test_data_loader, val_data_loader = get_dataloader(
    train_file_path, test_file_path, val_file_path,
    tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
# Re-create the architecture with the same DataParallel wrapping used when the
# checkpoint was saved, so the state_dict keys line up.
loaded_model = nn.DataParallel(ParaphraseIdentifier(PRETRAINED_MODEL_NAME)).to(device)
# map_location lets a checkpoint written on a GPU machine be restored on a
# CPU-only one (and vice versa); without it torch.load tries to restore the
# tensors onto the device they were saved from.
loaded_model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
# Take the accuracy denominator from the loader itself rather than the global
# `test_df`, which another cell rebinds to a different file.
test_acc, _ = eval_model(
    loaded_model, test_data_loader, loss_fn, device, len(test_data_loader.dataset))
print(f'Test Accuracy: {test_acc}')

## Getting predictions

In [None]:
def get_predictions(model, data_loader, device):
    """Collect gold labels and predicted classes for every example.

    The model is switched to eval mode and run under ``torch.no_grad()``; the
    predicted class is the arg-max over the last axis of the model output.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``.
        data_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        ``(labels, predictions)`` as two flat Python lists in loader order.
    """
    model = model.eval()

    gold, predicted = [], []

    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            batch_gold = batch["labels"].to(device).tolist()

            scores = model(input_ids=ids, attention_mask=mask)

            gold += batch_gold
            predicted += scores.argmax(-1).tolist()

    return gold, predicted

## Checking Model Performance

In [None]:
# Evaluate the restored checkpoint on the validation loader.
labels, preds = get_predictions(loaded_model, val_data_loader, device)
score  = accuracy_score(labels, preds)
# output_dict=True returns the report as a nested dict so it can be tabulated.
report = classification_report(labels, preds, digits=4, output_dict=True)
print("Validation")
print(score)
print(report)
# Persist the report as CSV, prefixed with the training-dataset label.
pred_df = pd.DataFrame(report)
pred_df.to_csv(f'{training_dataset}_Validation.csv')

In [None]:
# Evaluate the restored checkpoint on whatever `test_data_loader` currently
# refers to.
# NOTE(review): the "BnPC" label and output file name are hard-coded; confirm
# they match the file behind `test_file_path`.
labels, preds = get_predictions(loaded_model, test_data_loader, device)
print("BnPC")
score  = accuracy_score(labels, preds)
# output_dict=True returns the report as a nested dict so it can be tabulated.
report = classification_report(labels, preds, digits=4, output_dict=True)
print(score)
print(report)
# Persist the report as CSV, prefixed with the training-dataset label.
pred_df = pd.DataFrame(report)
pred_df.to_csv(f'{training_dataset}_Test_BnPC.csv')

In [None]:
# Evaluate the restored checkpoint on every additional test file and write one
# CSV report per file.
#
# Loop-local names are used for the frame and loader on purpose: rebinding the
# notebook-wide `test_df` / `test_data_loader` here would make any earlier
# evaluation cell that is re-run afterwards silently score the last file of
# this loop instead of the main test set.
for file_name in testing_files:
    extra_df = pd.read_csv(f'{path_to_dataset}/{file_name}.csv')
    extra_df = hide_numbers(extra_df)
    extra_loader = create_data_loader(
            extra_df, tokenizer, MAX_LEN, BATCH_SIZE)
    labels, preds = get_predictions(loaded_model, extra_loader, device)
    print(f'{file_name}')
    score  = accuracy_score(labels, preds)
    # output_dict=True returns the report as a nested dict so it can be tabulated.
    report = classification_report(labels, preds, digits=4, output_dict=True)
    print(score)
    print(report)
    pred_df = pd.DataFrame(report)
    pred_df.to_csv(f'{training_dataset}_{file_name}.csv')