<a href="https://colab.research.google.com/github/MLFlexer/nlp-course/blob/Emma/bert_classification_more_epochs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook dependencies, installed into the Colab runtime. None of the
# installs below pins a version.
# NOTE(review): bpemb, gensim and the spaCy model are not referenced by any
# cell visible in this notebook — confirm they are still needed.
!pip install bpemb
!pip install gensim
!pip install datasets
!pip install transformers
!python -m spacy download en_core_web_sm



In [None]:
# add-in as occasionally receive an error which requires this to be added
# uncomment if the issue arises
!pip install transformer[torch]

In [None]:
import os
import numpy as np
from collections import Counter
import torch
import datasets
datasets.logging.set_verbosity_error()
from datasets import load_metric
from google.colab import drive
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
import pandas as pd

from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from tqdm import tqdm
from sklearn.metrics import classification_report, accuracy_score

# # uncomment if CAN'T CONNECT TO GPU (it happens...)
# import psutil
# import platform

In [None]:
# to save output of models so they can be reloaded

from google.colab import drive
drive.mount('/content/drive')
output_dir = '/content/drive/My Drive/Colab Notebooks/NLP/'


In [None]:
# GPU housekeeping code: you do not need to modify anything, simply
# read through it to understand what is going on, and run as is

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# a helper function to format byte counts into KB, MB and so on
def bytes_format(b):
    """Format a byte count as a short human-readable string.

    Uses decimal (1000-based) units. Counts below 1000 are shown verbatim
    with a ``B`` suffix; larger counts are scaled to KB, MB or GB and rounded
    to two decimals.
    """
    if b < 1000:
        return f'{b} B'
    # (exclusive upper bound, divisor, unit label) for the scaled units
    for limit, divisor, unit in ((1000000, 1000, 'KB'), (1000000000, 1000000, 'MB')):
        if b < limit:
            return f'{round(float(b / divisor), 2)} {unit}'
    return f'{round(float(b / 1000000000), 2)} GB'

# a helper function to check the amount of available memory
def memory_report():
  """Print a short memory report for the device named by the global ``device``.

  On a CUDA device this prints the GPU name plus the total, reserved and
  allocated memory of device index 0. On CPU it prints the processor string
  and, when ``psutil`` can be imported, the total system memory.
  """
  if device!='cpu':
    print(f"GPU available: {torch.cuda.get_device_name()}")
    #print(torch.cuda.memory_summary())
    total = torch.cuda.get_device_properties(0).total_memory
    reserved = torch.cuda.memory_reserved(0)
    allocated = torch.cuda.memory_allocated(0)
    print(f"Total cuda memory: {bytes_format(total)}, reserved: {bytes_format(reserved)}, allocated: {bytes_format(allocated)}")
  else:
    # Imported locally: the notebook's top-level imports of these two modules
    # are commented out, which made this branch fail with NameError.
    import platform
    print(f'Device is CPU {platform.processor()}. GPU is not available rn')
    try:
      import psutil
    except ImportError:
      print("Total CPU memory: unknown (psutil is not installed)")
    else:
      total_memory = psutil.virtual_memory().total
      print(f"Total CPU memory: {bytes_format(total_memory)}")

memory_report()

In [None]:
# Preamble
import sys

sys.path.append('..')

In [None]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset("copenlu/answerable_tydiqa")

train_set = dataset["train"]
validation_set = dataset["validation"]

df_train = train_set.to_pandas()
df_val = validation_set.to_pandas()

print(len(df_train))
print(len(df_val))

df_train.head()


In [None]:
# Get train and validation data for each language
def _rows_for_language(frame, language):
    """Return the rows of `frame` whose 'language' column equals `language`."""
    return frame[frame['language'] == language]


# Training rows per language
df_train_bengali = _rows_for_language(df_train, 'bengali')
df_train_arabic = _rows_for_language(df_train, 'arabic')
df_train_indonesian = _rows_for_language(df_train, 'indonesian')

# Validation rows per language
df_val_bengali = _rows_for_language(df_val, 'bengali')
df_val_arabic = _rows_for_language(df_val, 'arabic')
df_val_indonesian = _rows_for_language(df_val, 'indonesian')

# For testing
df_val_english = _rows_for_language(df_val, 'english')
df_train_english = _rows_for_language(df_train, 'english')


In [None]:
# Create a new dataframe with the combined documents and questions and add if they are answerable
def _merge_question_and_document(df):
    """Build the classifier input frame for one language split.

    Returns a DataFrame with two columns:
      * 'text'       - the question, a single space, then the document.
      * 'answerable' - 0 when the annotation's 'answer_start' equals [-1],
                       otherwise 1.

    The question is placed first and separated by a space. The texts are later
    truncated to a fixed number of tokens by `tokenize_text` (128 by default);
    with the document first and no separator, the question was glued onto the
    document's last word and was the part most likely to be truncated away.
    """
    return pd.DataFrame({
        'text': df["question_text"] + " " + df["document_plaintext"],
        'answerable': df["annotations"].apply(lambda x: 0 if x['answer_start'] == [-1] else 1),
    })


# Training frames
df_train_bengali_merged = _merge_question_and_document(df_train_bengali)
df_train_arabic_merged = _merge_question_and_document(df_train_arabic)
df_train_indonesian_merged = _merge_question_and_document(df_train_indonesian)
df_train_english_merged = _merge_question_and_document(df_train_english)

## Same for validation data
df_val_bengali_merged = _merge_question_and_document(df_val_bengali)
df_val_arabic_merged = _merge_question_and_document(df_val_arabic)
df_val_indonesian_merged = _merge_question_and_document(df_val_indonesian)
df_val_english_merged = _merge_question_and_document(df_val_english)

df_val_english_merged.head()

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Tokenize and encode the text data
def tokenize_text(texts, max_length=128):
    """Encode each text with the global `tokenizer` and stack the results.

    Every text is encoded on its own with special tokens added, padded to
    `max_length` and truncated to `max_length`. The per-text tensors are then
    concatenated along dimension 0.

    Args:
        texts: iterable of strings to encode.
        max_length: length passed to the tokenizer for padding and truncation.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of concatenated tensors.
    """
    encodings = [
        tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        for text in texts
    ]

    input_ids = torch.cat([enc["input_ids"] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc["attention_mask"] for enc in encodings], dim=0)

    return input_ids, attention_masks

# # first for english to test
# train_input_ids, train_attention_masks = tokenize_text(df_train_english_merged["text"].tolist())
# val_input_ids, val_attention_masks = tokenize_text(df_val_english_merged["text"].tolist())
# train_labels = torch.tensor(df_train_english_merged["answerable"].tolist())
# val_labels = torch.tensor(df_val_english_merged["answerable"].tolist())


In [None]:
# # with english dataset
# batch_size = 32

# train_data = TensorDataset(train_input_ids.to('cuda'), train_attention_masks.to('cuda'), train_labels.to('cuda'))
# train_sampler = RandomSampler(train_data)
# train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# val_data = TensorDataset(val_input_ids.to('cuda'), val_attention_masks.to('cuda'), val_labels.to('cuda'))
# val_sampler = RandomSampler(val_data)
# val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)



In [None]:
# checking if cuda is available
print(torch.cuda.is_available())


In [None]:
# Multilingual BERT with a two-label classification head. output_attentions=True
# makes every forward pass also return the attention weights, which the
# attention-visualisation cells at the end of the notebook read.
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2, output_attentions=True)
# Move to the device picked in the GPU housekeeping cell; unlike model.cuda()
# this does not fail when no GPU is attached.
model.to(device)


In [None]:
# the below is taken from the ASDS exam - check whether this is the way to do it or combine with the rest
# first check for english version and then adapt for other languages
!pip install transformers[torch] accelerate



In [None]:
#define parameters for the model
# Shared Trainer configuration; the same object is passed to every
# trainer_<language> instance created later in the notebook.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`, and the install cells do not pin a version — confirm this
# keyword is accepted by the installed release.
training_args = TrainingArguments(output_dir="my_trainer",
                                  evaluation_strategy="steps",
                                  num_train_epochs=3.0,
                                  per_device_train_batch_size=16,
                                  eval_steps=500
                                  )

In [None]:
# define the compute_metrics function for the trainer
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm it is available in the
# installed version.
metric_f1 = load_metric('f1')
metric_ac = load_metric('accuracy')

def compute_metrics(eval_pred):
    """Compute F1 and accuracy for a Trainer evaluation pass.

    Args:
        eval_pred: pair ``(outputs, labels)``. ``outputs`` is either the
            logits array or a tuple/list whose first element is the logits.

    Returns:
        A single dict merging the results of the F1 and accuracy metrics.
    """
    outputs, labels = eval_pred
    # The model in this notebook is created with output_attentions=True, so the
    # model output carries attentions next to the logits; only the logits
    # (first element) are class scores.
    logits = outputs[0] if isinstance(outputs, (tuple, list)) else outputs
    predictions = np.argmax(logits, axis=-1)
    f1 = metric_f1.compute(predictions=predictions, references=labels)
    ac = metric_ac.compute(predictions=predictions, references=labels)
    return f1 | ac

In [None]:
# For Indonesian

# Tokenise the merged Indonesian texts and turn the 'answerable' column into label tensors.
train_input_ids_indonesian, train_attention_masks_indonesian = tokenize_text(df_train_indonesian_merged["text"].tolist())
val_input_ids_indonesian, val_attention_masks_indonesian = tokenize_text(df_val_indonesian_merged["text"].tolist())
train_labels_indonesian = torch.tensor(df_train_indonesian_merged["answerable"].tolist())
val_labels_indonesian = torch.tensor(df_val_indonesian_merged["answerable"].tolist())

batch_size = 32

# Tensors are placed on `device` (set in the GPU housekeeping cell) instead of
# a hard-coded 'cuda', so this cell also runs when no GPU is attached.
# Training batches are drawn in random order.
train_data_indonesian = TensorDataset(train_input_ids_indonesian.to(device), train_attention_masks_indonesian.to(device), train_labels_indonesian.to(device))
train_sampler_indonesian = RandomSampler(train_data_indonesian)
train_dataloader_indonesian = DataLoader(train_data_indonesian, sampler=train_sampler_indonesian, batch_size=batch_size)

# Validation batches keep their original order.
val_data_indonesian = TensorDataset(val_input_ids_indonesian.to(device), val_attention_masks_indonesian.to(device), val_labels_indonesian.to(device))
val_sampler_indonesian = SequentialSampler(val_data_indonesian)
val_dataloader_indonesian = DataLoader(val_data_indonesian, sampler=val_sampler_indonesian, batch_size=batch_size)



In [None]:
# define the trainer object
# Trainer needs datasets whose items carry input ids, attention mask and
# label. The bare input-id tensors passed before had neither masks nor labels,
# so the datasets are built from all three tensors here.
# NOTE(review): no cell visible in this notebook calls .train() or .evaluate()
# on this trainer; the manual training loop is what actually runs.
trainer_indonesian = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets.Dataset.from_dict({
        "input_ids": train_input_ids_indonesian.tolist(),
        "attention_mask": train_attention_masks_indonesian.tolist(),
        "labels": train_labels_indonesian.tolist(),
    }),
    eval_dataset=datasets.Dataset.from_dict({
        "input_ids": val_input_ids_indonesian.tolist(),
        "attention_mask": val_attention_masks_indonesian.tolist(),
        "labels": val_labels_indonesian.tolist(),
    }),
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [None]:
# torch.optim.AdamW replaces the deprecated transformers.AdamW. weight_decay=0.0
# keeps the old class's default (torch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
epochs = 10
# The training loop steps the scheduler once per batch, so the linear decay
# spans (batches per epoch) * epochs steps, with no warm-up.
total_steps = len(train_dataloader_indonesian) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


In [None]:
# Fine-tune `model` on the Indonesian split and evaluate on the validation
# split after every epoch. `device` comes from the GPU housekeeping cell.
model = model.to(device)

# Create the results folder up front so a missing directory cannot abort the
# run after a full epoch of training. `output_dir` is set in the Drive cell.
save_path = output_dir
os.makedirs(save_path, exist_ok=True)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0  # running sum of batch losses for this epoch

    for batch in tqdm(train_dataloader_indonesian, desc=f"Epoch {epoch + 1}"):
        # Each batch is (input_ids, attention_mask, labels); .to() is a no-op
        # when the tensors already live on `device`.
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # cap the gradient norm at 1.0
        optimizer.step()
        scheduler.step()

    average_loss = total_loss / len(train_dataloader_indonesian)  # mean batch loss for the epoch

    model.eval()
    predictions = []
    true_labels = []
    for batch in tqdm(val_dataloader_indonesian, desc=f"Evaluating Epoch {epoch + 1}"):
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        predictions.extend(outputs.logits.argmax(dim=1).tolist())
        true_labels.extend(labels.tolist())

    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, target_names=["Not Answerable", "Answerable"])
    print(f"Epoch {epoch + 1} - Accuracy: {accuracy:.4f} - Average Loss: {average_loss:.4f}")
    print(report)

    # Saving to Google Drive. The file name uses the 0-based epoch index,
    # while the text written inside uses the 1-based epoch number.
    with open(os.path.join(save_path, f"indonesian_results_epoch_{epoch}.txt"), 'w') as f:
        f.write(f"Epoch {epoch + 1} - Accuracy: {accuracy:.4f}\n")
        f.write(report)

In [None]:
# For Bengali

# Tokenise the merged Bengali texts and turn the 'answerable' column into label tensors.
train_input_ids_bengali, train_attention_masks_bengali = tokenize_text(df_train_bengali_merged["text"].tolist())
val_input_ids_bengali, val_attention_masks_bengali = tokenize_text(df_val_bengali_merged["text"].tolist())
train_labels_bengali = torch.tensor(df_train_bengali_merged["answerable"].tolist())
val_labels_bengali = torch.tensor(df_val_bengali_merged["answerable"].tolist())

batch_size = 32

# Tensors are placed on `device` (set in the GPU housekeeping cell) instead of
# a hard-coded 'cuda', so this cell also runs when no GPU is attached.
# Training batches are drawn in random order.
train_data_bengali = TensorDataset(train_input_ids_bengali.to(device), train_attention_masks_bengali.to(device), train_labels_bengali.to(device))
train_sampler_bengali = RandomSampler(train_data_bengali)
train_dataloader_bengali = DataLoader(train_data_bengali, sampler=train_sampler_bengali, batch_size=batch_size)

# Validation batches keep their original order.
val_data_bengali = TensorDataset(val_input_ids_bengali.to(device), val_attention_masks_bengali.to(device), val_labels_bengali.to(device))
val_sampler_bengali = SequentialSampler(val_data_bengali)
val_dataloader_bengali = DataLoader(val_data_bengali, sampler=val_sampler_bengali, batch_size=batch_size)


In [None]:
# define the trainer object
# Trainer needs datasets whose items carry input ids, attention mask and
# label. The bare input-id tensors passed before had neither masks nor labels,
# so the datasets are built from all three tensors here.
# NOTE(review): no cell visible in this notebook calls .train() or .evaluate()
# on this trainer; the manual training loop is what actually runs.
trainer_bengali = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets.Dataset.from_dict({
        "input_ids": train_input_ids_bengali.tolist(),
        "attention_mask": train_attention_masks_bengali.tolist(),
        "labels": train_labels_bengali.tolist(),
    }),
    eval_dataset=datasets.Dataset.from_dict({
        "input_ids": val_input_ids_bengali.tolist(),
        "attention_mask": val_attention_masks_bengali.tolist(),
        "labels": val_labels_bengali.tolist(),
    }),
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [None]:
# torch.optim.AdamW replaces the deprecated transformers.AdamW. weight_decay=0.0
# keeps the old class's default (torch's own default is 0.01).
# NOTE(review): `model` is the object created once near the top of the
# notebook; no re-initialisation is visible between the per-language runs, so
# this run starts from whatever weights earlier cells left — confirm that
# sequential fine-tuning is intended.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
epochs = 10
# The training loop steps the scheduler once per batch, so the linear decay
# spans (batches per epoch) * epochs steps, with no warm-up.
total_steps = len(train_dataloader_bengali) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


In [None]:
# Fine-tune `model` on the Bengali split and evaluate on the validation split
# after every epoch. `device` comes from the GPU housekeeping cell.
model = model.to(device)

# Average training loss of each epoch, in epoch order.
train_losses = []

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0  # running sum of batch losses for this epoch

    for batch in tqdm(train_dataloader_bengali, desc=f"Epoch {epoch + 1}"):
        # Each batch is (input_ids, attention_mask, labels); .to() is a no-op
        # when the tensors already live on `device`.
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # cap the gradient norm at 1.0
        optimizer.step()
        scheduler.step()

    average_loss = total_loss / len(train_dataloader_bengali)  # mean batch loss for the epoch
    # The list was previously created but never filled.
    train_losses.append(average_loss)

    model.eval()
    predictions = []
    true_labels = []
    for batch in tqdm(val_dataloader_bengali, desc=f"Evaluating Epoch {epoch + 1}"):
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        predictions.extend(outputs.logits.argmax(dim=1).tolist())
        true_labels.extend(labels.tolist())

    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, target_names=["Not Answerable", "Answerable"])
    print(f"Epoch {epoch + 1} - Accuracy: {accuracy:.4f} - Average Loss: {average_loss:.4f}")
    print(report)


In [None]:
# For Arabic

# Tokenise the merged Arabic texts and turn the 'answerable' column into label tensors.
train_input_ids_arabic, train_attention_masks_arabic = tokenize_text(df_train_arabic_merged["text"].tolist())
val_input_ids_arabic, val_attention_masks_arabic = tokenize_text(df_val_arabic_merged["text"].tolist())
train_labels_arabic = torch.tensor(df_train_arabic_merged["answerable"].tolist())
val_labels_arabic = torch.tensor(df_val_arabic_merged["answerable"].tolist())

batch_size = 32

# Tensors are placed on `device` (set in the GPU housekeeping cell) instead of
# a hard-coded 'cuda', so this cell also runs when no GPU is attached.
# Training batches are drawn in random order.
train_data_arabic = TensorDataset(train_input_ids_arabic.to(device), train_attention_masks_arabic.to(device), train_labels_arabic.to(device))
train_sampler_arabic = RandomSampler(train_data_arabic)
train_dataloader_arabic = DataLoader(train_data_arabic, sampler=train_sampler_arabic, batch_size=batch_size)

# Validation batches keep their original order.
val_data_arabic = TensorDataset(val_input_ids_arabic.to(device), val_attention_masks_arabic.to(device), val_labels_arabic.to(device))
val_sampler_arabic = SequentialSampler(val_data_arabic)
val_dataloader_arabic = DataLoader(val_data_arabic, sampler=val_sampler_arabic, batch_size=batch_size)


In [None]:
# define the trainer object
# Trainer needs datasets whose items carry input ids, attention mask and
# label. The bare input-id tensors passed before had neither masks nor labels,
# so the datasets are built from all three tensors here.
# NOTE(review): no cell visible in this notebook calls .train() or .evaluate()
# on this trainer; the manual training loop is what actually runs.
trainer_arabic = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets.Dataset.from_dict({
        "input_ids": train_input_ids_arabic.tolist(),
        "attention_mask": train_attention_masks_arabic.tolist(),
        "labels": train_labels_arabic.tolist(),
    }),
    eval_dataset=datasets.Dataset.from_dict({
        "input_ids": val_input_ids_arabic.tolist(),
        "attention_mask": val_attention_masks_arabic.tolist(),
        "labels": val_labels_arabic.tolist(),
    }),
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [None]:
# torch.optim.AdamW replaces the deprecated transformers.AdamW. weight_decay=0.0
# keeps the old class's default (torch's own default is 0.01).
# NOTE(review): `model` is the object created once near the top of the
# notebook; no re-initialisation is visible between the per-language runs, so
# this run starts from whatever weights earlier cells left — confirm that
# sequential fine-tuning is intended.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
epochs = 10
# The training loop steps the scheduler once per batch, so the linear decay
# spans (batches per epoch) * epochs steps, with no warm-up.
total_steps = len(train_dataloader_arabic) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


In [None]:
# Fine-tune `model` on the Arabic split and evaluate on the validation split
# after every epoch. `device` comes from the GPU housekeeping cell.
model = model.to(device)


# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0  # running sum of batch losses for this epoch

    for batch in tqdm(train_dataloader_arabic, desc=f"Epoch {epoch + 1}"):
        # Each batch is (input_ids, attention_mask, labels); .to() is a no-op
        # when the tensors already live on `device`.
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # cap the gradient norm at 1.0
        optimizer.step()
        scheduler.step()

    average_loss = total_loss / len(train_dataloader_arabic)  # mean batch loss for the epoch

    model.eval()
    predictions = []
    true_labels = []
    for batch in tqdm(val_dataloader_arabic, desc=f"Evaluating Epoch {epoch + 1}"):
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        predictions.extend(outputs.logits.argmax(dim=1).tolist())
        true_labels.extend(labels.tolist())

    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, target_names=["Not Answerable", "Answerable"])
    print(f"Epoch {epoch + 1} - Accuracy: {accuracy:.4f} - Average Loss: {average_loss:.4f}")
    print(report)

In [None]:
# attempt for week 41 exercise

# Assuming you want to visualize attention for a specific instance (e.g., the first instance in the validation dataset)
# Index of the validation example whose attention weights are extracted.
instance_index = 0

# Dropout must be off so the attention weights are deterministic; this is a
# no-op if the model is already in eval mode.
model.eval()

# Send the inputs to wherever the model's weights live instead of assuming 'cuda'.
model_device = next(model.parameters()).device

# Prepare input for the selected instance; unsqueeze(0) adds a leading batch
# dimension of size 1.
inputs = {
    'input_ids': val_input_ids_arabic[instance_index].unsqueeze(0).to(model_device),
    'attention_mask': val_attention_masks_arabic[instance_index].unsqueeze(0).to(model_device)
}

# Pass the input through the model to get attentions
with torch.no_grad():
    outputs = model(**inputs)
attentions_arabic = outputs.attentions  # attention scores, indexed by layer first

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Which attention map to draw: layer 3, head 0
layer_idx = 3
head_idx = 0

# Index by layer, then take batch element 0 and the chosen head, and bring the
# scores to host memory as a NumPy array.
attention_matrix = attentions_arabic[layer_idx][0][head_idx].cpu().numpy()

# Draw the heat-map on an explicit 8x8 inch figure/axes pair, without tick labels.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(attention_matrix, cmap='YlGnBu', xticklabels=False, yticklabels=False, ax=ax)
ax.set_title(f'Attention Map for Layer {layer_idx}, Head {head_idx}')
plt.show()
