In [52]:
!pip install -U datasets
#load_dataset sometimes hangs on a higher version
!pip install transformers



# Preprocessing

In [53]:
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
from collections import Counter
from transformers import DistilBertModel, DistilBertTokenizerFast
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

seed = 123
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [54]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [55]:
"""
Some options for BERT model that can be run in colab:

"distilbert-base-uncased",
"distilbert-base-uncased-distilled-squad",
"distilbert-base-cased",
"distilbert-base-cased-distilled-squad",

"""

'\nSome options for BERT model that can be run in colab:\n\n"distilbert-base-uncased",\n"distilbert-base-uncased-distilled-squad",\n"distilbert-base-cased",\n"distilbert-base-cased-distilled-squad",\n\n'

In [56]:
FOLDER = "/content/drive/My Drive/Colab Notebooks/CSCI1460/Final"
FILEPATH_TRAIN = f"{FOLDER}/all_train.json"
FILEPATH_DEV = f"{FOLDER}/all_dev.json"
data_files = {"train": FILEPATH_TRAIN, "dev": FILEPATH_DEV}
dataset = load_dataset('json', data_files=data_files)

In [57]:
def load_data():
    train = dataset["train"]
    validation = dataset["dev"]
    return train, validation

In [58]:
def preprocess_and_tokenize(data, tokenizer, max_length=384, batch_size=64):
    '''
    This function preprocesses and tokenizes the given dataset for question answering tasks.
    Parameters:
        data (list): A list of dictionaries containing 'questions', 'contexts', and 'answers'.
        tokenizer: The tokenizer to be used, which should be compatible with the transformer model in use.
        max_length (int, optional): The maximum number of tokens for each encoded example. Defaults to 384.
        batch_size (int, optional): The batch size for the DataLoader. Defaults to 64.

    Returns:
        DataLoader: A PyTorch DataLoader containing tokenized and encoded data ready for training.
    '''
    class MyQADataset(torch.utils.data.Dataset):
        def __init__(self, data, tokenizer, max_length=384):
            self.tokenizer = tokenizer
            self.max_length = max_length
            self.data = data

            self.tokenizer.add_special_tokens({
                "cls_token": "<CLS>",
                "sep_token": "<SEP>",
            })

            self.input_ids_list = []
            self.attention_mask_list = []
            self.start_pos_list = []
            self.end_pos_list = []
            self.answer_label_list = []

            for example in data:
                input_ids, attention_mask, start_pos, end_pos, answer_label = self._encode_example(example)
                self.input_ids_list.append(input_ids)
                self.attention_mask_list.append(attention_mask)
                self.start_pos_list.append(start_pos)
                self.end_pos_list.append(end_pos)
                self.answer_label_list.append(answer_label)

        def _encode_example(self, example):
            question = example['questions'][0]['input_text']
            context = example['contexts']
            answer = example['answers']

            encodings = self.tokenizer(
                question,
                context,
                max_length=self.max_length,
                padding='max_length',
                return_attention_mask=True,
                return_offsets_mapping=True,
                return_token_type_ids=True,
                truncation=True
            )

            start_char_idx = answer[0]['span_start']
            end_char_idx = answer[0]['span_end']
            answer_label = 1 if answer[0]['input_text'] == 'short' else 0
            offset_mapping = encodings['offset_mapping']
            token_type_ids = encodings['token_type_ids']

            start_token_idx, end_token_idx = 0, 0
            ctx_start_idx = token_type_ids.index(1)
            ctx_end_idx = len(token_type_ids) - token_type_ids[::-1].index(1) - 1

            if answer_label == 1:
                for i, offset in enumerate(offset_mapping[ctx_start_idx:ctx_end_idx+1], start=ctx_start_idx):
                    if offset[0] <= start_char_idx < offset[1]:
                        start_token_idx = i
                    if offset[0] < end_char_idx <= offset[1]:
                        end_token_idx = i
                    if start_token_idx != 0 and end_token_idx != 0:
                        break

            return (
                torch.tensor(encodings['input_ids']),
                torch.tensor(encodings['attention_mask']),
                torch.tensor(start_token_idx),
                torch.tensor(end_token_idx),
                torch.tensor(answer_label)
            )

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return {
                'input_ids': self.input_ids_list[idx],
                'attention_mask': self.attention_mask_list[idx],
                'start_pos': self.start_pos_list[idx],
                'end_pos': self.end_pos_list[idx],
                'answer_label': self.answer_label_list[idx],
            }

    dataset = MyQADataset(data, tokenizer, max_length=max_length)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return data_loader

In [59]:
class MyBertQAModel(nn.Module):
    '''
    This class defines a BERT-based model for question answering tasks. It utilizes a DistilBert model as the backbone for feature extraction.
    Parameters:
        pretrained_model_name (str, optional): The name of the pre-trained BERT model to use. Defaults to "distilbert-base-uncased".

    Returns:
        tuple: A tuple containing three tensors:
            - start_logits (torch.Tensor): Logits for the start positions of the answers [batch, seq_len].
            - end_logits (torch.Tensor): Logits for the end positions of the answers [batch, seq_len].
            - answer_logits (torch.Tensor): Logits for the type of the answers [batch, 2].
    '''
    def __init__(self, pretrained_model_name="distilbert-base-uncased"):
        super(MyBertQAModel, self).__init__()
        self.model = DistilBertModel.from_pretrained(pretrained_model_name)
        hidden_size = self.model.config.hidden_size
        self.start_layer = nn.Linear(hidden_size, 1)
        self.end_layer = nn.Linear(hidden_size, 1)
        self.answer_layer = nn.Linear(hidden_size, 2)
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state  # [batch, seq_len, hidden_size]
        cls_output = sequence_output[:, 0, :]        # [batch, hidden_size]

        start_logits = self.start_layer(sequence_output).squeeze(-1) # [batch, seq_len]
        end_logits = self.end_layer(sequence_output).squeeze(-1)     # [batch, seq_len]
        answer_logits = self.answer_layer(self.dropout(cls_output))  # [batch, 2]

        return start_logits, end_logits, answer_logits


In [60]:
def load_model():
    model = MyBertQAModel()
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    return model, tokenizer

In [61]:
def calc_metrics(input_ids, start_logits, end_logits, start_positions, end_positions):
    """
    Compute precision, recall, and F1 score for a batch of QA predictions.

    Parameters:
        input_ids (torch.Tensor): Token IDs for the input sequences. [batch_size, seq_len]
        start_logits (torch.Tensor): Logits for predicted start indices. [batch_size, seq_len]
        end_logits (torch.Tensor): Logits for predicted end indices. [batch_size, seq_len]
        start_positions (torch.Tensor): True start indices of the answer spans. [batch_size]
        end_positions (torch.Tensor): True end indices of the answer spans. [batch_size]

    Returns:
        (float, float, float): A tuple containing the average precision, recall, and F1 score over the batch.
    """
    batch_size = start_logits.size(0)
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    total_prec, total_rec, total_f1 = 0.0, 0.0, 0.0

    for idx in range(batch_size):
        pred_start_idx = torch.argmax(start_logits[idx])
        pred_end_idx = torch.argmax(end_logits[idx])

        true_start_idx = start_positions[idx]
        true_end_idx = end_positions[idx]

        if pred_start_idx > pred_end_idx:
            continue
        predicted_tokens = tokenizer.convert_ids_to_tokens(input_ids[idx][pred_start_idx:pred_end_idx+1])
        actual_tokens = tokenizer.convert_ids_to_tokens(input_ids[idx][true_start_idx:true_end_idx+1])

        pred_counts = Counter(predicted_tokens)
        actual_counts = Counter(actual_tokens)
        true_positive = sum((pred_counts & actual_counts).values())
        false_positive = sum((pred_counts - actual_counts).values())
        false_negative = sum((actual_counts - pred_counts).values())
        precision_val = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0.0
        recall_val = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0.0
        f1_val = (2 * precision_val * recall_val / (precision_val + recall_val)) if (precision_val + recall_val) > 0 else 0.0

        total_prec += precision_val
        total_rec += recall_val
        total_f1 += f1_val

    avg_precision = total_prec / batch_size
    avg_recall = total_rec / batch_size
    avg_f1 = total_f1 / batch_size

    return avg_precision, avg_recall, avg_f1


In [62]:
def eval_loop(validation_data_loader, model, tokenizer, device):
    '''
    Perform evaluation over a validation dataset using a given model. This function iterates over the validation dataset, makes predictions using the model,
    and calculates the precision, recall, and F1 score for each batch.
    Parameters:
        validation_data_loader (DataLoader): DataLoader for the validation dataset.
        model (nn.Module): The trained model to be evaluated.
        tokenizer: The tokenizer used for processing the data.
        device: The device (e.g., 'cuda', 'cpu') on which the tensors should be processed.

    Returns:
        tuple: A tuple containing the average precision, recall, and F1 score for the validation set.
    '''
    model.eval()
    total_precision, total_recall, total_f1 = 0.0, 0.0, 0.0

    print("Evaluating metrics:")
    progress_bar = tqdm(range(len(validation_data_loader)))

    with torch.no_grad():
        for batch in validation_data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_pos'].to(device)
            end_positions = batch['end_pos'].to(device)
            answer_labels = batch['answer_label'].to(device)

            start_logits, end_logits, answer_logits = model(input_ids, attention_mask)
            precision, recall, f1 = calc_metrics(input_ids, start_logits, end_logits, start_positions, end_positions)

            total_precision += precision
            total_recall += recall
            total_f1 += f1

            progress_bar.update(1)

    precision = total_precision / len(validation_data_loader)
    recall = total_recall / len(validation_data_loader)
    f1_score = total_f1 / len(validation_data_loader)
    return precision, recall, f1_score


In [63]:
def train_loop(train_data_loader, validation_data_loader, model, device, epochs=2, lr=5e-5):
    '''
    Conducts the training process over a specified number of epochs. This function iterates over the training data to train the model,
    computes loss using a defined function within the loop, and performs validation at the end of each epoch to monitor performance.
    Parameters:
        train_data_loader (DataLoader): DataLoader for the training dataset.
        validation_data_loader (DataLoader): DataLoader for the validation dataset.
        model (nn.Module): The model to be trained.
        device: The device (e.g., 'cuda', 'cpu') on which to process the model and data.
        epochs (int, optional): The number of epochs to train for. Defaults to 2.
        lr (float, optional): Learning rate for the optimizer. Defaults to 5e-5.

    Returns:
        list: Lists containing the average training and validation loss for each epoch.
    '''
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    def cal_loss(start_logits, end_logits, answer_logits, start_positions, end_positions, answer_labels):
        start_loss = nn.CrossEntropyLoss()(start_logits, start_positions)
        end_loss = nn.CrossEntropyLoss()(end_logits, end_positions)
        answer_loss = nn.CrossEntropyLoss()(answer_logits, answer_labels)
        return start_loss + end_loss + answer_loss

    for epoch in range(epochs):
        model.train()
        epoch_train_loss = 0
        for batch in train_data_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_pos'].to(device)
            end_positions = batch['end_pos'].to(device)
            answer_labels = batch['answer_label'].to(device)

            start_logits, end_logits, answer_logits = model(input_ids, attention_mask)
            loss = cal_loss(start_logits, end_logits, answer_logits, start_positions, end_positions, answer_labels)
            loss.backward()
            optimizer.step()

            epoch_train_loss += loss.item()

        avg_train_loss = epoch_train_loss / len(train_data_loader)

        model.eval()
        epoch_val_loss = 0
        with torch.no_grad():
            for batch in validation_data_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                start_positions = batch["start_pos"].to(device)
                end_positions = batch["end_pos"].to(device)
                answer_labels = batch["answer_label"].to(device)

                start_logits, end_logits, answer_logits = model(input_ids, attention_mask)
                val_loss = cal_loss(start_logits, end_logits, answer_logits, start_positions, end_positions, answer_labels)
                epoch_val_loss += val_loss.item()

        avg_val_loss = epoch_val_loss / len(validation_data_loader)
        print(f"Epoch: {epoch}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    return [avg_train_loss], [avg_val_loss]

In [64]:
def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    batch_size = 64
    max_length = 384

    model, tokenizer = load_model()
    model.to(device)
    train, validation = load_data()

    train_data_loader = preprocess_and_tokenize(train, tokenizer, max_length=max_length, batch_size=batch_size)
    validation_data_loader = preprocess_and_tokenize(validation, tokenizer, max_length=max_length, batch_size=batch_size)

    train_losses, val_losses = train_loop(train_data_loader, validation_data_loader, model, device, epochs=2, lr=5e-5)
    precision, recall, f1_score = eval_loop(validation_data_loader, model, tokenizer, device)

    print("PRECISION: ", precision)
    print("RECALL: ", recall)
    print("F1-SCORE: ", f1_score)

if __name__ == "__main__":
    main()

Epoch: 0, Train Loss: 3.8255, Val Loss: 2.7802
Epoch: 1, Train Loss: 2.1544, Val Loss: 2.4885
Evaluating metrics:


  0%|          | 0/28 [00:00<?, ?it/s]

PRECISION:  0.7052367129857312
RECALL:  0.7198980327241281
F1-SCORE:  0.6880363161681837


In [65]:
# def main():
#   '''Here's the basic structure of the main block -- feel free to add or
#   remove parameters/helper functions as you see fit, but all steps here are
#   needed and we expect to see precision, recall, and f1 scores printed out'''
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   batch_size = 64

#   model, tokenizer = load_model()
#   train, validation = load_data()

#   train_data_loader = preprocess_and_tokenize(train)
#   validation_data_loader = preprocess_and_tokenize(validation)

#   train_losses, val_losses = train_loop(train_data_loader, validation_data_loader)
#   precision, recall, f1_score  = eval_loop(validation_data_loader)

#   print("PRECISION: ", precision)
#   print("RECALL: ", recall)
#   print("F1-SCORE: ", f1_score)

# if __name__ == "__main__":
#   main()
