# Install Required Libraries

In [1]:
!pip install datasets transformers adapter-transformers wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting adapter-transformers
  Downloading adapter_transformers-3.2.1-py3-none-any.whl (6.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb
  Downloading wandb-0.15.2-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-n

# Import Required Libraries

In [2]:
import os
import torch
import json
import collections

import numpy as np
import wandb as wandb

from tqdm.auto import tqdm
from datasets import Dataset
from transformers import (AutoTokenizer,
                          AutoModelForQuestionAnswering,
                          TrainingArguments,
                          AdapterTrainer,
                          Trainer,
                          default_data_collator)

# Mount Google Drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Set Working Directory for files

In [4]:
project_folder = 'hw2'
drive_path = '/content/drive/MyDrive'

os.chdir(os.path.join(drive_path, project_folder))

In [5]:
print("PWD:")
!pwd
print("List of Files:")
!ls

PWD:
/content/drive/MyDrive/hw2
List of Files:
a2.pdf	covid-qa  evaluate.py  hw2.ipynb  report.pdf


# Create Directory for Predictions and Results if not present

In [6]:
predictions_dir = "./predictions"
results_dir = "./results"

def create_dir(dir_name: str):
  """Creates a directory if it does not exist """""
  if not os.path.exists(dir_name):
      os.makedirs(dir_name)
      
create_dir(predictions_dir)
create_dir(results_dir)

# Set Device

In [7]:
def get_device():
    """Get device (if GPU is available, use GPU)"""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return device

# Set Hyper-Parameters and Config

In [8]:
DATA_PATH = {
    "train": "covid-qa/covid-qa-train.json",
    "dev": "covid-qa/covid-qa-dev.json",
    "test": "covid-qa/covid-qa-test.json",
}
DEVICE = get_device()
MODEL = "deepset/roberta-base-squad2"
ADAPTER = "AdapterHub/roberta-base-pf-squad_v2"
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.01
MAX_LEN = 385
DOC_STRIDE = 128
EPOCHS = 3
SEED = 42
IS_FINE_TUNE = False
IS_ADAPTER = True
ADAFACTOR = False # set this to False to use Adam optimizer
ADAM_BETA1 = 0.9 # beta1 for Adam optimizer
ADAM_BETA2 = 0.999 # beta2 for Adam optimizer
ADAM_EPSILON = 1e-8 # epsilon for Adam optimizer
LR_SCHEDULER_TYPE = "linear" # learning rate scheduler type
NUM_WORKERS = 8
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8

if IS_FINE_TUNE and not IS_ADAPTER:
  NAME = "roberta_finetuned"
elif not IS_FINE_TUNE and IS_ADAPTER:
  NAME = "roberta_adapter"
elif not IS_FINE_TUNE and not IS_ADAPTER:
  NAME = "roberta_baseline"

# Helper Functions

In [9]:
def load_json_file_to_dict(file_name: str):
    """Load json file to dict."""
    return json.load(open(file_name))


def write_dict_to_json(data_dict: dict, 
                       file_name: str
                    ):
    """Write dict to json file."""
    with open(file_name, "w") as outfile:
        json.dump(data_dict, outfile)

    
def display_stage(stage_title: str):
    """Display stage title with padding."""
    total_length = 100
    title_length = len(stage_title)
    pad_count = int((total_length - title_length) / 2)
    padding = "*" * pad_count
    formatted_title = f"{padding}{stage_title}{padding}"
    print(formatted_title)


def epoch_time(start_time: float, 
               end_time: float
            ):
    """Calculate the time in minutes and seconds for each epoch."""
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

# Pre-Process and Load Dataset

In [10]:
def preprocess_data(data_dict: dict): 
    """Pre-process the data to the format of SQuAD."""
    temp = {"id": list(), "title": list(), "context": list(), "question": list(), "answers": list()}
    for article in data_dict["data"]:
        for paragraph in article["paragraphs"]:
            for qa_pair in paragraph["qas"]:
                for ans in qa_pair["answers"]:
                    temp["answers"].append({"answer_start": [ans["answer_start"]], "text": [ans["text"]]})
                    temp["question"].append(qa_pair["question"])
                    temp["context"].append(paragraph["context"])
                    temp["title"].append(paragraph["document_id"])
                    temp["id"].append(qa_pair["id"])
    return temp.copy()


def load_data(split="dev"):
    """Load COVID-QA data."""
    data_dict = load_json_file_to_dict(DATA_PATH[split])
    dataset = Dataset.from_dict(preprocess_data(data_dict))
    return dataset

# Load the Tokenizer

In [11]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)
pad_on_right = tokenizer.padding_side == "right"

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

# Tokenize Dataset

In [12]:
def tokenize_train_features(examples: dict):
    """Prepare training features by tokenizing and getting the start/end positions."""
    examples["question"] = [q.lstrip() for q in examples["question"]]

    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=MAX_LEN,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")
    tokenized_examples["start_positions"] = list()
    tokenized_examples["end_positions"] = list()

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                while (token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char):
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples


def tokenize_validation_features(examples: dict):
    """Prepare validation features by tokenizing and getting the start/end positions."""""
    examples["question"] = [q.lstrip() for q in examples["question"]]

    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=MAX_LEN,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    tokenized_examples["example_id"] = list()

    for i in range(len(tokenized_examples["input_ids"])):
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples


def prepare_dataset(dataset: Dataset, 
                    prepare_type="train"
                ):
    """Tokenize and prepare dataset for training and evaluation."""
    if prepare_type == "train":
        return dataset.map(tokenize_train_features, batched=True, remove_columns=dataset.column_names)
    elif prepare_type == "eval":
        return dataset.map(tokenize_validation_features, batched=True, remove_columns=dataset.column_names)


dataset = load_data(split="dev")
print(prepare_dataset(dataset, prepare_type="train"))

Map:   0%|          | 0/203 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 4224
})


# Post-Process Predictions

In [13]:
def postprocess(examples: list, 
                               features: list, 
                               raw_predictions: tuple, 
                               n_best_size=100, 
                               max_answer_length=100
                            ):
    """Post-processes the raw predictions to output a dictionary with the start and end predictions."""
    all_start_logits, all_end_logits = raw_predictions
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)
    predictions = collections.OrderedDict()
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
    for example_index, example in enumerate(tqdm(examples)):
        feature_indices = features_per_example[example_index]
        min_null_score = None
        valid_answers = list()
        context = example["context"]
        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            offset_mapping = features[feature_index]["offset_mapping"]
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = (start_logits[cls_index] + end_logits[cls_index])
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (start_index >= len(offset_mapping) or end_index >= len(offset_mapping) or offset_mapping[start_index] is None or offset_mapping[end_index] is None):
                        continue
                    if (end_index < start_index or end_index - start_index + 1 > max_answer_length):
                        continue
                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append({"score": start_logits[start_index] + end_logits[end_index], "text": context[start_char:end_char]})
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0}
        predictions[example["id"]] = best_answer["text"]

    return predictions

# Inference on Val Set

In [14]:
def inference(trainer: Trainer, 
              features: list, 
              dataset: Dataset):
    """Perform inference on a trained model."""
    # Inference to get raw predictions.
    raw_predictions = trainer.predict(features)
    features.set_format(type=features.format["type"], columns=list(features.features.keys()),)
    # Post-process raw predictions.
    final_predictions = postprocess(dataset, features, raw_predictions.predictions)
    pred_dict = {key: value for key, value in final_predictions.items()}
    return pred_dict

In [15]:
class QuestionAnsweringTrainer(Trainer):
    """Custom trainer for question answering."""
    def __init__(self, 
                 *args, 
                 **kwargs
                ):
        super().__init__(*args, **kwargs)


class QuestionAnsweringAdapterTrainer(QuestionAnsweringTrainer, 
                                      AdapterTrainer
                                    ):
    """Custom trainer for question answering with adapters."""
    pass

# Model Training and Prediction Generation

In [16]:
def train(name: str, 
          fine_tune=False, 
          adapter=False
        ):
    """Train the model."""
    display_stage("...Load Covid QA Data...")
    dev_dataset = load_data(split="dev")
    test_dataset = load_data(split="test")
    train_dataset = (load_data(split="train") if fine_tune else dev_dataset)

    display_stage("...Tokenization...")
    tokenized_dev_dataset = prepare_dataset(dev_dataset, prepare_type="train")
    tokenized_train_dataset = (
        prepare_dataset(train_dataset, prepare_type="train")
        if fine_tune
        else tokenized_dev_dataset
    )

    # get data collator
    data_collator = default_data_collator

    display_stage("...Loading Model...")
    # load model
    model = AutoModelForQuestionAnswering.from_pretrained(MODEL).to(DEVICE)

    # connect to wandb and initialize the run
    wandb.init(entity="kushagraseth-1996", project="nlp-203", group="covid-qa")

    # setup trainer
    args = TrainingArguments(
        name,
        do_train=True,
        do_eval=True,
        do_predict=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=WEIGHT_DECAY,
        dataloader_num_workers=NUM_WORKERS,  # Set to 0 when debugging and > 1 when running!
        report_to=["wandb"],
        local_rank=-1,
        load_best_model_at_end=True,
        seed=SEED,
        adafactor=ADAFACTOR,
        adam_beta1=ADAM_BETA1,
        adam_beta2=ADAM_BETA2,
        adam_epsilon=ADAM_EPSILON,
        lr_scheduler_type=LR_SCHEDULER_TYPE,
        logging_dir='./logs',
    )

    trainer = None
    if adapter:
        # setup adapters
        adapter_name = model.load_adapter(ADAPTER, source="hf")
        model.train_adapter(adapter_name)
        model.set_active_adapters(adapter_name)
        trainer = QuestionAnsweringAdapterTrainer(
            model,
            args,
            train_dataset=tokenized_train_dataset,
            eval_dataset=tokenized_dev_dataset,
            data_collator=data_collator,
            tokenizer=tokenizer,
        )
        display_stage("...Training Roberta Adapter...")
        trainer.train()
    elif fine_tune:
        trainer = Trainer(
            model,
            args,
            train_dataset=tokenized_train_dataset,
            eval_dataset=tokenized_dev_dataset,
            data_collator=data_collator,
            tokenizer=tokenizer,
        )
        display_stage("...Roberta Fine-Tuned...")
        trainer.train()
    else:
        display_stage("...Roberta Baseline...")

    # save model
    trainer.save_model(name)
    
    display_stage("...Inferencing on Dev dataset...")
    dev_features = prepare_dataset(dev_dataset, prepare_type="eval")
    dev_pred_dict = inference(trainer, dev_features, dev_dataset)
    write_dict_to_json(dev_pred_dict, f"predictions/{name}_dev_pred.json")

    display_stage("...Inferencing on Test dataset...")
    test_features = prepare_dataset(test_dataset, prepare_type="eval")
    test_pred_dict = inference(trainer, test_features, test_dataset)
    write_dict_to_json(test_pred_dict, f"predictions/{name}_test_pred.json")

train(NAME, 
      fine_tune=IS_FINE_TUNE, 
      adapter=IS_ADAPTER
    )

***************************************Load Covid QA Data...***************************************
******************************************Tokenization...******************************************


Map:   0%|          | 0/203 [00:00<?, ? examples/s]

******************************************Loading Model...******************************************


Downloading pytorch_model.bin:   0%|          | 0.00/496M [00:00<?, ?B/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading (…)ad076/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)/adapter_config.json:   0%|          | 0.00/574 [00:00<?, ?B/s]

Downloading (…)076/head_config.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading (…)ed69ead076/README.md:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading pytorch_model_head.bin:   0%|          | 0.00/7.22k [00:00<?, ?B/s]

Downloading pytorch_adapter.bin:   0%|          | 0.00/3.59M [00:00<?, ?B/s]

Model class 'RobertaModelWithHeads' of found prediction head does not match current model class.
***** Running training *****
  Num examples = 4224
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1584
  Number of trainable parameters = 896066
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


************************************Training Roberta Adapter...************************************


Epoch,Training Loss,Validation Loss
1,0.4689,0.2451
2,0.2713,0.221696
3,0.2608,0.216255


***** Running Evaluation *****
  Num examples = 4224
  Batch size = 8
Saving model checkpoint to roberta_adapter/checkpoint-528
Configuration saved in roberta_adapter/checkpoint-528/squad_v2/adapter_config.json
Module weights saved in roberta_adapter/checkpoint-528/squad_v2/pytorch_adapter.bin
Configuration saved in roberta_adapter/checkpoint-528/squad_v2/head_config.json
Module weights saved in roberta_adapter/checkpoint-528/squad_v2/pytorch_model_head.bin
tokenizer config file saved in roberta_adapter/checkpoint-528/tokenizer_config.json
Special tokens file saved in roberta_adapter/checkpoint-528/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4224
  Batch size = 8
Saving model checkpoint to roberta_adapter/checkpoint-1056
Configuration saved in roberta_adapter/checkpoint-1056/squad_v2/adapter_config.json
Module weights saved in roberta_adapter/checkpoint-1056/squad_v2/pytorch_adapter.bin
Configuration saved in roberta_adapter/checkpoint-1056/squad_v2/head_con

*************************************Inferencing on Dev dataset*************************************


Map:   0%|          | 0/203 [00:00<?, ? examples/s]

The following columns in the test set don't have a corresponding argument in `RobertaForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `RobertaForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 4224
  Batch size = 8


Post-processing 203 example predictions split into 4224 features.


  0%|          | 0/203 [00:00<?, ?it/s]

************************************Inferencing on Test dataset************************************


Map:   0%|          | 0/375 [00:00<?, ? examples/s]

The following columns in the test set don't have a corresponding argument in `RobertaForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `RobertaForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 11007
  Batch size = 8


Post-processing 375 example predictions split into 11007 features.


  0%|          | 0/375 [00:00<?, ?it/s]

# Evaluate Basline Scores

### Dev Set

In [17]:
!python evaluate.py covid-qa/covid-qa-dev.json predictions/roberta_baseline_dev_pred.json --out-file results/roberta_baseline_dev_eval.json

### Test Set

In [18]:
!python evaluate.py covid-qa/covid-qa-test.json predictions/roberta_baseline_test_pred.json --out-file results/roberta_baseline_test_eval.json

# Evaluate Roberta Fine-Tuned Scores



### Dev Set

In [19]:
# !python evaluate.py covid-qa/covid-qa-dev.json predictions/roberta_finetuned_dev_pred.json --out-file results/roberta_finetuned_dev_eval.json

### Test Set

In [20]:
# !python evaluate.py covid-qa/covid-qa-test.json predictions/roberta_finetuned_test_pred.json --out-file results/roberta_finetuned_test_eval.json

# Evaluate Roberta Adapter Scores

### Dev Set

In [21]:
!python evaluate.py covid-qa/covid-qa-dev.json predictions/roberta_adapter_dev_pred.json --out-file results/roberta_adapter_dev_eval.json

### Test Set

In [22]:
!python evaluate.py covid-qa/covid-qa-test.json predictions/roberta_adapter_test_pred.json --out-file results/roberta_adapter_test_eval.json