In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import os
# Point the Hugging Face model cache and dataset cache at the same scratch directory.
# NOTE(review): these are assigned before `datasets` / `transformers` are imported in
# the cells below — presumably so the libraries see them at import time; confirm.
os.environ['TRANSFORMERS_CACHE'] = '/scratch/kapilrk04/cache'
os.environ['HF_DATASETS_CACHE']="/scratch/kapilrk04/cache"

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
from datasets import Dataset, load_dataset

wiki_qa_dataset = load_dataset("wiki_qa")

wiki_qa_dataset["train"][:5]

In [None]:
def _group_answers_by_question(dataset_dict, split_names=("train", "validation", "test")):
    """Group the flat (question, answer, label) rows of each split by question id.

    Returns a dict ``{split: {question_id: entry}}`` where every entry holds the
    question text, the candidate answers and their labels in row order, and
    ``sum_labels``, the running sum of the labels for that question.
    """
    grouped = {}
    for split_name in split_names:
        by_question = {}
        for row in dataset_dict[split_name]:
            question_id = row["question_id"]
            # First time this question is seen: create its (empty) entry.
            if question_id not in by_question:
                by_question[question_id] = {
                    "question" : row["question"],
                    "answers" : [],
                    "labels" : [],
                    "sum_labels" : 0
                }
            entry = by_question[question_id]
            entry["answers"].append(row["answer"])
            entry["labels"].append(row["label"])
            entry["sum_labels"] += row["label"]
        grouped[split_name] = by_question
    return grouped


wiki_qa_set = _group_answers_by_question(wiki_qa_dataset)



In [None]:
import pandas as pd

def _question_answer_rows(questions):
    """Flatten grouped questions into one row per (question, candidate answer).

    Only questions whose candidates contain at least one positive and at least
    one non-positive label are kept (0 < sum_labels < number of labels).
    Each row is ``{"sentence1": question, "sentence2": answer, "label": label}``.
    """
    rows = []
    for entry in questions.values():
        if not (0 < entry["sum_labels"] < len(entry["labels"])):
            continue
        for answer, label in zip(entry["answers"], entry["labels"]):
            rows.append({"sentence1" : entry["question"], "sentence2" : answer, "label" : label})
    return rows


wiki_qa_trainp = pd.DataFrame(_question_answer_rows(wiki_qa_set["train"]))
wiki_qa_validationp = pd.DataFrame(_question_answer_rows(wiki_qa_set["validation"]))
wiki_qa_testp = pd.DataFrame(_question_answer_rows(wiki_qa_set["test"]))

In [None]:
# Add a 1-based running row id column `idx` to each split.
# Note: this mutates the three DataFrames in place.
wiki_qa_trainp['idx'] = range(1, len(wiki_qa_trainp)+1)
wiki_qa_validationp['idx'] = range(1, len(wiki_qa_validationp)+1)
wiki_qa_testp['idx'] = range(1, len(wiki_qa_testp)+1)

In [None]:
# Convert the filtered pandas frames into `datasets.Dataset` objects for the Trainer cells.
wiki_train_ds = Dataset.from_pandas(wiki_qa_trainp)
wiki_test_ds = Dataset.from_pandas(wiki_qa_testp)
wiki_valid_ds = Dataset.from_pandas(wiki_qa_validationp)

# Sizes are printed in the order train, test, validation.
print(len(wiki_train_ds), len(wiki_test_ds), len(wiki_valid_ds))

In [None]:
wiki_train_ds

In [None]:
model_name = "distilbert-base-uncased"

In [None]:
# Maps each Hugging Face model name to a local checkpoint directory. Later cells load
# both the tokenizer and the sequence-classification model from these paths.
# NOTE(review): presumably these are the checkpoints produced by an earlier transfer
# fine-tuning step — confirm where they come from.
model_checkpoints = {
    "distilbert-base-uncased": "/scratch/kapilrk04/best-distilbert/checkpoint-21468",
    "roberta-base": "/scratch/kapilrk04/best-roberta/checkpoint-21468",
    "bert-base-uncased": "/scratch/kapilrk04/best-bert/checkpoint-37569",
    "albert-base-v2": "/scratch/kapilrk04/best-albert/checkpoint-16101"
}

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoints[model_name], use_fast=True)

In [None]:
def preprocess_function(examples):
    """Tokenize ``sentence1``/``sentence2`` as a sentence pair, with truncation.

    Uses the notebook-level ``tokenizer``, i.e. whichever tokenizer is bound to
    that name at the moment this function is called.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

# encoded_train_dataset = wiki_train_ds.map(preprocess_function, batched=True)
# encoded_dev_dataset = wiki_valid_ds.map(preprocess_function, batched=True)
# encoded_test_dataset = wiki_test_ds.map(preprocess_function, batched=True)

In [None]:
tokenizer('SEP')

In [None]:
def split_array_by_number(arr, number):
    """Return the first non-empty run of items in ``arr`` that precedes ``number``.

    ``compute_metrics`` turns the result into a tuple and uses it as a dictionary
    key, so the return value is always a flat list of items.

    Parameters
    ----------
    arr : iterable
        Sequence of items (token ids in this notebook).
    number :
        Delimiter value. Delimiters that occur before any other item has been
        collected are skipped.

    Returns
    -------
    list
        The items seen before the first delimiter that follows at least one
        non-delimiter item. If no such delimiter exists, all non-delimiter
        items are returned (an empty list for empty input).
    """
    leading_segment = []

    for item in arr:
        if item == number:
            if leading_segment:
                # First delimiter after some content: the leading segment is complete.
                return leading_segment
            # Delimiter before any content: nothing to close yet, keep scanning.
            continue
        leading_segment.append(item)

    # No closing delimiter was found; still return a flat list so the result
    # stays usable as tuple(...) dictionary key material.
    return leading_segment

In [None]:
import numpy as np
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import OneHotEncoder

def compute_metrics(eval_pred):
    """Compute mean average precision (mAP) and mean reciprocal rank (MRR).

    Examples are grouped into per-question candidate lists by the token ids that
    precede the first separator token of each input (see
    ``split_array_by_number``). Questions whose candidates are all positive or
    all negative are skipped, since ranking is undefined for them.

    Within a question, candidates are ranked by ``logit[1] - logit[0]``
    (monotone in the softmax probability of the positive class):

    * AP is ``average_precision_score`` of the binary labels against that score.
    * The reciprocal rank is ``1 / rank`` of the highest-ranked positive candidate.

    Parameters
    ----------
    eval_pred : tuple
        ``(predictions, labels, inputs)``.
        NOTE(review): assumes ``predictions`` has shape (n_examples, 2) (models in
        this notebook are loaded with ``num_labels=2``), ``labels`` are 0/1, and
        ``inputs`` holds the token ids of each example (the Trainer cells set
        ``include_inputs_for_metrics=True``) — confirm.

    Returns
    -------
    dict
        ``{"mAP": ..., "mRR": ...}``; both values are NaN when no question
        qualifies for ranking.
    """
    predictions, labels, inputs = eval_pred

    # Token id used to cut the question off the front of each input, per model.
    # NOTE(review): hard-coded; presumably each tokenizer's separator id — verify
    # against `tokenizer.sep_token_id`. Unknown model names fall back to 0.
    separator_ids = {
        "roberta-base": 2,
        "bert-base-uncased": 102,
        "albert-base-v2": 3,
        "distilbert-base-uncased": 102,
    }
    splitnum = separator_ids.get(model_name, 0)

    # Group predictions and labels by question (the tokens before the separator).
    per_qn_inputs = {}
    for i in range(len(inputs)):
        qn = tuple(split_array_by_number(inputs[i], splitnum))
        group = per_qn_inputs.setdefault(qn, {"predictions": [], "labels": []})
        group["predictions"].append(predictions[i])
        group["labels"].append(labels[i])

    avg_prec_scores = []
    reciprocal_ranks = []

    for group in per_qn_inputs.values():
        qn_labels = np.asarray(group["labels"]).astype(int)
        num_positive = int(qn_labels.sum())
        # Ranking metrics need at least one positive and one negative candidate.
        if num_positive == 0 or num_positive == len(qn_labels):
            continue

        qn_logits = np.asarray(group["predictions"], dtype=float)
        # Relevance score of each candidate: how strongly class 1 beats class 0.
        scores = qn_logits[:, 1] - qn_logits[:, 0]

        avg_prec_scores.append(average_precision_score(qn_labels, scores))

        # Candidates from best to worst; stable sort keeps ties in input order.
        ranking = np.argsort(-scores, kind="stable")
        first_relevant = np.flatnonzero(qn_labels[ranking] == 1)[0]
        reciprocal_ranks.append(1.0 / (first_relevant + 1))

    # Avoid numpy's empty-mean warning when nothing could be ranked.
    map_score = np.mean(avg_prec_scores) if avg_prec_scores else float("nan")
    mrr_score = np.mean(reciprocal_ranks) if reciprocal_ranks else float("nan")

    print("mAP: ", map_score)
    print("mRR: ", mrr_score)
    return {
        "mAP" : map_score,
        "mRR" : mrr_score
    }

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoints[model_name], num_labels=2)

In [None]:
#getting model params
# Dedicated loop names are used so the notebook-wide `model_name` and `model`
# (read by compute_metrics and by the training cells below) are not overwritten.
for param_model_name in ["bert-base-uncased", 'roberta-base', 'distilbert-base-uncased', 'albert-base-v2']:
    param_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoints[param_model_name], num_labels=2)
    num_parameters = sum(p.numel() for p in param_model.parameters())
    print(f"Number of model parameters in {param_model_name}: {num_parameters}")

In [None]:
import warnings
warnings.filterwarnings("ignore")

### Check stability

In [None]:
def train_loop(epochs):
    """Train and evaluate two models on WikiQA for ``epochs`` epochs each.

    For the notebook-wide ``model_name`` two models are loaded: the local
    checkpoint from ``model_checkpoints`` (run printed as "TANDA") and the
    model loaded directly by name (run printed as "BERT FT"). Each is trained
    with per-epoch evaluation on the validation split and then evaluated once
    on the test split.

    Parameters
    ----------
    epochs : int
        Passed to ``TrainingArguments`` as ``num_train_epochs`` for both runs.

    Returns
    -------
    None
        Metrics are printed by ``compute_metrics`` and reported by the Trainer
        (``report_to="wandb"``).
    """
    batch_size = 8

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoints[model_name], use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoints[model_name], num_labels=2)
    base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    def encode(examples):
        # Tokenize with the tokenizer loaded above. The notebook-level tokenizer
        # read by preprocess_function may belong to a different model.
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

    encoded_train_dataset = wiki_train_ds.map(encode, batched=True)
    encoded_dev_dataset = wiki_valid_ds.map(encode, batched=True)
    encoded_test_dataset = wiki_test_ds.map(encode, batched=True)

    def make_args(output_dir, run_name):
        # Both runs share every hyper-parameter; only the output dir and run name differ.
        return TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy = "epoch",
            save_strategy="epoch",
            learning_rate=1e-6,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            include_inputs_for_metrics = True,  # compute_metrics needs the input ids
            num_train_epochs=epochs,
            weight_decay=0.01,
            fp16=False,
            report_to="wandb",
            run_name=run_name
        )

    def make_trainer(net, args, eval_dataset):
        # All trainers train on the same data; they differ in model, args and eval split.
        return Trainer(
            net,
            args,
            train_dataset=encoded_train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

    args1 = make_args(
        f"/scratch/kapilrk04/{model_name}_adapt_model_wikiqa",
        f"tanda-{model_name}-eval-wikiqa",
    )
    args2 = make_args(
        f"/scratch/kapilrk04/{model_name}_ft_model_wikiqa",
        f"ft-{model_name}-eval-wikiqa",
    )

    # trainer1 / trainer11 evaluate on the validation split while training;
    # trainer2 / trainer21 wrap the same model objects and evaluate on test.
    trainer1 = make_trainer(model, args1, encoded_dev_dataset)
    trainer11 = make_trainer(base_model, args2, encoded_dev_dataset)
    trainer2 = make_trainer(model, args1, encoded_test_dataset)
    trainer21 = make_trainer(base_model, args2, encoded_test_dataset)

    print("BERT FT")
    trainer11.train()
    trainer21.evaluate()

    print("TANDA")
    trainer1.train()
    trainer2.evaluate()

    

In [None]:
# Repeat the comparison with a training budget of 1 through 6 epochs.
# `epoch` is the total number of epochs for that run, not an epoch index;
# train_loop reloads both models on every call.
for epoch in range(1, 7):
    print("Epoch ", epoch)
    train_loop(epoch)

In [None]:
ft_maps = []

## For WikiQA

In [None]:
# Select DistilBERT explicitly, as every other per-model cell does, instead of
# relying on whatever value `model_name` was left with by earlier cells.
model_name = "distilbert-base-uncased"

batch_size = 8

tokenizer = AutoTokenizer.from_pretrained(model_checkpoints[model_name], use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoints[model_name], num_labels=2)

# preprocess_function reads the notebook-level `tokenizer` assigned above.
encoded_train_dataset = wiki_train_ds.map(preprocess_function, batched=True)
encoded_dev_dataset = wiki_valid_ds.map(preprocess_function, batched=True)
encoded_test_dataset = wiki_test_ds.map(preprocess_function, batched=True)

args1 = TrainingArguments(
    output_dir=f"/scratch/kapilrk04/{model_name}_adapt_model_wikiqa",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    include_inputs_for_metrics = True,  # compute_metrics needs the input ids
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,
    report_to="wandb",
    run_name=f"tanda-{model_name}-eval-wikiqa"
)

# trainer1 trains and evaluates on the validation split.
trainer1 = Trainer(
    model,
    args1,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# trainer2 wraps the same model object and evaluates on the test split.
trainer2 = Trainer(
    model,
    args1,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
device

### DISTILBERT

#### Evaluation without Adapt Step

In [None]:
trainer2.evaluate()

### Adapt Step 

In [None]:
trainer1.train()

In [None]:
trainer2.evaluate()

### RoBERTa

#### Evaluation

In [None]:
# Switch the notebook-wide model_name (also read by compute_metrics) to RoBERTa
# and reload the tokenizer and model from the local checkpoint.
model_name = "roberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoints[model_name], use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoints[model_name], num_labels=2)

# Re-tokenize WikiQA: preprocess_function reads the `tokenizer` assigned above.
encoded_train_dataset = wiki_train_ds.map(preprocess_function, batched=True)
encoded_dev_dataset = wiki_valid_ds.map(preprocess_function, batched=True)
encoded_test_dataset = wiki_test_ds.map(preprocess_function, batched=True)

batch_size = 8

args1 = TrainingArguments(
    output_dir=f"/scratch/kapilrk04/{model_name}_adapt_model_wikiqa",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    include_inputs_for_metrics = True,  # compute_metrics needs the input ids
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,
    report_to="wandb",
    run_name=f"tanda-{model_name}-eval-wikiqa"
)

# trainer1 trains and evaluates on the validation split.
trainer1 = Trainer(
    model,
    args1,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# trainer2 wraps the same model object and evaluates on the test split.
trainer2 = Trainer(
    model,
    args1,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer2.evaluate()

### Adapt Step

In [None]:
trainer1.train()

In [None]:
trainer2.evaluate()

### BERT-base

#### Evaluation

In [None]:
# Switch the notebook-wide model_name (also read by compute_metrics) to BERT-base
# and reload the tokenizer and model from the local checkpoint.
model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoints[model_name], use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoints[model_name], num_labels=2)

# Re-tokenize WikiQA: preprocess_function reads the `tokenizer` assigned above.
encoded_train_dataset = wiki_train_ds.map(preprocess_function, batched=True)
encoded_dev_dataset = wiki_valid_ds.map(preprocess_function, batched=True)
encoded_test_dataset = wiki_test_ds.map(preprocess_function, batched=True)

batch_size = 8

args1 = TrainingArguments(
    output_dir=f"/scratch/kapilrk04/{model_name}_adapt_model_wikiqa",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    include_inputs_for_metrics = True,  # compute_metrics needs the input ids
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,
    report_to="wandb",
    run_name=f"tanda-{model_name}-eval-wikiqa"
)

# trainer1 trains and evaluates on the validation split.
trainer1 = Trainer(
    model,
    args1,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# trainer2 wraps the same model object and evaluates on the test split.
trainer2 = Trainer(
    model,
    args1,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

#### Evaluation without adapt

In [None]:
trainer2.evaluate()

### Adapt Step

In [None]:
trainer1.train()

In [None]:
trainer2.evaluate()

### ALBERT

In [None]:
# Switch the notebook-wide model_name (also read by compute_metrics) to ALBERT
# and reload the tokenizer and model from the local checkpoint.
model_name = "albert-base-v2"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoints[model_name], use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoints[model_name], num_labels=2)

# Re-tokenize WikiQA: preprocess_function reads the `tokenizer` assigned above.
encoded_train_dataset = wiki_train_ds.map(preprocess_function, batched=True)
encoded_dev_dataset = wiki_valid_ds.map(preprocess_function, batched=True)
encoded_test_dataset = wiki_test_ds.map(preprocess_function, batched=True)

batch_size = 8

args1 = TrainingArguments(
    output_dir=f"/scratch/kapilrk04/{model_name}_adapt_model_wikiqa",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    include_inputs_for_metrics = True,  # compute_metrics needs the input ids
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,
    report_to="wandb",
    run_name=f"tanda-{model_name}-eval-wikiqa"
)

# trainer1 trains and evaluates on the validation split.
trainer1 = Trainer(
    model,
    args1,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# trainer2 wraps the same model object and evaluates on the test split.
trainer2 = Trainer(
    model,
    args1,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

#### Evaluation without adapt

In [None]:
trainer2.evaluate()

#### ADAPT then eval

In [None]:
trainer1.train()

In [None]:
trainer2.evaluate()

## For TrecQA

In [None]:
# Load the TrecQA train split from a tab-separated file; `names=` supplies the column names.
# NOTE(review): assumes the file has no header row (a header would be read as data) — confirm.
trecqa_train = pd.read_csv("/home2/kapilrk04/anlp_proj/data_sets/TrecQA/train.tsv", sep="\t", names=["label", "sentence1", "sentence2"])
trecqa_train.head()

In [None]:
# Load the TrecQA test split from a tab-separated file; `names=` supplies the column names.
# NOTE(review): assumes the file has no header row (a header would be read as data) — confirm.
trecqa_test = pd.read_csv("/home2/kapilrk04/anlp_proj/data_sets/TrecQA/test.tsv", sep="\t", names=["label", "sentence1", "sentence2"])
trecqa_test.head()

In [None]:
# Load the TrecQA dev split from a tab-separated file; `names=` supplies the column names.
# NOTE(review): assumes the file has no header row (a header would be read as data) — confirm.
trecqa_dev = pd.read_csv("/home2/kapilrk04/anlp_proj/data_sets/TrecQA/dev.tsv", sep="\t", names=["label", "sentence1", "sentence2"])
trecqa_dev.head()

In [None]:
# Add a 1-based running row id column `idx` to each TrecQA split (mutates the frames in place).
trecqa_train['idx'] = range(1, len(trecqa_train)+1)
trecqa_dev['idx'] = range(1, len(trecqa_dev)+1)
trecqa_test['idx'] = range(1, len(trecqa_test)+1)

In [None]:
# Label distribution of the TrecQA train split (Series.value_counts replaces the
# deprecated top-level pd.value_counts).
trecqa_train['label'].value_counts()

In [None]:
# Label distribution of the TrecQA dev split (Series.value_counts replaces the
# deprecated top-level pd.value_counts).
trecqa_dev['label'].value_counts()

In [None]:
# Label distribution of the TrecQA test split (Series.value_counts replaces the
# deprecated top-level pd.value_counts).
trecqa_test['label'].value_counts()

In [None]:
# Convert the TrecQA pandas frames into `datasets.Dataset` objects for the Trainer cells.
trecqa_train_ds = Dataset.from_pandas(trecqa_train)
trecqa_test_ds = Dataset.from_pandas(trecqa_test)
trecqa_valid_ds = Dataset.from_pandas(trecqa_dev)

In [None]:
# Tokenize TrecQA with whichever tokenizer is currently bound to the notebook-level
# `tokenizer` (read by preprocess_function).
# NOTE(review): the DistilBERT cell below reloads the tokenizer and reassigns these
# three variables before any trainer uses them, so this pass looks redundant — confirm.
encoded_trecqa_train_ds = trecqa_train_ds.map(preprocess_function, batched=True)
encoded_trecqa_test_ds = trecqa_test_ds.map(preprocess_function, batched=True)
encoded_trecqa_valid_ds = trecqa_valid_ds.map(preprocess_function, batched=True)

### DISTILBert 

In [None]:
# Switch the notebook-wide model_name (also read by compute_metrics) to DistilBERT
# and reload the tokenizer and model from the local checkpoint.
model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoints[model_name], use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoints[model_name], num_labels=2)

# Re-tokenize TrecQA: preprocess_function reads the `tokenizer` assigned above.
encoded_trecqa_train_ds = trecqa_train_ds.map(preprocess_function, batched=True)
encoded_trecqa_test_ds = trecqa_test_ds.map(preprocess_function, batched=True)
encoded_trecqa_valid_ds = trecqa_valid_ds.map(preprocess_function, batched=True)

batch_size = 8

args2 = TrainingArguments(
    output_dir=f"/scratch/kapilrk04/{model_name}_adapt_model_trecqa",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    include_inputs_for_metrics = True,  # compute_metrics needs the input ids
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,
    report_to="wandb",
    run_name=f"tanda-{model_name}-eval-trecqa"
)

# trainer1 trains and evaluates on the validation split.
trainer1 = Trainer(
    model,
    args2,
    train_dataset=encoded_trecqa_train_ds,
    eval_dataset=encoded_trecqa_valid_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# trainer2 wraps the same model object and evaluates on the test split.
trainer2 = Trainer(
    model,
    args2,
    train_dataset=encoded_trecqa_train_ds,
    eval_dataset=encoded_trecqa_test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

#### Eval without Adapt

In [None]:
trainer2.evaluate()

#### Adapt and eval

In [None]:
trainer1.train()

In [None]:
trainer2.evaluate()

### RoBERTa

In [None]:
# Switch the notebook-wide model_name (also read by compute_metrics) to RoBERTa
# and reload the tokenizer and model from the local checkpoint.
model_name = "roberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoints[model_name], use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoints[model_name], num_labels=2)

# Re-tokenize TrecQA: preprocess_function reads the `tokenizer` assigned above.
encoded_trecqa_train_ds = trecqa_train_ds.map(preprocess_function, batched=True)
encoded_trecqa_test_ds = trecqa_test_ds.map(preprocess_function, batched=True)
encoded_trecqa_valid_ds = trecqa_valid_ds.map(preprocess_function, batched=True)

batch_size = 8

args2 = TrainingArguments(
    output_dir=f"/scratch/kapilrk04/{model_name}_adapt_model_trecqa",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    include_inputs_for_metrics = True,  # compute_metrics needs the input ids
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,
    report_to="wandb",
    run_name=f"tanda-{model_name}-eval-trecqa"
)

# trainer1 trains and evaluates on the validation split.
trainer1 = Trainer(
    model,
    args2,
    train_dataset=encoded_trecqa_train_ds,
    eval_dataset=encoded_trecqa_valid_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# trainer2 wraps the same model object and evaluates on the test split.
trainer2 = Trainer(
    model,
    args2,
    train_dataset=encoded_trecqa_train_ds,
    eval_dataset=encoded_trecqa_test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

#### evaluate without adapt

In [None]:
trainer2.evaluate()

#### adapt then evaluate

In [None]:
trainer1.train()

In [None]:
trainer2.evaluate()

### BERT-base

In [None]:
# Switch the notebook-wide model_name (also read by compute_metrics) to BERT-base
# and reload the tokenizer and model from the local checkpoint.
model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoints[model_name], use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoints[model_name], num_labels=2)

# Re-tokenize TrecQA: preprocess_function reads the `tokenizer` assigned above.
encoded_trecqa_train_ds = trecqa_train_ds.map(preprocess_function, batched=True)
encoded_trecqa_test_ds = trecqa_test_ds.map(preprocess_function, batched=True)
encoded_trecqa_valid_ds = trecqa_valid_ds.map(preprocess_function, batched=True)

batch_size = 8

args2 = TrainingArguments(
    output_dir=f"/scratch/kapilrk04/{model_name}_adapt_model_trecqa",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    include_inputs_for_metrics = True,  # compute_metrics needs the input ids
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,
    report_to="wandb",
    run_name=f"tanda-{model_name}-eval-trecqa"
)

# trainer1 trains and evaluates on the validation split.
trainer1 = Trainer(
    model,
    args2,
    train_dataset=encoded_trecqa_train_ds,
    eval_dataset=encoded_trecqa_valid_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# trainer2 wraps the same model object and evaluates on the test split.
trainer2 = Trainer(
    model,
    args2,
    train_dataset=encoded_trecqa_train_ds,
    eval_dataset=encoded_trecqa_test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

#### evaluate without adapt

In [None]:
trainer2.evaluate()

#### adapt then evaluate

In [None]:
trainer1.train()

In [None]:
trainer2.evaluate()

### ALBERT

In [None]:
# Switch the notebook-wide model_name (also read by compute_metrics) to ALBERT
# and reload the tokenizer and model from the local checkpoint.
model_name = "albert-base-v2"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoints[model_name], use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoints[model_name], num_labels=2)

# Re-tokenize TrecQA: preprocess_function reads the `tokenizer` assigned above.
encoded_trecqa_train_ds = trecqa_train_ds.map(preprocess_function, batched=True)
encoded_trecqa_test_ds = trecqa_test_ds.map(preprocess_function, batched=True)
encoded_trecqa_valid_ds = trecqa_valid_ds.map(preprocess_function, batched=True)

batch_size = 8

args2 = TrainingArguments(
    output_dir=f"/scratch/kapilrk04/{model_name}_adapt_model_trecqa",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    include_inputs_for_metrics = True,  # compute_metrics needs the input ids
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,
    report_to="wandb",
    run_name=f"tanda-{model_name}-eval-trecqa"
)

# trainer1 trains and evaluates on the validation split. It must use the TrecQA
# arguments (args2): args1 still holds the WikiQA output directory and run name.
trainer1 = Trainer(
    model,
    args2,
    train_dataset=encoded_trecqa_train_ds,
    eval_dataset=encoded_trecqa_valid_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# trainer2 wraps the same model object and evaluates on the test split.
trainer2 = Trainer(
    model,
    args2,
    train_dataset=encoded_trecqa_train_ds,
    eval_dataset=encoded_trecqa_test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

#### evaluate without adapt

In [None]:
trainer2.evaluate()

#### adapt then evaluate

In [None]:
trainer1.train()

In [None]:
trainer2.evaluate()

### TESTING MODEL ROBUSTNESS

- By incorrectly labeling a portion of the labels

In [None]:
import random

def inject_random_noise(df, noise_level=0.2, seed=None):
    """Return a copy of ``df`` with a random fraction of its labels flipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'label'`` column. It is not modified.
    noise_level : float, default 0.2
        Fraction of rows to flip; ``int(len(df) * noise_level)`` rows are chosen
        without replacement.
    seed : int or None, default None
        When given, rows are drawn from a private ``random.Random(seed)`` so the
        result is reproducible. When ``None``, the global ``random`` module
        state is used.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each chosen row has label 1 replaced by 0 and
        any other label replaced by 1.
    """
    noisy_df = df.copy()
    num_samples_to_swap = int(len(noisy_df) * noise_level)
    rng = random if seed is None else random.Random(seed)
    swap_positions = rng.sample(range(len(noisy_df)), num_samples_to_swap)

    # Rows are sampled by position, so both the read and the write go through
    # .iloc; label-based access would hit the wrong rows (or append new ones)
    # whenever the index is not the default 0..n-1 range.
    label_col = noisy_df.columns.get_loc('label')
    for position in swap_positions:
        if noisy_df.iloc[position, label_col] == 1:
            noisy_df.iloc[position, label_col] = 0
        else:
            noisy_df.iloc[position, label_col] = 1

    return noisy_df

#### WikiQA

In [None]:
noisy_wiki_qa_trainp = inject_random_noise(wiki_qa_trainp, 0.2)

In [None]:
# Only the training split carries the injected label noise; test and validation
# are built from the original frames.
wikiqa_train_ds = Dataset.from_pandas(noisy_wiki_qa_trainp)
wikiqa_test_ds = Dataset.from_pandas(wiki_qa_testp)
wikiqa_valid_ds = Dataset.from_pandas(wiki_qa_validationp)

In [None]:
# Robustness run on noisy WikiQA labels with BERT-base: compares the model loaded
# by name (`base_model`) with the model loaded from the local checkpoint (`model`).
model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoints[model_name], use_fast=True)
base_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoints[model_name], num_labels=2)
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# preprocess_function reads the notebook-level `tokenizer` (the checkpoint tokenizer),
# so both models are fed inputs encoded with it.
encoded_train_dataset = wikiqa_train_ds.map(preprocess_function, batched=True)
encoded_dev_dataset = wikiqa_valid_ds.map(preprocess_function, batched=True)
encoded_test_dataset = wikiqa_test_ds.map(preprocess_function, batched=True)

batch_size = 8

# NOTE(review): args1 (run name "tanda-...") is given to the base_model trainers and
# args2 (run name "tanda-base-...") to the checkpoint-model trainers below — the
# naming looks swapped; confirm which is intended.
args1 = TrainingArguments(
    output_dir=f"/scratch/kapilrk04/{model_name}_adapt_model_noisy_wikiqa",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    include_inputs_for_metrics = True,  # compute_metrics needs the input ids
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,
    report_to="wandb",
    run_name=f"tanda-{model_name}-eval-noisy-wikiqa"
)

args2 = TrainingArguments(
    output_dir=f"/scratch/kapilrk04/{model_name}_adapt_base-model_noisy_wikiqa",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    include_inputs_for_metrics = True,  # compute_metrics needs the input ids
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,
    report_to="wandb",
    run_name=f"tanda-base-{model_name}-eval-noisy-wikiqa"
)


# Baseline (base_model): trainer11 trains with validation-split evaluation,
# trainer12 wraps the same model object and evaluates on the test split.
trainer11 = Trainer(
    base_model,
    args1,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_dev_dataset,
    tokenizer=base_tokenizer,
    compute_metrics=compute_metrics,
)

trainer12 = Trainer(
    base_model,
    args1,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    tokenizer=base_tokenizer,
    compute_metrics=compute_metrics,
)

# Checkpoint model: trainer21 trains with validation-split evaluation,
# trainer22 wraps the same model object and evaluates on the test split.
trainer21 = Trainer(
    model,
    args2,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer22 = Trainer(
    model,
    args2,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

#### baseline: adapt then evaluate

In [None]:
trainer11.train()

In [None]:
trainer12.evaluate()

#### transferred model - adapt then evaluate

In [None]:
trainer21.train()

In [None]:
trainer22.evaluate()

#### noise 0.1


In [None]:
noisy_wiki_qa_trainp = inject_random_noise(wiki_qa_trainp, 0.1)

In [None]:
# Label distribution of the noisy training frame (Series.value_counts replaces the
# deprecated top-level pd.value_counts).
noisy_wiki_qa_trainp['label'].value_counts()

In [None]:
# Label distribution of the validation frame (Series.value_counts replaces the
# deprecated top-level pd.value_counts).
wiki_qa_validationp['label'].value_counts()

In [None]:
# Label distribution of the test frame (Series.value_counts replaces the
# deprecated top-level pd.value_counts).
wiki_qa_testp['label'].value_counts()

In [None]:
# Only the training split carries the injected label noise; test and validation
# are built from the original frames.
wikiqa_train_ds = Dataset.from_pandas(noisy_wiki_qa_trainp)
wikiqa_test_ds = Dataset.from_pandas(wiki_qa_testp)
wikiqa_valid_ds = Dataset.from_pandas(wiki_qa_validationp)

In [None]:
# Robustness run on noisy WikiQA labels with BERT-base: compares the model loaded
# by name (`base_model`) with the model loaded from the local checkpoint (`model`).
model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoints[model_name], use_fast=True)
base_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoints[model_name], num_labels=2)
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# preprocess_function reads the notebook-level `tokenizer` (the checkpoint tokenizer),
# so both models are fed inputs encoded with it.
encoded_train_dataset = wikiqa_train_ds.map(preprocess_function, batched=True)
encoded_dev_dataset = wikiqa_valid_ds.map(preprocess_function, batched=True)
encoded_test_dataset = wikiqa_test_ds.map(preprocess_function, batched=True)

batch_size = 8

# NOTE(review): args1 (run name "tanda-...") is given to the base_model trainers and
# args2 (run name "tanda-base-...") to the checkpoint-model trainers below — the
# naming looks swapped; confirm which is intended.
args1 = TrainingArguments(
    output_dir=f"/scratch/kapilrk04/{model_name}_adapt_model_noisy_wikiqa",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    include_inputs_for_metrics = True,  # compute_metrics needs the input ids
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,
    report_to="wandb",
    run_name=f"tanda-{model_name}-eval-noisy-wikiqa"
)

args2 = TrainingArguments(
    output_dir=f"/scratch/kapilrk04/{model_name}_adapt_base-model_noisy_wikiqa",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    include_inputs_for_metrics = True,  # compute_metrics needs the input ids
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,
    report_to="wandb",
    run_name=f"tanda-base-{model_name}-eval-noisy-wikiqa"
)


# Baseline (base_model): trainer11 trains with validation-split evaluation,
# trainer12 wraps the same model object and evaluates on the test split.
# The baseline trainers get base_tokenizer, matching the 0.2-noise run.
trainer11 = Trainer(
    base_model,
    args1,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_dev_dataset,
    tokenizer=base_tokenizer,
    compute_metrics=compute_metrics,
)

trainer12 = Trainer(
    base_model,
    args1,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    tokenizer=base_tokenizer,
    compute_metrics=compute_metrics,
)

# Checkpoint model: trainer21 trains with validation-split evaluation,
# trainer22 wraps the same model object and evaluates on the test split.
trainer21 = Trainer(
    model,
    args2,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer22 = Trainer(
    model,
    args2,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

#### baseline: adapt then evaluate

In [None]:
trainer11.train()

In [None]:
trainer12.evaluate()

#### transferred model - adapt then evaluate

In [None]:
trainer21.train()

In [None]:
trainer22.evaluate()

#### no noise

In [None]:
# Label distribution of the original (noise-free) training frame
# (Series.value_counts replaces the deprecated top-level pd.value_counts).
wiki_qa_trainp['label'].value_counts()

In [None]:
# Label distribution of the validation frame (Series.value_counts replaces the
# deprecated top-level pd.value_counts).
wiki_qa_validationp['label'].value_counts()

In [None]:
# Label distribution of the test frame (Series.value_counts replaces the
# deprecated top-level pd.value_counts).
wiki_qa_testp['label'].value_counts()

In [None]:
# "No noise" run: train on the original labels (wiki_qa_trainp), not on the
# noisy frame left over from the previous section.
wikiqa_train_ds = Dataset.from_pandas(wiki_qa_trainp)
wikiqa_test_ds = Dataset.from_pandas(wiki_qa_testp)
wikiqa_valid_ds = Dataset.from_pandas(wiki_qa_validationp)

In [None]:
# Reference run for the robustness comparison with BERT-base: compares the model
# loaded by name (`base_model`) with the model loaded from the local checkpoint (`model`).
model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoints[model_name], use_fast=True)
base_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoints[model_name], num_labels=2)
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# preprocess_function reads the notebook-level `tokenizer` (the checkpoint tokenizer),
# so both models are fed inputs encoded with it.
encoded_train_dataset = wikiqa_train_ds.map(preprocess_function, batched=True)
encoded_dev_dataset = wikiqa_valid_ds.map(preprocess_function, batched=True)
encoded_test_dataset = wikiqa_test_ds.map(preprocess_function, batched=True)

batch_size = 8

# NOTE(review): output directories and run names are the same "noisy" ones used by
# the noise runs above, so checkpoints and W&B run names collide — confirm.
# NOTE(review): the baseline (args1) trains for 6 epochs while the checkpoint
# model (args2) trains for 3 — confirm this asymmetry is intended.
args1 = TrainingArguments(
    output_dir=f"/scratch/kapilrk04/{model_name}_adapt_model_noisy_wikiqa",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    include_inputs_for_metrics = True,  # compute_metrics needs the input ids
    num_train_epochs=6,
    weight_decay=0.01,
    fp16=False,
    report_to="wandb",
    run_name=f"tanda-{model_name}-eval-noisy-wikiqa"
)

args2 = TrainingArguments(
    output_dir=f"/scratch/kapilrk04/{model_name}_adapt_base-model_noisy_wikiqa",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    include_inputs_for_metrics = True,  # compute_metrics needs the input ids
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,
    report_to="wandb",
    run_name=f"tanda-base-{model_name}-eval-noisy-wikiqa"
)


# Baseline (base_model): trainer11 trains with validation-split evaluation,
# trainer12 wraps the same model object and evaluates on the test split.
# The baseline trainers get base_tokenizer, matching the 0.2-noise run.
trainer11 = Trainer(
    base_model,
    args1,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_dev_dataset,
    tokenizer=base_tokenizer,
    compute_metrics=compute_metrics,
)

trainer12 = Trainer(
    base_model,
    args1,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    tokenizer=base_tokenizer,
    compute_metrics=compute_metrics,
)

# Checkpoint model: trainer21 trains with validation-split evaluation,
# trainer22 wraps the same model object and evaluates on the test split.
trainer21 = Trainer(
    model,
    args2,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer22 = Trainer(
    model,
    args2,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

#### baseline: adapt then evaluate

In [None]:
trainer11.train()

In [None]:
trainer12.evaluate()

#### transferred model - adapt then evaluate

In [None]:
trainer21.train()

In [None]:
trainer22.evaluate()