## NOTE

The code in this script is only partially done - it was an initial draft of the implementation. The implementation has since been fleshed out and moved to the Python files.

In [None]:
import torch

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
import os
# Send both Hugging Face cache variables to the same scratch directory.
os.environ.update(
    {
        'TRANSFORMERS_CACHE': '/scratch/kapilrk04/cache',
        'HF_DATASETS_CACHE': "/scratch/kapilrk04/cache",
    }
)

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook.
notebook_login()

In [None]:
# Display the device that was selected earlier in the notebook.
device

In [None]:
!pip install transformers datasets sentencepiece

In [None]:
!pip install accelerate -U

In [None]:
!pip install evaluate

In [None]:
!pip install pandas numpy

In [None]:
import csv

import pandas as pd

# Load the dev split as (sentence1, sentence2, label) rows.
# quoting=csv.QUOTE_NONE keeps double quotes inside the sentences as plain
# characters; with the default quote handling a stray '"' can swallow the
# following tabs/newlines and silently merge rows.
asnq_dev = pd.read_csv(
    "/home2/kapilrk04/anlp_proj/data_sets/asnq/dev.tsv",
    sep="\t",
    names=["sentence1", "sentence2", "label"],
    quoting=csv.QUOTE_NONE,
)

In [None]:
# Preview the first 15 rows of the dev split.
asnq_dev.head(15)

In [None]:
# Label distribution of the dev split. The Series method replaces the
# top-level pd.value_counts helper, which is deprecated in recent pandas.
asnq_dev["label"].value_counts()

In [None]:
import csv

# Load the train split as (sentence1, sentence2, label) rows.
# quoting=csv.QUOTE_NONE keeps double quotes inside the sentences as plain
# characters; with the default quote handling a stray '"' can swallow the
# following tabs/newlines and silently merge rows.
asnq_train = pd.read_csv(
    "/home2/kapilrk04/anlp_proj/data_sets/asnq/train.tsv",
    sep="\t",
    names=["sentence1", "sentence2", "label"],
    quoting=csv.QUOTE_NONE,
)

In [None]:
# Preview the first 15 rows of the train split.
asnq_train.head(15)

In [None]:
# Label distribution of the train split. The Series method replaces the
# top-level pd.value_counts helper, which is deprecated in recent pandas.
asnq_train["label"].value_counts()

In [None]:
# Build the binary training set: a 25% sample of the label-3 rows becomes
# class 0 and every label-4 row becomes class 1.
# .assign() returns a new frame, so the relabelling no longer writes into a
# filtered slice of asnq_train (which triggers SettingWithCopyWarning).
# The fixed random_state makes the negative sample reproducible across runs.
trainNeg = (
    asnq_train[asnq_train['label'] == 3]
    .sample(frac=0.25, random_state=42)
    .assign(label=0)
)
trainPos = asnq_train[asnq_train['label'] == 4].assign(label=1)

# The original row index is intentionally kept (not reset) on the combined frame.
train_set = pd.concat([trainNeg, trainPos])
# 1-based running id for every example in the combined frame.
train_set['idx'] = range(1, len(train_set) + 1)

In [None]:
# Preview the first rows of the relabelled training frame.
train_set.head()

In [None]:
# Class balance of the relabelled training frame. The Series method replaces
# the top-level pd.value_counts helper, which is deprecated in recent pandas.
train_set["label"].value_counts()

In [None]:
# Build the binary dev set: a 25% sample of the label-3 rows becomes class 0
# and every label-4 row becomes class 1.
# .assign() returns a new frame, so the relabelling no longer writes into a
# filtered slice of asnq_dev (which triggers SettingWithCopyWarning).
# The fixed random_state makes the negative sample reproducible across runs.
devNeg = (
    asnq_dev[asnq_dev['label'] == 3]
    .sample(frac=0.25, random_state=42)
    .assign(label=0)
)
devPos = asnq_dev[asnq_dev['label'] == 4].assign(label=1)

# The original row index is intentionally kept (not reset) on the combined frame.
dev_set = pd.concat([devNeg, devPos])
# 1-based running id for every example in the combined frame.
dev_set['idx'] = range(1, len(dev_set) + 1)

In [None]:
# Preview the first rows of the relabelled dev frame.
dev_set.head()

In [None]:
# Class balance of the relabelled dev frame. The Series method replaces the
# top-level pd.value_counts helper, which is deprecated in recent pandas.
dev_set["label"].value_counts()

In [None]:
from datasets import Dataset, load_dataset

# Wrap both pandas frames as Hugging Face Dataset objects.
train_dataset, dev_dataset = (
    Dataset.from_pandas(frame) for frame in (train_set, dev_set)
)

In [None]:
# Display the summary of the freshly converted training dataset.
train_dataset

In [None]:
# Drop the '__index_level_0__' column from both datasets.
# The membership check makes this cell safe to re-run: remove_columns raises
# when asked to drop a column that is no longer present.
if '__index_level_0__' in train_dataset.column_names:
    train_dataset = train_dataset.remove_columns('__index_level_0__')
if '__index_level_0__' in dev_dataset.column_names:
    dev_dataset = dev_dataset.remove_columns('__index_level_0__')

In [None]:
# Display the summary of the dev dataset after the column clean-up.
dev_dataset

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier shared by the tokenizer and the model loader.
model_checkpoint = "roberta-base"
# Load the tokenizer for that checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``sentence1``/``sentence2`` columns of *examples* as pairs.

    Uses the module-level ``tokenizer`` with truncation enabled and returns
    whatever the tokenizer call produces.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

# Tokenize both splits; batched=True hands preprocess_function batches of
# rows instead of one row at a time.
encoded_train_dataset = train_dataset.map(preprocess_function, batched=True)
encoded_dev_dataset = dev_dataset.map(preprocess_function, batched=True)

In [None]:
# Display the summary of the tokenized dev dataset.
encoded_dev_dataset

In [None]:
# Inspect the first five tokenized training examples.
encoded_train_dataset[:5]

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint as a sequence classifier with two output labels,
# matching the 0/1 labels assigned to the train and dev frames.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

In [None]:
import numpy as np
from sklearn.metrics import average_precision_score

def split_array_by_number(arr, number):
    """Return the first non-empty run of items in *arr* that precedes *number*.

    Leading occurrences of *number* are skipped. Collection stops at the first
    occurrence of *number* that follows at least one other item. If *number*
    never occurs after the collected items, everything collected is returned.

    The result is always a flat list, so ``tuple(result)`` is hashable whenever
    the items are. Previously the no-separator case returned a nested list.

    Args:
        arr: Iterable of items comparable to *number* with ``==``.
        number: Separator value that ends the first run.

    Returns:
        A flat list with the items of the first run (possibly empty).
    """
    first_run = []
    for item in arr:
        if item != number:
            first_run.append(item)
        elif first_run:
            # First separator after real content: the run is complete.
            break
    return first_run

def compute_metrics(eval_pred):
    """Compute per-question ranking metrics (mAP and mRR) for the Trainer.

    Examples are grouped by question: the group key is the run of input ids
    that ``split_array_by_number`` returns for the split id of the current
    module-level ``model_name``. Inside each group the candidates are ranked by
    how strongly the model prefers class 1 over class 0. Groups that lack
    either a positive or a negative label are skipped.

    Args:
        eval_pred: Triple ``(predictions, labels, inputs)``. ``predictions`` is
            expected to hold one row of two class logits per example (the
            model is loaded with two labels), ``labels`` the 0/1 targets and
            ``inputs`` the token-id rows.

    Returns:
        dict with ``"mAP"`` (mean of the per-question average precision) and
        ``"mRR"`` (mean reciprocal rank of the best-ranked positive). Both are
        0.0 when no question has both a positive and a negative candidate.
    """
    predictions, labels, inputs = eval_pred

    # Token id to split on for each supported checkpoint (presumably each
    # tokenizer's separator token -- confirm). Unknown names keep the old
    # fallback of 0.
    split_ids = {
        "roberta-base": 2,
        "bert-base-uncased": 102,
        "albert-base-v2": 3,
        "distilbert-base-uncased": 102,
    }
    splitnum = split_ids.get(model_name, 0)

    # The logit margin orders candidates exactly like the softmax probability
    # of class 1, so it can serve directly as the ranking score.
    logits = np.asarray(predictions)
    scores = logits[:, 1] - logits[:, 0]

    per_qn_inputs = {}
    for input_ids, score, label in zip(inputs, scores, labels):
        qn = tuple(split_array_by_number(input_ids, splitnum))
        group = per_qn_inputs.setdefault(qn, {"scores": [], "labels": []})
        group["scores"].append(score)
        group["labels"].append(int(label))

    avg_prec_scores = []
    reciprocal_ranks = []
    for group in per_qn_inputs.values():
        qn_labels = np.asarray(group["labels"])
        is_positive = qn_labels == 1
        # Ranking metrics are undefined without both a positive and a negative.
        if not is_positive.any() or not (qn_labels == 0).any():
            continue
        qn_scores = np.asarray(group["scores"])

        avg_prec_scores.append(
            average_precision_score(is_positive.astype(int), qn_scores)
        )

        # Rank candidates from highest to lowest score and locate the first
        # positive; its 1-based position gives the reciprocal rank.
        ranking = np.argsort(-qn_scores, kind="stable")
        first_hit = np.flatnonzero(is_positive[ranking])[0]
        reciprocal_ranks.append(1.0 / (first_hit + 1))

    # Avoid NaN (and numpy's empty-mean warning) when every question was skipped.
    map_score = float(np.mean(avg_prec_scores)) if avg_prec_scores else 0.0
    mrr_score = float(np.mean(reciprocal_ranks)) if reciprocal_ranks else 0.0

    return {
        "mAP" : map_score,
        "mRR" : mrr_score
    }

In [None]:
# Inspect the first fifteen tokenized training examples.
encoded_train_dataset[:15]

In [None]:
# compute_metrics reads model_name to choose the token id it splits on.
model_name = "roberta-base"
# NOTE(review): batch_size is not used below; both per-device batch sizes are
# hard-coded to 8 -- confirm which value is intended.
batch_size = 16
# Single source for the epoch count, which also appears in the output
# directory and the run name.
num_epochs = 9

args = TrainingArguments(
    output_dir=f"/scratch/kapilrk04/{model_name}_transfer_(epochs={num_epochs})",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # compute_metrics unpacks the inputs from eval_pred, so they must be kept.
    include_inputs_for_metrics=True,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    fp16=False,
    report_to="wandb",
    run_name=f"{model_name}_transfer_(epochs={num_epochs})",
)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()