## A notebook to look at how the base models perform on our data without any training

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably needed because run_base_mode below sets push_to_hub=True — confirm.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sentence_transformers import losses

# Importing BERT modules
import bert
import tensorflow as tf
import tensorflow_hub as hub
# from bert import run_classifier
from bert import optimization
from bert import tokenization

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
import torch.optim as optim
import torch
from sentence_transformers.cross_encoder import CrossEncoder
from transformers import TrainingArguments
from transformers import Trainer

from datasets import load_metric
from datasets import load_dataset
import datasets
import tqdm
import pickle
import time
from sklearn import metrics

In [None]:
# Paths of the pickled train and test DataFrames. Change them here only:
# the loading code below reads these variables instead of repeating the names.
train = 'train.pkl'
test = 'test.pkl'

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load pickles that come from a trusted source.
with open(train, 'rb') as f:
    df_train = pickle.load(f)

with open(test, 'rb') as g:
    df_test = pickle.load(g)

# Store the label as float64 in both splits.
df_train['label'] = df_train['label'].astype(np.float64)
df_test['label'] = df_test['label'].astype(np.float64)

# Keep only the two text columns and the label.
df_train = df_train[["doc_text", "Query", "label"]]
df_test = df_test[["doc_text", "Query", "label"]]

# Wrap both splits as Hugging Face datasets and bundle them in one DatasetDict.
hf_train = datasets.Dataset.from_pandas(df_train)
hf_test = datasets.Dataset.from_pandas(df_test)

hf = datasets.DatasetDict({"train": hf_train,
                           "test": hf_test})

In [None]:
# Models
# "cross-encoder/stsb-distilroberta-base"
# "cross-encoder/qnli-electra-base"

class BaseModel:
    """Evaluate a pretrained sequence-classification checkpoint without training it.

    The class tokenizes sentence pairs, loads the checkpoint with a single
    output logit and runs one ``Trainer.evaluate()`` pass that reports ROC AUC.
    """

    def __init__(self, model_checkpoint, df_train, df_test, sentence_1_key, sentence_2_key, tokenizer):
        """Store the evaluation configuration.

        Args:
            model_checkpoint: Hub identifier of the checkpoint to load,
                e.g. ``"cross-encoder/qnli-electra-base"``.
            df_train: Training DataFrame; stored on the instance only.
            df_test: Test DataFrame; stored on the instance only.
            sentence_1_key: Column holding the first text of each pair
                (e.g. ``'doc_text'``).
            sentence_2_key: Column holding the second text of each pair
                (e.g. ``'Query'``).
            tokenizer: Callable tokenizer applied to each text pair.
        """
        self.sentence1_key = sentence_1_key
        self.sentence2_key = sentence_2_key
        self.model_checkpoint = model_checkpoint
        self.tokenizer = tokenizer
        self.train = df_train
        self.test = df_test

    def preprocess_function(self, examples):
        """Tokenize one example as a (sentence 1, sentence 2) pair with truncation.

        Both fields are passed through ``str()`` first, so non-string values
        are tokenized by their string representation.
        """
        first_text = str(examples[self.sentence1_key])
        second_text = str(examples[self.sentence2_key])
        return self.tokenizer(first_text, second_text, truncation=True)

    def compute_loss(self, model, inputs, return_outputs=False):
        """Return the mean-squared-error loss between the model logits and the labels.

        This method is not passed to the ``Trainer`` built in ``run_base_mode``;
        it is kept as a standalone helper.

        Args:
            model: Callable model exposing ``config.num_labels``; it is called
                as ``model(**inputs)`` and its result must support
                ``.get('logits')``.
            inputs: Mapping of model inputs that contains a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                the loss alone.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Use the model handed in by the caller: the instance never stores a
        # model, and the loss class is reached through the imported `torch`.
        num_labels = model.config.num_labels
        loss_fct = torch.nn.MSELoss()
        loss = loss_fct(logits.view(-1, num_labels), labels.float().view(-1, num_labels))
        return (loss, outputs) if return_outputs else loss

    def compute_metrics(self, eval_pred):
        """Compute ROC AUC from a ``(predictions, labels)`` pair.

        Only the first column of ``predictions`` is used as the score.
        """
        predictions, labels = eval_pred
        scores = predictions[:, 0]
        return {"auc": metrics.roc_auc_score(y_true=labels, y_score=scores)}

    def run_base_mode(self, hf, batch_size):
        """Evaluate the untouched checkpoint on the ``"test"`` split.

        Args:
            hf: Dataset dictionary with ``"train"`` and ``"test"`` splits that
                contain the two sentence columns.
            batch_size: Per-device batch size for both training and evaluation
                arguments.

        Returns:
            The metrics returned by ``Trainer.evaluate()``.
        """
        torch.cuda.empty_cache()
        num_labels = 1  # single logit, used as the ranking score
        model = AutoModelForSequenceClassification.from_pretrained(
            self.model_checkpoint, num_labels=num_labels
        )
        # Drop the raw text columns named by the configured keys once tokenized.
        # NOTE(review): batch_size is documented as applying only with
        # batched=True, so it presumably has no effect here — confirm.
        encoded_hf = hf.map(
            self.preprocess_function,
            remove_columns=[self.sentence1_key, self.sentence2_key],
            batch_size=512,
        )

        metric_name = "auc"
        model_name = self.model_checkpoint.split("/")[-1]

        # Only evaluate() is called below, so no training step is run here.
        # NOTE(review): push_to_hub=True is kept from the original setup —
        # confirm it is wanted for an evaluation-only run.
        args = TrainingArguments(
            f"{model_name}-finetuned_auc",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=0,
            weight_decay=0.01,
            load_best_model_at_end=True,
            metric_for_best_model=metric_name,
            push_to_hub=True,
            eval_steps=1,
        )

        trainer = Trainer(
            model,
            args,
            train_dataset=encoded_hf["train"],
            eval_dataset=encoded_hf["test"],
            tokenizer=self.tokenizer,
            compute_metrics=self.compute_metrics,
        )

        # Hand the metrics back so the calling cell can display or reuse them.
        return trainer.evaluate()

In [None]:
# Tokenizer for the same checkpoint that is evaluated below (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(
    "cross-encoder/qnli-electra-base",
    use_fast=True,
)

In [None]:
# Configure the evaluator: checkpoint, data frames, the two text columns and the tokenizer.
base_model = BaseModel(
    model_checkpoint="cross-encoder/qnli-electra-base",
    df_train=df_train,
    df_test=df_test,
    sentence_1_key="doc_text",
    sentence_2_key="Query",
    tokenizer=tokenizer,
)

In [None]:
# Run the evaluation pass with a per-device batch size of 8.
base_model.run_base_mode(hf, batch_size=8)