## A notebook for training our models

We train with:

| Training data | Test data | Batch size | Learning rate | Weight decay |
| --- | --- | --- | --- | --- |
| 20000 | 5000 | 8 | 0.00002 | 0.01 |
| 20000 | 5000 | 8 | 0.0003 | 0.01 |
| 20000 | 5000 | 16 | 0.0003 | 0.01 |
| 20000 | 5000 | 4 | 0.0003 | 0.01 |
| 20000 | 5000 | 4 | 0.00001 | 0.01 |

In [None]:
!pip install -U sentence-transformers
!pip install -U rank_bm25 bert-tensorflow tensorflow datasets

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
!sudo apt-get update

In [None]:
!sudo apt-get -y install curl

In [None]:
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash

In [None]:
!sudo apt-get install git-lfs

In [None]:
!git lfs install

In [None]:
!pip install tensorflow_hub

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sentence_transformers import losses

# Importing BERT modules
import bert
import tensorflow as tf
import tensorflow_hub as hub
# from bert import run_classifier
from bert import optimization
from bert import tokenization

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
import torch.optim as optim
from sentence_transformers.cross_encoder import CrossEncoder
from transformers import TrainingArguments
from transformers import Trainer

from datasets import load_metric
from datasets import load_dataset
import datasets
import tqdm

## Pre-training

In [None]:
# Checkpoint that the tokenizer and the sequence-classification model are loaded from.
model_checkpoint = "cross-encoder/stsb-TinyBERT-L-4"

# Per-device batch size, used for both training and evaluation in TrainingArguments.
batch_size = 8

# Column names of the two texts that preprocess_function tokenizes as a pair,
# in this order: (sentence1_key, sentence2_key).
sentence1_key = 'doc_text'
sentence2_key = 'Query'

# Paths of the "top 3" train and test CSV files.
train = 'train_top3.csv'
test = 'test_top3.csv'

# Full, unsampled frames; the sampling cells below draw subsets from these.
df_train_full = pd.read_csv(train)
df_test_full = pd.read_csv(test)

In [None]:
#Shuffle train set and select 40000
# Shuffle the full training set with a fixed seed so the selected subset is
# reproducible across notebook runs.
# NOTE(review): the header table lists 20000 training rows while 40000 are
# kept here; confirm which size is intended.
df_train_shuffled = df_train_full.sample(frac=1, random_state=42)
# Keep at most the first 40000 shuffled rows. `.copy()` detaches the subset
# from the shuffled frame so later column assignments on `df_train` do not
# raise SettingWithCopyWarning or write into a temporary.
df_train = df_train_shuffled.iloc[:40000].copy()

In [None]:
#Shuffle test set and select 5000
# Shuffle the full test set with a fixed seed so the selected subset is
# reproducible across notebook runs.
df_test_shuffled = df_test_full.sample(frac=1, random_state=42)
# Keep at most the first 5000 shuffled rows. `.copy()` detaches the subset
# from the shuffled frame so later column assignments on `df_test` do not
# raise SettingWithCopyWarning or write into a temporary.
df_test = df_test_shuffled.iloc[:5000].copy()

## Tokenization

In [None]:
# Load the tokenizer published with `model_checkpoint`; use_fast=True requests
# the fast tokenizer implementation. preprocess_function and the Trainer both
# use this object.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [None]:
# Keep only the columns the model consumes. `.copy()` gives each frame its own
# data so the casts below never write into a slice of the shuffled frames.
df_train = df_train[["doc_text", "Query", "label"]].copy()
df_test = df_test[["doc_text", "Query", "label"]].copy()

# Normalise dtypes: float labels, plain-string text columns.
# The builtin `str` replaces `np.str`, an alias that was deprecated in
# NumPy 1.20 and removed in 1.24 (where it raises AttributeError).
for _frame in (df_train, df_test):
    _frame['label'] = _frame['label'].astype(np.float64)
    _frame['doc_text'] = _frame['doc_text'].astype(str)
    _frame['Query'] = _frame['Query'].astype(str)

In [None]:
#Create hf dataset
# Wrap the pandas frames as Hugging Face datasets.
# preserve_index=False: the frames were shuffled, so their index is no longer
# a default RangeIndex and would otherwise be stored as an extra
# `__index_level_0__` column next to the real features.
hf_train = datasets.Dataset.from_pandas(df_train, preserve_index=False)
hf_test = datasets.Dataset.from_pandas(df_test, preserve_index=False)

# Bundle both splits so they can be tokenized with a single `map` call.
hf = datasets.DatasetDict({
    "train": hf_train,
    "test": hf_test,
})

In [None]:
def preprocess_function(examples):
    """Tokenize the (sentence1_key, sentence2_key) text pair(s) of `examples`.

    Supports both ways a dataset `map` can call it:

    * per example: each column value is a single item, which is converted
      with ``str`` and tokenized as one pair;
    * batched: each column value is a list, whose items are converted
      one by one and tokenized as a batch of pairs. Calling ``str`` on the
      whole list (as the previous version did) would have produced a single
      string containing the list's repr.

    Args:
        examples: mapping from column name to a value (or list of values).

    Returns:
        The tokenizer output for the pair(s), with truncation enabled.
    """
    first = examples[sentence1_key]
    second = examples[sentence2_key]
    if isinstance(first, (list, tuple)):
        # Batched call: stringify element-wise, keeping the pairs aligned.
        first = [str(text) for text in first]
        second = [str(text) for text in second]
    else:
        first, second = str(first), str(second)
    return tokenizer(first, second, truncation=True)

In [None]:
# Tokenize both splits and drop the raw text columns, which the model does
# not consume. The mapping runs one example at a time (batched defaults to
# False); `batch_size` is only honoured together with batched=True, so the
# former batch_size=512 argument was a no-op and has been dropped.
# `remove_columns` is passed as a list, the form documented by `datasets`.
encoded_hf = hf.map(preprocess_function, remove_columns=["doc_text", "Query"])

## Training

In [None]:
# Load the checkpoint with a sequence-classification head configured for a
# single label (one output per text pair).
# NOTE(review): with num_labels=1 transformers presumably applies its
# regression loss when labels are passed to the model — confirm this is the
# intended training objective.
num_labels = 1
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

In [None]:
from sklearn import metrics

# Key returned by compute_metrics; used below to select the best checkpoint.
metric_name = "auc"
# Last path component of the checkpoint id, e.g. "stsb-TinyBERT-L-4".
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    # Output directory (and Hub repository name, since push_to_hub=True).
    # The sample-size tag is derived from the training frame actually used,
    # so the name cannot drift from the data: the previous literal said 20000
    # while the sampling cell keeps 40000 rows and the evaluation section
    # loads a "...40000..." path.
    f"{model_name}-finetuned_auc_{len(df_train)}-top3-BCE",
    # Evaluate and checkpoint once per epoch. The former `eval_steps=1` was
    # removed: it only applies to the "steps" evaluation strategy.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    # NOTE(review): the header table lists a weight decay of 0.01 for every
    # run; confirm whether 0.1 is intended here.
    weight_decay=0.1,
    # Reload the checkpoint with the best `metric_name` once training ends.
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=True,
)

In [None]:
def compute_loss(model, inputs, return_outputs=False):
    """Binary cross-entropy loss computed on the model's raw logits.

    Fixes relative to the previous version:

    * ``self.model.config`` was referenced although this is a plain function
      with no ``self``; the label count is now read from ``model.config``.
    * ``nn`` was never imported in the notebook; it is imported locally.
    * ``BCELoss`` expects probabilities in [0, 1], but the model returns
      unbounded logits; ``BCEWithLogitsLoss`` applies the sigmoid itself.

    Args:
        model: callable accepting ``**inputs`` and returning a mapping with a
            ``'logits'`` entry; must expose ``config.num_labels``.
        inputs: mapping of model inputs that also contains ``"labels"``.
        return_outputs: when true, return ``(loss, outputs)`` instead of
            only the loss.

    Returns:
        The scalar loss tensor, or ``(loss, outputs)`` if `return_outputs`.
    """
    # Local import keeps the function self-contained within the notebook.
    from torch import nn

    labels = inputs.get("labels")
    outputs = model(**inputs)
    logits = outputs.get('logits')

    num_labels = model.config.num_labels
    loss_fct = nn.BCEWithLogitsLoss()
    # Bring logits and targets to the same (N, num_labels) float layout.
    loss = loss_fct(
        logits.view(-1, num_labels),
        labels.float().view(-1, num_labels),
    )
    return (loss, outputs) if return_outputs else loss

In [None]:
def compute_metrics(eval_pred):
    """Return the ROC AUC of an evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` is indexed
            as a 2-D array and only its first column is used as the score.

    Returns:
        A dict with the single key ``"auc"``.
    """
    raw_scores, gold_labels = eval_pred
    first_column = raw_scores[:, 0]
    auc_value = metrics.roc_auc_score(y_true=gold_labels, y_score=first_column)
    return {"auc": auc_value}

In [None]:
# NOTE(review): `validation_key` is not referenced by any other cell visible
# in this notebook.
validation_key = "validation"
# Build the Trainer: fine-tunes `model` on the tokenized train split and
# evaluates on the tokenized test split, reporting the metrics returned by
# compute_metrics.
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_hf["train"],
    eval_dataset=encoded_hf["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
# NOTE(review): the custom compute_loss defined in this notebook is NOT wired
# in (the `compute_loss=compute_loss` argument was left commented out), so
# training uses the model's own loss. transformers documents custom losses as
# an override of Trainer.compute_loss in a subclass — confirm whether the BCE
# loss implied by the output-directory name is meant to be applied.
)

In [None]:
import torch
torch.cuda.is_available()

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

## Evaluation

In [None]:
# Load the development set; `df` is the frame that the evaluation cells below
# score and split into chunks.
dev_data = 'dev_data.csv'
df = pd.read_csv(dev_data)

In [None]:
# Load the fine-tuned model through the sentence-transformers CrossEncoder wrapper.
# NOTE(review): the path is hard-coded; confirm it matches the output
# directory the training run wrote to (including the sample-size tag).
model_path = "stsb-TinyBERT-L-4-finetuned_auc_40000-top3-BCE"
model2 = CrossEncoder(model_path)

In [None]:
# Score every row of the dev set as a (Query, doc_text) pair.
# NOTE(review): the pair order here is (Query, doc_text), whereas
# preprocess_function tokenizes (doc_text, Query) for training
# (sentence1_key / sentence2_key) — confirm which order the model expects.
scores_base = model2.predict(df[['Query', 'doc_text']].values.tolist(), show_progress_bar=True)
df.head()

In [None]:
# Store the whole-document scores on the dev frame; the chunking cell copies
# this column onto every chunk of the document.
df["score_top3"] = scores_base
df.head()

In [None]:
# Split long document texts into overlapping word windows (overlap = 20).
def get_split480(text1, chunk_size=479, overlap=20):
    """Split `text1` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last `overlap` words of the one before it. Windows are
    produced until the end of the text is reached.

    The previous version derived the number of windows from
    ``len(words) // 479`` and therefore silently dropped the tail of every
    text longer than one window (a 500-word text lost its last 21 words).
    It also re-split the text on every iteration.

    Args:
        text1: text to split.
        chunk_size: maximum number of words per chunk (default 479).
        overlap: number of words shared by consecutive chunks (default 20).

    Returns:
        List of chunk strings with words joined by single spaces. Always
        holds at least one element; empty input yields ``[""]``.

    Raises:
        ValueError: if `chunk_size` is not positive or `overlap` is not in
            ``[0, chunk_size)``.
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError("need chunk_size > 0 and 0 <= overlap < chunk_size")

    words = text1.split()
    stride = chunk_size - overlap

    # The first window is always emitted, even for empty or short texts.
    l_total = [" ".join(words[:chunk_size])]

    # The window before `start` ends at word index `start + overlap`; another
    # window is needed only while that end has not reached the last word.
    start = stride
    while start + overlap < len(words):
        l_total.append(" ".join(words[start:start + chunk_size]))
        start += stride
    return l_total

# Cut every document into its list of overlapping word chunks.
df['text_split1'] = df['doc_text'].apply(get_split480)

# Parallel lists with one entry per chunk; a chunk inherits the metadata of
# the row it was cut from.
docs_l, label_l, index_l = [], [], []
query_l, query_n, doc_n, score_b = [], [], [], []

for row_pos, record in df.iterrows():
    pieces = record['text_split1']
    repeat = len(pieces)
    docs_l.extend(pieces)
    label_l.extend([record['label']] * repeat)
    query_l.extend([record['Query']] * repeat)
    doc_n.extend([record['doc_number']] * repeat)
    query_n.extend([record['Query_number']] * repeat)
    score_b.extend([record['score_top3']] * repeat)
    index_l.extend([row_pos] * repeat)

# Size check of the collected lists (not displayed: it is not the last
# expression of the cell).
len(docs_l), len(label_l), len(index_l)

# Chunk-level frame; note that 'Query_number' is stored as 'query_number'.
df_chunked = pd.DataFrame(
    {
        "doc_text": docs_l,
        'label': label_l,
        'Query': query_l,
        "doc_number": doc_n,
        "query_number": query_n,
        "score_top3": score_b,
    }
)


In [None]:
# Score each chunk separately: one (Query, chunk text) pair per row of df_chunked.
scores_base2 = model2.predict(df_chunked[['Query', 'doc_text']].values.tolist(), show_progress_bar=True)

In [None]:
# Replace the document-level score copied onto each chunk with the score of
# the chunk itself.
df_chunked["score_top3"] = scores_base2
# Preview the frame that was just updated; the previous `df.head()` showed
# the document-level frame, which this cell does not modify.
df_chunked.head()

In [None]:
# Collapse the chunks back to one row per (query, document, label), scoring
# each document by its best-scoring chunk.
# The score column is selected explicitly before aggregating. The previous
# `.max(["score_top3"])` passed the list positionally into GroupBy.max, where
# it landed in the `numeric_only` parameter and only worked because a
# non-empty list is truthy.
grouped = (
    df_chunked
    .groupby(["query_number", "doc_number", "label"], as_index=False)["score_top3"]
    .max()
)
# ROC AUC of the max-pooled chunk scores against the document labels.
metrics.roc_auc_score(y_true=grouped['label'], y_score=grouped["score_top3"])