# Setup

In [None]:
! pip install -U -q datasets
! pip install -q nltk
! pip install -q sentence-transformers
! pip install -q pandas
! pip install -q evaluate
! pip install -U -q huggingface_hub

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [3]:
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd

# Finetune sentence transformer

In [None]:
from sentence_transformers import (
    SentenceTransformer,
    InputExample,
    losses,
    evaluation,
)
from torch.utils.data import DataLoader
from datasets import load_dataset

In [7]:
username = "legacy107"
checkpoint = "multi-qa-mpnet-base-dot-v1-covidqa-search"
dataset_nm = "minh21/COVID-QA-sentence-transformer-data"
dataset = load_dataset(dataset_nm)

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'positive', 'negative', 'document_id'],
        num_rows: 2378
    })
    test: Dataset({
        features: ['question', 'positive', 'negative', 'document_id'],
        num_rows: 269
    })
})

In [46]:
model = SentenceTransformer("sentence-transformers/multi-qa-mpnet-base-dot-v1")
# model = SentenceTransformer(f"{username}/{checkpoint}")
# model = SentenceTransformer("/kaggle/working/multi-qa-mpnet-base-dot-v1-covidqa-search3/120")

In [27]:
# triplets
train_examples = []
n_examples = dataset["train"].num_rows

for i in range(n_examples):
    example = dataset["train"][i]
    train_examples.append(
        InputExample(
            texts=[example["question"], example["positive"], example["negative"]]
        )
    )

In [13]:
# pairs
train_examples = []
n_examples = dataset["train"].num_rows
train_eval = {
    "question": [],
    "context": [],
    "score": [],
}

for i in range(n_examples):
    example = dataset["train"][i]
    if example["question"] not in train_eval["question"]:
        train_eval["question"].append(example["question"])
        train_eval["context"].append(example["positive"])
        train_eval["score"].append(1.0)
        train_examples.append(
            InputExample(texts=[example["question"], example["positive"]], label=1.0)
        )

    train_examples.append(
        InputExample(texts=[example["question"], example["positive"]], label=0.0)
    )
    train_eval["question"].append(example["question"])
    train_eval["context"].append(example["negative"])
    train_eval["score"].append(0.0)

# validation set
n_examples = dataset["test"].num_rows
val_eval = {
    "question": [],
    "context": [],
    "score": [],
}

for i in range(n_examples):
    example = dataset["test"][i]
    if example["question"] not in val_eval["question"]:
        val_eval["question"].append(example["question"])
        val_eval["context"].append(example["positive"])
        val_eval["score"].append(1.0)

    val_eval["question"].append(example["question"])
    val_eval["context"].append(example["negative"])
    val_eval["score"].append(0.0)

In [40]:
batch_size = 8
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
train_loss = losses.TripletLoss(model=model)
num_epochs = 2
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data

In [31]:
train_loss = losses.CosineSimilarityLoss(model)

In [33]:
# triplets
evaluator = evaluation.TripletEvaluator(
    anchors=dataset["test"]["question"],
    positives=dataset["test"]["positive"],
    negatives=dataset["test"]["negative"],
    show_progress_bar=True,
    batch_size=batch_size,
)

In [17]:
# pairs
evaluator = evaluation.EmbeddingSimilarityEvaluator(
    val_eval["question"],
    val_eval["context"],
    val_eval["score"],
    show_progress_bar=True,
    batch_size=batch_size,
)

In [None]:
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    output_path=f"./{checkpoint}",
    checkpoint_path=f"./{checkpoint}",
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    evaluation_steps=20,
    save_best_model=True,
    checkpoint_save_steps=40,
)

In [None]:
# model.save_to_hub doesn't work with the latest version of huggingface_hub

from huggingface_hub import HfApi

api = HfApi()
api.create_repo(repo_id=checkpoint)
root_path = "/kaggle/working/"
# checkpoint_number = "/800"
checkpoint_number = ""

files_to_push_to_hub = [
    "README.md",
    "modules.json",
    "config.json",
    "config_sentence_transformers.json",
    "modules.json",
    "pytorch_model.bin",
    "sentence_bert_config.json",
    "special_tokens_map.json",
    "tokenizer.json",
    "tokenizer_config.json",
    "vocab.txt",
]

for filename in files_to_push_to_hub:
    api.upload_file(
        path_or_fileobj=f"{root_path}/{checkpoint}{checkpoint_number}/{filename}",
        repo_id=f"{username}/{checkpoint}",
        path_in_repo=filename,
        repo_type="model",
    )

api.upload_folder(
    folder_path=f"{root_path}/{checkpoint}/{checkpoint_number}/1_Pooling",
    path_in_repo="1_Pooling",
    repo_id=f"{username}/{checkpoint}",
    repo_type="model",
)

In [43]:
train_evaluator = evaluation.TripletEvaluator(
    anchors=dataset["train"]["question"],
    positives=dataset["train"]["positive"],
    negatives=dataset["train"]["negative"],
    show_progress_bar=True,
    batch_size=batch_size,
)

In [19]:
# pairs
train_evaluator = evaluation.EmbeddingSimilarityEvaluator(
    train_eval["question"],
    train_eval["context"],
    train_eval["score"],
    show_progress_bar=True,
    batch_size=batch_size,
)

In [None]:
train_evaluator(model, output_path="/kaggle/working/")

pairs
* 0.0791282673254723
* 0.622238080757801

triplet;
* 0.563919259882254
* 0.977291841883936

In [None]:
evaluator(model, output_path="/kaggle/working/")

pairs
* 0.1316570109113781
* 0.180733790689505

triplets
* 0.5576208178438662
* 0.724907063197026

# Evaluate

In [35]:
test_dataset = load_dataset(
    "minh21/COVID-QA-validation-sentence-transformer", split="train"
)

In [None]:
def filter_fn(row):
    for x in dataset["test"]:
        if x["question"] == row["question"] and x["document_id"] == row["document_id"]:
            return True
    return False


test_dataset = test_dataset.filter(filter_fn)

In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util, CrossEncoder
import torch

In [4]:
# covid_qa_ds_nm = "minh21/COVID-QA-validation-sentence-transformer"
covid_qa_ds_nm = "minh21/COVID-QA-testset-data"
test_dataset = load_dataset(covid_qa_ds_nm, split="train")

In [11]:
test_dataset

Dataset({
    features: ['question', 'answer', 'context_chunks', 'document_id', 'id', 'context', 'retrieved_context'],
    num_rows: 201
})

In [35]:
# checkpoint_nm = 'sentence-transformers/multi-qa-mpnet-base-dot-v1'
checkpoint_nm = f"{username}/{checkpoint}"
# checkpoint_nm = "BAAI/bge-large-en-v1.5"
bi_encoder = SentenceTransformer(checkpoint_nm)
bi_encoder.max_seq_length = 256  # Truncate long passages to 256 tokens
top_k = 30  # Number of passages we want to retrieve with the bi-encoder

# cross_encoder_nm = "BAAI/bge-reranker-large"
# cross_encoder_nm = "BAAI/bge-large-en-v1.5"
cross_encoder_nm = "cross-encoder/ms-marco-MiniLM-L-6-v2"
cross_encoder = CrossEncoder(cross_encoder_nm)

In [36]:
embedding_cache = {}


def retrieve_context(row):
    global embedding_cache

    if row["document_id"] in embedding_cache.keys():
        corpus_embeddings = embedding_cache[row["document_id"]]
    else:
        corpus_embeddings = bi_encoder.encode(
            row["context_chunks"], convert_to_tensor=True, show_progress_bar=False
        )
        embedding_cache[row["document_id"]] = corpus_embeddings

    query = row["question"]

    question_embedding = bi_encoder.encode(
        query, convert_to_tensor=True, show_progress_bar=False
    )
    question_embedding = question_embedding.cuda()
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]
    #     hits = sorted(hits, key=lambda x: x["score"], reverse=True)

    #     row["retrieved_context"] = " ".join(
    #         [row["context_chunks"][hit["corpus_id"]] for hit in hits[0:]]
    #     )
    #     .replace("\n", " ")
    #     return row

    cross_inp = [[query, row["context_chunks"][hit["corpus_id"]]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp, show_progress_bar=False)

    for idx in range(len(cross_scores)):
        hits[idx]["cross-score"] = cross_scores[idx]

    hits = sorted(hits, key=lambda x: x["cross-score"], reverse=True)

    row["retrieved_context"] = " ".join(
        [row["context_chunks"][hit["corpus_id"]] for hit in hits[0:3]]
    )
    #     .replace("\n", " ")

    return row

In [None]:
test_dataset = test_dataset.map(retrieve_context, batched=False)

In [None]:
match = 0


def find_match(row):
    global match
    match = match + (1 if row["answer"] in row["retrieved_context"] else 0)


test_dataset.map(find_match, batched=False)

print("%.2f" % (match / test_dataset.num_rows * 100))

Train set
* pretrained: 73.96
* finetuned: 85.78

Validation set
* pretrained: 74.51
* finetuned: 45.10

Test set
* pretrained model: 64.18
* finetuned model: 45.77

Test set top 10
* pretrained model: 89.55
* finetuned model: 73.13

Test set top 20
* pretrained: 97.01
* pretrained + rerank: 89.05

Test set reranker only
* pretrained: 91.04

### BAAI/bge

Test set top 3
* pretrained: 68.66
* pretrained + rerank: 78.61

Test set top 10
* pretrained: 84.58

Test set top 15
* pretrained: 88.56
* pretrained + rerank: 78.11

Test set reranker only
* pretrained: 75.12