# Setup

In [None]:
! pip install -U -q datasets
! pip install -q nltk
! pip install -q sentence-transformers
! pip install -q pandas
! pip install -q evaluate
! pip install -U -q huggingface_hub

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub; the upload cell further down
# creates a repo and pushes files, which requires a logged-in session.
notebook_login()

# Fine-tune the cross-encoder (sentence-transformers)

In [15]:
from sentence_transformers import (
    CrossEncoder,
    InputExample,
)
from sentence_transformers.cross_encoder.evaluation import (
    CECorrelationEvaluator,
    CEBinaryClassificationEvaluator,
)
from torch.utils.data import DataLoader
from datasets import load_dataset

In [None]:
# Hub namespace and model name. `checkpoint` is used both as the local output
# directory for training (f"./{checkpoint}") and, combined with `username`,
# as the Hub repo id (f"{username}/{checkpoint}").
username = "legacy107"
checkpoint = "ms-marco-MiniLM-L-6-v2-covidqa-search"
# Hub dataset providing the (question, positive, negative, document_id) rows
# used below to build training and validation pairs.
dataset_nm = "minh21/COVID-QA-Chunk-64-sentence-transformer-biencoder-data-65_25_10-v2"
dataset = load_dataset(dataset_nm)

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'positive', 'negative', 'document_id'],
        num_rows: 6419
    })
    test: Dataset({
        features: ['question', 'positive', 'negative', 'document_id'],
        num_rows: 723
    })
})

In [18]:
# Start from the pretrained MS MARCO cross-encoder. num_labels=1 asks for a
# single output per (question, passage) pair, matching the 0/1 labels built
# in the next cell.
model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", num_labels=1)

In [19]:
def build_pair_examples(split):
    """Turn one dataset split into labelled (question, passage) pairs.

    Every row contributes a negative pair (label 0). The positive pair
    (label 1) is emitted only the first time a given
    (document_id, question) combination is seen, so a question that appears
    in several rows with different negatives does not repeat its positive.

    Args:
        split: iterable of rows exposing the keys "question", "positive",
            "negative" and "document_id" (e.g. dataset["train"]).

    Returns:
        list of InputExample, in row order, positive before negative for the
        first occurrence of each question.
    """
    examples = []
    # A set gives O(1) membership tests; the previous list-based check was
    # quadratic in the number of rows.
    # Assumes document_id and question are hashable (int/str) — TODO confirm.
    seen = set()

    for row in split:
        key = (row["document_id"], row["question"])
        if key not in seen:
            seen.add(key)
            examples.append(
                InputExample(texts=[row["question"], row["positive"]], label=1)
            )

        examples.append(
            InputExample(texts=[row["question"], row["negative"]], label=0)
        )

    return examples


# pairs
train_examples = build_pair_examples(dataset["train"])

# validation set (built from the dataset's "test" split)
validation_examples = build_pair_examples(dataset["test"])

In [20]:
# Training hyperparameters.
batch_size = 8
num_epochs = 8

train_dataloader = DataLoader(train_examples, batch_size=batch_size, shuffle=True)

# Warm-up covers 10% of all training steps (batches per epoch x epochs).
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = int(total_train_steps * 0.1)

In [21]:
# Binary-classification evaluator over the validation pairs (built from the
# dataset's "test" split); it is handed to model.fit and called again after
# training.
evaluator = CEBinaryClassificationEvaluator.from_input_examples(
    validation_examples, name="validation"
)

In [22]:
# Same evaluator type over the training pairs, so the score on seen data can
# be compared with the validation score after training.
train_evaluator = CEBinaryClassificationEvaluator.from_input_examples(
    train_examples, name="train"
)

In [None]:
# Fine-tune the cross-encoder on the training pairs.
# - output_path: local directory (named after `checkpoint`) the model is
#   saved to; the upload cell reads from the same directory name.
# - evaluator / evaluation_steps: the validation evaluator is run
#   periodically during training (every 100 steps per the argument name).
# - save_best_model: presumably keeps the weights with the best evaluator
#   score rather than the last ones — confirm against the library docs.
model.fit(
    output_path=f"./{checkpoint}",
    train_dataloader=train_dataloader,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    evaluation_steps=100,
    save_best_model=True,
)

In [None]:
# model.save_to_hub doesn't work with the latest version of huggingface_hub,
# so the saved model directory is pushed with the HfApi client instead.

import os

from huggingface_hub import HfApi

api = HfApi()

# Create the repo under the same id the upload targets, and tolerate re-running
# this cell after the repo already exists.
repo_id = f"{username}/{checkpoint}"
api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)

root_path = "/kaggle/working/"
# Optional sub-directory of the output directory to push instead of its root.
# checkpoint_number = "/800"
checkpoint_number = ""

# normpath collapses the doubled "/" that joining these pieces produces.
model_dir = os.path.normpath(f"{root_path}/{checkpoint}/{checkpoint_number}")
if not os.path.isdir(model_dir):
    raise FileNotFoundError(f"Model directory not found: {model_dir}")

# Upload whatever was saved in the directory in a single commit. The previous
# hard-coded file list contained "modules.json" twice and named
# SentenceTransformer artifacts (modules.json, 1_Pooling, ...) that a
# CrossEncoder save presumably does not write, so a missing file would abort
# the upload part-way through.
api.upload_folder(
    folder_path=model_dir,
    repo_id=repo_id,
    repo_type="model",
)

In [23]:
# Score the current `model` on the training pairs; output_path is where the
# evaluator may write its results file.
# NOTE(review): this cell's execution count (23) directly follows the
# evaluator cells, while the fit cell above shows no execution count, so the
# recorded score may belong to the model before fine-tuning — confirm.
train_evaluator(model, output_path="/kaggle/working/")

0.40422330625107056

In [None]:
pairs
0.0791282673254723
0.622238080757801

In [24]:
# Score the current `model` on the validation pairs; output_path is where the
# evaluator may write its results file.
# NOTE(review): execution count 24 follows the evaluator cells directly and
# the fit cell shows no execution count, so the recorded score may belong to
# the model before fine-tuning — confirm.
evaluator(model, output_path="/kaggle/working/")

0.2277415887441923

In [None]:
pairs
0.1316570109113781
0.180733790689505

# Evaluate

In [4]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util, CrossEncoder
import torch

In [None]:
# Held-out retrieval test set. It is published with a single split named
# "train", hence split="train" even though it is used for testing here.
covid_qa_ds_nm = "minh21/COVID-QA-Chunk-64-testset-biencoder-data-65_25_10-v2"
test_dataset = load_dataset(covid_qa_ds_nm, split="train")

In [6]:
test_dataset

Dataset({
    features: ['question', 'answer', 'context_chunks', 'document_id', 'id', 'context'],
    num_rows: 203
})

In [None]:
# Model to evaluate: the pretrained baseline by default; switch to the
# commented line to evaluate the fine-tuned checkpoint pushed to the Hub.
checkpoint_nm = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# checkpoint_nm = f"{username}/{checkpoint}"
# NOTE(review): top_k is not referenced by any visible cell (no bi-encoder
# stage is run here) — confirm whether it is still needed.
top_k = 7  # Number of passages we want to retrieve with the bi-encoder
# Token budget for the concatenated context built in retrieve_context.
max_tokens = 480
cross_encoder = CrossEncoder(checkpoint_nm)

In [12]:
from transformers import AutoTokenizer
import torch

# Tokenizer of google/flan-t5-large; in the visible cells it is only used by
# retrieve_context to count tokens against max_tokens.
# NOTE(review): device_map="auto" is a model-loading option and presumably
# has no effect on a tokenizer — confirm and drop.
model_nm = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_nm, device_map="auto")


def retrieve_context(row):
    """Re-rank a row's chunks with the cross-encoder and pack the best ones.

    All of the row's "context_chunks" are scored against its "question".
    The highest-scoring chunk is always kept; further chunks are appended in
    descending score order, separated by a single space, until appending the
    next one would bring the tokenized context to `max_tokens` tokens or
    more. Packing stops at the first chunk that does not fit (lower-ranked,
    shorter chunks are not tried).

    Reads the module-level `cross_encoder`, `tokenizer` and `max_tokens`.

    Args:
        row: mapping with "question" (str) and "context_chunks"
            (sequence of str).

    Returns:
        The same `row`, with "retrieved_context" set to the packed string
        (an empty string when the row has no chunks).
    """
    query = row["question"]
    chunks = row["context_chunks"]

    # Nothing to rank: skip the model call instead of failing on an empty
    # ranking below.
    if len(chunks) == 0:
        row["retrieved_context"] = ""
        return row

    scores = cross_encoder.predict(
        [[query, chunk] for chunk in chunks], show_progress_bar=False
    )

    # Chunk indices from best to worst score; sorted() is stable, so ties
    # keep their original chunk order.
    ranked = sorted(range(len(chunks)), key=lambda i: scores[i], reverse=True)

    context = chunks[ranked[0]]
    for chunk_idx in ranked[1:]:
        candidate = context + " " + chunks[chunk_idx]
        # The budget is measured on the joined text, not per chunk.
        if len(tokenizer(candidate)["input_ids"]) >= max_tokens:
            break
        context = candidate

    row["retrieved_context"] = context
    return row

In [None]:
# Add a "retrieved_context" column to every test row (one row per call).
# Note: this rebinds test_dataset, so re-running the cell re-processes the
# already-augmented dataset.
test_dataset = test_dataset.map(retrieve_context, batched=False)

In [14]:
def find_match(row):
    """Return True when the row's answer string occurs verbatim in its retrieved context."""
    return row["answer"] in row["retrieved_context"]


# Count hits by iterating the rows directly. Dataset.map was previously used
# only for its side effect on a global counter, which goes wrong if map is
# served from its cache (the function is never called) or if the cell is
# re-run without resetting the counter.
match = sum(find_match(row) for row in test_dataset)

print("%.2f" % (match / test_dataset.num_rows * 100))

Map:   0%|          | 0/203 [00:00<?, ? examples/s]

91.63


Test set, cross-encoder only (% of questions whose answer string appears in the retrieved context)
* pretrained: 91.63

Test set, bi-encoder only
* pretrained: 79.80