#### Embedding evaluation + Fine-tuning
In this file we:
* Evaluate selected embedding models on our synthetic dataset
* Train in-house embedding models
* Evaluate trained models

Each evaluation run appends one row (model, recall@1, recall@3, recall@5, mrr) to `-embedding-evaluation.csv`.

In [6]:
# Each pair is (Hugging Face model ID, local directory for the fine-tuned copy).
model_1, output_dir_1 = (
    "google-bert/bert-base-multilingual-cased",
    "./models/RAG-bert-base-multilingual-cased",
)

model_2, output_dir_2 = (
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "./models/RAG-paraphrase-multilingual-MiniLM-L12-v2",
)

model_3, output_dir_3 = (
    "intfloat/multilingual-e5-small",
    "./models/RAG-multilingual-e5-small",
)

model_4, output_dir_4 = (
    "BAAI/bge-m3",
    "./models/RAG-bge-m3",
)


In [4]:
from datasets import load_dataset, concatenate_datasets
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers.util import cos_sim
import csv


# Load both JSON files. `load_dataset("json", ...)` exposes a single file under
# the "train" split name, which is why the test file is also read with split="train".
test_dataset = load_dataset("json", data_files="../2_datasets/RAG_test_dataset.json", split="train")
train_dataset = load_dataset("json", data_files="../2_datasets/RAG_train_dataset.json", split="train")

# The retrieval corpus contains the documents of BOTH files, so every test query
# has to find its document among the training documents as well.
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

# Convert the datasets to the dictionaries the evaluator expects.
# NOTE(review): this assumes "id" values are unique across the train and test
# files; a shared id would silently overwrite a corpus entry -- confirm.
corpus = dict(zip(corpus_dataset["id"], corpus_dataset["positive"]))  # cid => document
queries = dict(zip(test_dataset["id"], test_dataset["anchor"]))  # qid => question

# Each query has exactly one relevant document: the one sharing its id.
# The values are sets (qid => set of relevant cids), as the evaluator's
# `relevant_docs` argument is documented.
relevant_docs = {q_id: {q_id} for q_id in queries}

evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    show_progress_bar=True,
    score_functions={"cosine": cos_sim},
)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
def writeToCSV(model_name, results):
    """Append one model's retrieval metrics to '-embedding-evaluation.csv'.

    The file is opened in append mode in the current working directory. If it
    is empty (or did not exist yet), a header row is written first so the
    columns are self-describing; later calls only add data rows.

    Args:
        model_name: Label stored in the 'model' column.
        results: Mapping that must contain the keys 'cosine_recall@1',
            'cosine_recall@3', 'cosine_recall@5' and 'cosine_mrr@10'.

    Raises:
        KeyError: If one of the required metric keys is missing from `results`.
    """
    fields = ['model', 'recall@1', 'recall@3', 'recall@5', 'mrr']
    row = {
        'model': model_name,
        'recall@1': results['cosine_recall@1'],
        'recall@3': results['cosine_recall@3'],
        'recall@5': results['cosine_recall@5'],
        'mrr': results['cosine_mrr@10']
    }

    with open('-embedding-evaluation.csv', 'a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fields)
        # Move to the end explicitly (whence=2) so tell() reports the current
        # size; a position of 0 means nothing has been written to the file yet.
        file.seek(0, 2)
        if file.tell() == 0:
            writer.writeheader()
        writer.writerow(row)

def evaluate(model_id):
    """Score one embedding model with the shared evaluator and log the result.

    `model_id` is passed to SentenceTransformer as-is. The CSV label is the
    text after the last '/' in `model_id`.
    """
    metrics = evaluator(SentenceTransformer(model_id))
    short_name = model_id.rsplit('/', 1)[-1]
    writeToCSV(short_name, metrics)

In [6]:
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers import SentenceTransformerTrainer

def train__Embeddings(model_id, output_dir, train_file="../2_datasets/RAG_train_dataset.json"):
    """Fine-tune an embedding model on (anchor, positive) pairs and save it.

    Args:
        model_id: Model identifier or path passed to SentenceTransformer.
        output_dir: Directory handed to the trainer as its output directory;
            the final model is saved there.
        train_file: JSON file with the training pairs. It must provide the
            columns "anchor" and "positive". The default is the same training
            file the evaluation cell of this notebook loads.
            NOTE(review): the previous hard-coded value was
            "./dataset/RAG_train_dataset.json" -- pass that explicitly if the
            data still lives there.

    The module-level `evaluator` is run after every epoch
    (eval_strategy="epoch").
    """
    model = SentenceTransformer(model_id)

    train_loss = MultipleNegativesRankingLoss(model)

    # Load the training pairs under a local name so the notebook-level
    # `train_dataset` is not shadowed.
    pairs = load_dataset("json", data_files=train_file, split="train")

    # define training arguments
    args = SentenceTransformerTrainingArguments(
        output_dir=output_dir,                      # where checkpoints and the final model go
        num_train_epochs=3,                         # number of epochs
        per_device_train_batch_size=16,             # train batch size
        per_device_eval_batch_size=16,              # evaluation batch size
        learning_rate=2e-5,                         # learning rate
        eval_strategy="epoch",                      # evaluate after each epoch
        logging_steps=10,                           # log every 10 steps
    )

    trainer = SentenceTransformerTrainer(
        model=model,
        args=args,
        # Column order matters: the loss consumes the columns positionally as
        # (anchor, positive), so the query column has to come first.
        train_dataset=pairs.select_columns(["anchor", "positive"]),
        loss=train_loss,
        evaluator=evaluator,
    )

    # Start training. Nothing is pushed to the Hugging Face Hub because
    # push_to_hub is not enabled in the arguments above.
    trainer.train()

    # Save the model in its final state to output_dir. This is not necessarily
    # the best checkpoint, since load_best_model_at_end is not enabled.
    trainer.save_model()

In [None]:
# Only execute this cell if you want to train models!!
# Fine-tunes every configured base model, one after another.
for base_model, target_dir in (
    (model_1, output_dir_1),
    (model_2, output_dir_2),
    (model_3, output_dir_3),
    (model_4, output_dir_4),
):
    train__Embeddings(base_model, target_dir)

In [9]:
# Evaluate each base model followed by its fine-tuned counterpart, in order.
for model_ref in (
    model_1,       # google-bert/bert-base-multilingual-cased
    output_dir_1,  # RAG-bert-base-multilingual-cased
    model_2,       # sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
    output_dir_2,  # RAG-paraphrase-multilingual-MiniLM-L12-v2
    model_3,       # intfloat/multilingual-e5-small
    output_dir_3,  # RAG-multilingual-e5-small
    model_4,       # BAAI/bge-m3
    # output_dir_4,  # RAG-bge-m3 -- evaluation of this model is currently disabled
):
    evaluate(model_ref)

KeyboardInterrupt: 