# Finetune embeddings using LLM-generated data

In [1]:
import os
import pandas as pd
from langchain.document_loaders import PyPDFLoader
from langchain_ollama.chat_models import ChatOllama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from llm_eval.resources import prompts as pr
from llm_eval.resources.rag import generate_qa_couples, eval_qa_couples, answer_with_rag
from llm_eval.config import global_config as glob
from llm_eval.config.config import model_list
# from importlib import reload
from pprint import PrettyPrinter

# Never truncate long text columns (questions, contexts) when displaying DataFrames
pd.options.display.max_colwidth = None

In [3]:
# Look up the configured chat models per provider and show them
chat_models = model_list["chat_model"]
google_chat = chat_models.get("google")
ollama_chat = chat_models.get("ollama")

printer = PrettyPrinter(width=10)
for provider_models in (google_chat, ollama_chat):
    printer.pprint(provider_models)

{'Gemini 1.5 Flash': 'gemini-1.5-flash',
 'Gemini 1.5 Pro': 'gemini-1.5-pro',
 'Gemini 2.0 Flash': 'gemini-2.0-flash-001',
 'Gemini 2.0 Flash Thinking': 'gemini-2.0-flash-thinking-exp-01-21',
 'Gemini 2.0 Flash-Lite': 'gemini-2.0-flash-lite-preview-02-05',
 'Gemini 2.0 Pro': 'gemini-2.0-pro-exp-02-05'}
{"Meta's Llama 3.2 3B": 'llama3.2',
 'Mistral 7B': 'mistral:7b'}


In [3]:
# Source document: loaded from the project's data directory
filename = "20240731_Nemetschek SE_Mitarbeiterhandbuch_Employee Handbook.pdf"
pdf_path = f"{glob.DATA_PKG_DIR}/{filename}"

loader = PyPDFLoader(pdf_path)
raw_data = loader.load()

In [4]:
# Chunk the loaded documents into overlapping windows; these chunks are the
# contexts that QA couples are generated from later on
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=300,
)
docs_processed = text_splitter.split_documents(raw_data)

## Generate an LLM-annotated validation set:

In [5]:
# Build the QA generation prompt; its only placeholder is the document context
prompt = PromptTemplate(
    input_variables=["context"],
    template=pr.QA_generation_prompt,
)
print(prompt.template)


Your task is to write a factoid question and an answer based on the given context.
The factoid question should be answerable with a specific, concise piece of factual information from the context.
Formulate the factoid question in the same style as questions users might ask in a search engine.
Do NOT include phrases like "according to the passage" or "context" in your question.

Provide your output in the following format:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question; Maximum 300 characters long)

Here is the context:

Context: {context}

Output:::



In [None]:
# The generator is the first configured Ollama chat model
generator_model_name = list(ollama_chat.values())[0]
llm = ChatOllama(model=generator_model_name, temperature=0.5)

n_generations = 50

# Generate question/answer pairs from the document chunks
outputs = generate_qa_couples(
    docs_processed,
    prompt,
    llm,
    n_generations,
    with_replacement=True,
)

Generating 50 QA couples...
Sampling with replacement


  0%|          | 0/50 [00:00<?, ?it/s]

In [7]:
# Question-focused critique prompts handed to the judge model:
# one rates groundedness (question + context), one rates relevancy (question only)
prompt_groundedness = PromptTemplate(
    input_variables=["question", "context"],
    template=pr.question_groundedness_critique_prompt,
)
prompt_relevancy = PromptTemplate(
    input_variables=["question"],
    template=pr.question_relevance_critique_prompt,
)

for title, critique_prompt in (
    ("Question groundedness prompt:", prompt_groundedness),
    ("Question relevancy prompt:", prompt_relevancy),
):
    print(title)
    print(critique_prompt.template)

Question groundedness prompt:

You will be given a context and a question.
Your task is to provide a 'total rating' or scoring how well one can answer the given question based on the given context.
Rate your answer on an integer scale of 1 to 5, based on the following score rubric:

### Score Rubric:
Score 1: The question is not answerable at all given the context. The context does not provide any relevant information to answer the question.
Score 2: The question is barely answerable given the context. The context provides minimal relevant information, but the answer would be mostly incorrect, inaccurate, or not factual.
Score 3: The question is somewhat answerable given the context. The context provides some relevant information, but the answer would be partially correct, accurate, and factual.
Score 4: The question is mostly answerable given the context. The context provides sufficient relevant information, and the answer would be mostly correct, accurate, and factual.
Score 5: The q

### Use an LLM judge to evaluate (i.e. label) the question-answer pairs based on the given document context:

In [None]:
from langchain_google_vertexai import ChatVertexAI  # imported, but not used in this cell

# The judge is the second configured Ollama chat model, run at a low temperature
judge_model_name = list(ollama_chat.values())[1]
eval_llm = ChatOllama(model=judge_model_name, temperature=0.1)

# Rate every generated QA couple; the rated result replaces `outputs`
outputs = eval_qa_couples(
    outputs,
    prompt_groundedness,
    prompt_relevancy,
    eval_llm,
)

Rating each QA couple according to groundedness and question relevancy...


  0%|          | 0/50 [00:00<?, ?it/s]

In [9]:
# Collect the rated QA couples into a DataFrame for inspection
generated_questions = pd.DataFrame.from_dict(outputs, orient="columns")

In [10]:
# Distribution of the judge's ratings, including ratings that are missing (NaN)
for score_column in ("groundedness_score", "question_relevancy_score"):
    print(generated_questions[score_column].value_counts(dropna=False))

groundedness_score
5.0    18
3.0    11
2.0    11
4.0     6
1.0     2
NaN     2
Name: count, dtype: int64
question_relevancy_score
5.0    18
3.0    12
4.0    12
NaN     6
2.0     2
Name: count, dtype: int64


In [11]:
# Rebuild the frame from `outputs` so that re-running this cell starts from the
# same input. Missing (NaN) ratings are replaced with the lowest score (1)
# instead of dropping the affected rows.
generated_questions = pd.DataFrame.from_dict(outputs, orient="columns").fillna(
    {"groundedness_score": 1, 'question_relevancy_score': 1}
)

for score_column in ("groundedness_score", "question_relevancy_score"):
    print(generated_questions[score_column].value_counts(dropna=False))

# Persist the labelled pairs; the path is built once and reused for the message
export_columns = ['question', "answer", 'context', "groundedness_score"]
train_xlsx_path = f"{glob.DATA_PKG_DIR}/sentence_transformers_train.xlsx"

generated_questions[export_columns].to_excel(train_xlsx_path, index=False)
print(f"Generated questions saved to {train_xlsx_path}")

groundedness_score
5.0    18
3.0    11
2.0    11
4.0     6
1.0     4
Name: count, dtype: int64
question_relevancy_score
5.0    18
3.0    12
4.0    12
1.0     6
2.0     2
Name: count, dtype: int64
Generated questions saved to /Users/avosseler/Github/Privat/llm-evaluation/data/sentence_transformers_train.xlsx


## Finetune embedding model based on your own corpus

Assume you have a RAG application that uses sentence-transformer embeddings (i.e. it can use any suitable model from the Hugging Face Hub).

### Partition the generated 'labeled' data set into train/test for model finetuning:

In [12]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed. The score column is passed as the target array,
# but only the two feature partitions are kept.
X_train, X_test, *_unused_targets = train_test_split(
    generated_questions,
    generated_questions["groundedness_score"],
    test_size=0.2,
    random_state=42,
)

print(X_train.shape, X_test.shape)

(40, 9) (10, 9)


Dichotomize the sentence pairs into good and bad matches (since the labels lack discriminatory power and variance):

In [13]:
def _split_good_bad(frame, min_score=4):
    """Partition ``frame`` into (good, bad) question/context pairs.

    A row is "good" when both ``groundedness_score`` and
    ``question_relevancy_score`` are at least ``min_score``. Every other row is
    "bad". Because "bad" is the exact complement of "good", a row whose score is
    NaN is assigned to "bad" rather than being dropped from both subsets
    (comparisons against NaN evaluate to False).

    Returns:
        tuple: ``(good_rows, bad_rows)``, two DataFrames that together contain
        every row of ``frame`` exactly once.
    """
    is_good = (frame["groundedness_score"] >= min_score) & (
        frame["question_relevancy_score"] >= min_score
    )
    return frame[is_good], frame[~is_good]


X_train_good, X_train_bad = _split_good_bad(X_train)
X_test_good, X_test_bad = _split_good_bad(X_test)

# Sanity check: the dichotomy must not lose any pair
assert len(X_train_good) + len(X_train_bad) == len(X_train)
assert len(X_test_good) + len(X_test_bad) == len(X_test)

# Parallel question/context lists used to build the sentence pairs
questions_train_good = X_train_good["question"].to_list()
context_train_good = X_train_good["context"].to_list()

questions_train_bad = X_train_bad["question"].to_list()
context_train_bad = X_train_bad["context"].to_list()

questions_test_good = X_test_good["question"].to_list()
context_test_good = X_test_good["context"].to_list()

questions_test_bad = X_test_bad["question"].to_list()
context_test_bad = X_test_bad["context"].to_list()

### Build your train/test data out of your custom dataset:

The contrastive loss used for fine-tuning below expects as input two texts and a label of either 0 or 1 (negative/positive pair). If the label is 1, the distance between the two embeddings is reduced; if the label is 0, the distance between the embeddings is increased.

In [None]:
from tqdm.auto import tqdm
from sentence_transformers import InputExample

# Training pairs: good matches are positives (label 1), bad matches negatives (label 0)
train_examples = []
for pair_questions, pair_contexts, pair_label in (
    (questions_train_good, context_train_good, 1),
    (questions_train_bad, context_train_bad, 0),
):
    for question, context in tqdm(zip(pair_questions, pair_contexts), total=len(pair_contexts)):
        train_examples.append(InputExample(texts=[question, context], label=pair_label))

print(len(train_examples))

# Test pairs are labelled the same way
test_examples = []
for pair_questions, pair_contexts, pair_label in (
    (questions_test_good, context_test_good, 1),
    (questions_test_bad, context_test_bad, 0),
):
    for question, context in tqdm(zip(pair_questions, pair_contexts), total=len(pair_contexts)):
        test_examples.append(InputExample(texts=[question, context], label=pair_label))

print(len(test_examples))

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

40


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

10


In [15]:
# Fine-tune a pretrained sentence-transformer on the labelled question/context
# pairs with a contrastive loss and write the result to
# <DATA_PKG_DIR>/fine-tuned-model.
import warnings
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, SentencesDataset
from sentence_transformers.losses import CosineSimilarityLoss, MultipleNegativesRankingLoss, ContrastiveLoss
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# Suppresses every Python warning from this point on, not only those raised by training
warnings.filterwarnings("ignore")

# Load a model to train/finetune
#model_name = "xlm-roberta-base"
model_name = "paraphrase-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

# NOTE(review): `train_dataset` is created but never used; the DataLoader below
# is built from `train_examples` directly.
train_dataset = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=10)

# NOTE(review): `test_dataset` and `test_dataloader` are not used in this cell,
# since the evaluator below is commented out.
test_dataset = SentencesDataset(test_examples, model)
test_dataloader = DataLoader(test_examples, shuffle=True, batch_size=10)

# # Define the evaluator
# evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_examples, name='my_evaluator')

# # Given queries, a corpus and a mapping with relevant documents, the InformationRetrievalEvaluator computes different IR metrics.
# evaluator = InformationRetrievalEvaluator(
#     queries=questions_test,
#     corpus=corpus,
#     relevant_docs=context_test,
#     name="BeIR-touche2020-subset-test",
# )

# Initialize loss function
# The 0/1 labels on the InputExamples are consumed by the contrastive loss.
#loss = CosineSimilarityLoss(model=model)
# loss = MultipleNegativesRankingLoss(model=model)
loss = ContrastiveLoss(model=model)

# Fine-tune the model
# The recorded run shows 400 steps in total (100 epochs over 40 training
# examples in batches of 10); no evaluator is attached.
model.fit(
    train_objectives=[(train_dataloader, loss)],
    epochs=100,
    warmup_steps=100,
    # evaluator=evaluator, 
    # evaluation_steps=500,
    output_path=os.path.join(glob.DATA_PKG_DIR,"fine-tuned-model")
)

# # Save the model
# model.save("tuned_models/fine_tuned_model")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  0%|          | 0/400 [00:00<?, ?it/s]

{'train_runtime': 28.9172, 'train_samples_per_second': 138.326, 'train_steps_per_second': 13.833, 'train_loss': 0.010766148567199707, 'epoch': 100.0}


### Test your fine-tuned embeddings for semantic search queries:

In [16]:
from sentence_transformers import SentenceTransformer, util

# Load the untouched base model and the fine-tuned model saved by the training step
fine_tuned_model_path = os.path.join(glob.DATA_PKG_DIR, "fine-tuned-model")

original_model = SentenceTransformer(model_name)
fine_tuned_model = SentenceTransformer(fine_tuned_model_path)

In [17]:
# Probe queries: embed each one with both models and report how similar the
# original and the fine-tuned embedding are
sentences = [
    "What are the weekly working hours?",
    "What is the dress code?",
    "What is the vacation policy?",
    "What is the sick leave policy?",
    "What is the parental leave policy?",
    "What is the notice period?",
    "What is the probation period?",
]

for sentence in sentences:
    original_embedding, fine_tuned_embedding = (
        encoder.encode(sentence) for encoder in (original_model, fine_tuned_model)
    )
    cosine_similarity = util.pytorch_cos_sim(original_embedding, fine_tuned_embedding)
    print(f"The original and fine-tuned embeddings for '{sentence}' have a similarity of {cosine_similarity}")


The original and fine-tuned embeddings for 'What are the weekly working hours?' have a similarity of tensor([[0.8558]])
The original and fine-tuned embeddings for 'What is the dress code?' have a similarity of tensor([[0.8725]])
The original and fine-tuned embeddings for 'What is the vacation policy?' have a similarity of tensor([[0.9005]])
The original and fine-tuned embeddings for 'What is the sick leave policy?' have a similarity of tensor([[0.8670]])
The original and fine-tuned embeddings for 'What is the parental leave policy?' have a similarity of tensor([[0.8277]])
The original and fine-tuned embeddings for 'What is the notice period?' have a similarity of tensor([[0.8509]])
The original and fine-tuned embeddings for 'What is the probation period?' have a similarity of tensor([[0.8947]])
