In [36]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from rouge import Rouge
from sentence_transformers import SentenceTransformer

In [2]:
df = pd.read_csv("data/results-gpt4o-mini.csv")
# using the first 300 documents
df = df.iloc[:300]

In [3]:
df.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


## Q1: What's the first value of the resulting vector

In [6]:
# get the embedding model
model_name = "multi-qa-mpnet-base-dot-v1"
embedding_model = SentenceTransformer(model_name)

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [12]:
# Create the embeddings for the first LLM answer
answer_llm = df.iloc[0].answer_llm
vector = embedding_model.encode(answer_llm)
vector[0]

-0.42244655

## Q2: What's the 75% percentile of the score

In [26]:
evaluations = []

for idx,row in tqdm(df.iterrows()):
    llm_vector = embedding_model.encode(row["answer_llm"])
    orig_vector = embedding_model.encode(row['answer_orig'])
    evaluations.append(llm_vector.dot(orig_vector))

300it [02:19,  2.14it/s]


In [27]:
df["evaluations"]= evaluations
df["evaluations"].describe()

count    300.000000
mean      27.495996
std        6.384742
min        4.547924
25%       24.307844
50%       28.336870
75%       31.674309
max       39.476013
Name: evaluations, dtype: float64

## Q3: Normalize the embeddings, What's the 75% cosine in the scores?

In [30]:
def normalize(vector):
    norm = np.sqrt((vector*vector).sum())
    return vector /norm

In [32]:
# turn into dict to see if it is faster than df 
data_dict=df.to_dict(orient='records')

In [33]:
evaluations_norm = []

for record in tqdm(data_dict):
    llm_vector = embedding_model.encode(record["answer_llm"])
    orig_vector = embedding_model.encode(record['answer_orig'])
    llm_norm=normalize(llm_vector)
    orig_norm=normalize(orig_vector)
    evaluations_norm.append(llm_norm.dot(orig_norm))

100%|███████████████████████████████████████████████████████| 300/300 [02:20<00:00,  2.14it/s]


In [34]:
df["evaluations_norm"]= evaluations_norm
df["evaluations_norm"].describe()

count    300.000000
mean       0.728393
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: evaluations_norm, dtype: float64

## Q4. What's the F score for rouge-1

ROUGE:
- This is a set of metrics that compares two answers based on the overlap of n-grams, word sequences, and word pairs.
- It can give a more nuanced view of text similarity than just cosine similarity alone.
- There are three scores: rouge-1, rouge-2 and rouge-l, and precision, recall and F1 score for each.
- rouge-1 - the overlap of unigrams,
- rouge-2 - bigrams,
- rouge-l - the longest common subsequence

In [40]:
rouge_scorer = Rouge()
scores = rouge_scorer.get_scores(df['answer_llm'].iloc[10], df['answer_orig'].iloc[10])[0]

In [41]:
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

## Q5: compute the average between rouge-1, rouge-2 and rouge-l for the same record from Q4

In [56]:
# Initialize the averages dictionary with zeros
averages = {val: 0 for key in scores for val in scores[key]}

# Calculate the averages
for key in scores:
    for val in scores[key]:
        averages[val] += scores[key][val]

# If you need to calculate the average, make sure to divide by the number of scores
# Assuming scores is a nested dictionary and you want the average for each 'val'
for val in averages:
    count = sum(1 for key in scores if val in scores[key])
    averages[val] /= count if count != 0 else 1


In [57]:
averages

{'r': 0.35490035490035493, 'p': 0.35490035490035493, 'f': 0.35490034990035496}

## Q6: What's the aggerage rouge_2 across all the records?

In [58]:
rouge_1 = []
rouge_2 = []
rouge_l = []
for record in tqdm(data_dict):
    scores = rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0]
    rouge_1.append(scores['rouge-1']['f'])
    rouge_2.append(scores['rouge-2']['f'])
    rouge_l.append(scores['rouge-l']['f'])
    # rouge_avg = (rouge_1 + rouge_2 + rouge_l) / 3  

100%|███████████████████████████████████████| 300/300 [00:00<00:00, 348.00it/s]


In [59]:
df["rouge_1_f"]=rouge_1
df["rouge_2_f"]=rouge_2
df["rouge_l_f"]=rouge_l

df["rouge_2_f"].mean()

0.20696501983423318