In [4]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from rouge import Rouge
from tqdm.auto import tqdm

# Load the dataset
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'
url = f'{github_url}?raw=1'
df = pd.read_csv(url)
df = df.iloc[:300]

# Load the embeddings model
model_name = "multi-qa-mpnet-base-dot-v1"
embedding_model = SentenceTransformer(model_name)

# Q1: Create embeddings for the first LLM answer and get the first value of the resulting vector
answer_llm = df.iloc[0].answer_llm
embedding = embedding_model.encode(answer_llm)
first_value = embedding[0]
print("Q1:", first_value)

# Q2: Compute the dot product for each answer pair and determine the 75% percentile of the score
evaluations = []
for i in tqdm(range(len(df)), desc="Computing dot products"):
    answer1 = df.iloc[i].answer_llm
    embedding1 = embedding_model.encode(answer1)
    for j in range(i+1, len(df)):
        answer2 = df.iloc[j].answer_llm
        embedding2 = embedding_model.encode(answer2)
        score = np.dot(embedding1, embedding2)
        evaluations.append(score)

percentile_75 = np.percentile(evaluations, 75)
print("Q2:", percentile_75)

# Q3: Normalize the vectors and compute the cosine similarity, then determine the 75% percentile of the scores
def normalize_vector(v):
    norm = np.sqrt((v * v).sum())
    return v / norm

cosine_scores = []
for i in tqdm(range(len(df)), desc="Computing cosine similarities"):
    answer1 = df.iloc[i].answer_llm
    embedding1 = normalize_vector(embedding_model.encode(answer1))
    for j in range(i+1, len(df)):
        answer2 = df.iloc[j].answer_llm
        embedding2 = normalize_vector(embedding_model.encode(answer2))
        cosine_score = np.dot(embedding1, embedding2)
        cosine_scores.append(cosine_score)

percentile_75_cosine = np.percentile(cosine_scores, 75)
print("Q3:", percentile_75_cosine)

# Q4: Compute the ROUGE score for specific documents and determine the F1 score for rouge-1
rouge_scorer = Rouge()
index = 10
r = df.iloc[index]
scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]
f_score_rouge_1 = scores['rouge-1']['f']
print("Q4:", f_score_rouge_1)

# Q5: Compute the average ROUGE score for the same record
average_rouge = (scores['rouge-1']['f'] + scores['rouge-2']['f'] + scores['rouge-l']['f']) / 3
print("Q5:", average_rouge)

# Q6: Compute the average ROUGE score for all data points
rouge_1_scores = []
rouge_2_scores = []
rouge_l_scores = []

for i in tqdm(range(len(df)), desc="Computing average ROUGE scores"):
    r = df.iloc[i]
    scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]
    rouge_1_scores.append(scores['rouge-1']['f'])
    rouge_2_scores.append(scores['rouge-2']['f'])
    rouge_l_scores.append(scores['rouge-l']['f'])

average_rouge_2 = np.mean(rouge_2_scores)
print("Q6:", average_rouge_2)


You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





Q1: -0.42244655


Computing dot products:   0%|          | 0/300 [00:00<?, ?it/s]

Q2: 12.758448123931885


Computing cosine similarities:   0%|          | 0/300 [00:00<?, ?it/s]

Q3: 0.3429045081138611
Q4: 0.45454544954545456
Q5: 0.35490034990035496


Computing average ROUGE scores:   0%|          | 0/300 [00:00<?, ?it/s]

Q6: 0.20696501983423318


In [2]:
!pip install rouge


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
