In [1]:
import pandas as pd

In [2]:
gpt_4_mini = pd.read_csv("../assets/04-monitoring/results-gpt4o-mini.csv")

In [3]:
gpt_4_mini.head(3)

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp


**We will use only the first 300 documents**

In [4]:
gpt_4_mini = gpt_4_mini.iloc[:300]

# Q1. Getting the embeddings model
Now, get the embeddings model *multi-qa-mpnet-base-dot-v1* from the [Sentence Transformer library](https://www.sbert.net/docs/sentence_transformer/pretrained_models.html#model-overview).

Create the embeddings for the first LLM answer:

`answer_llm = df.iloc[0].answer_llm`

What's the first value of the resulting vector?

In [5]:
from sentence_transformers import SentenceTransformer

In [6]:
embedding_model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [7]:
answer_llm = gpt_4_mini.iloc[0].answer_llm
answer_llm

'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).'

In [8]:
embedding = embedding_model.encode(answer_llm)

In [9]:
embedding[0]

-0.4224469

# Q2. Computing the dot product
Now for each answer pair, let's create embeddings and compute dot product between them

We will put the results (scores) into the evaluations list

What's the 75% percentile of the score?

In [10]:
from tqdm import tqdm

In [11]:
def compute_similarity(model, record):
    answer_orig = record['answer_orig']
    answer_llm = record['answer_llm']
    
    v_llm = model.encode(answer_llm)
    v_orig = model.encode(answer_orig)
    
    return v_llm.dot(v_orig)

In [12]:
evaluations = []
gpt_4_mini_iter = gpt_4_mini.to_dict(orient='records')
for record in tqdm(gpt_4_mini_iter):
    sim = compute_similarity(embedding_model, record)
    evaluations.append(sim)

100%|██████████| 300/300 [01:31<00:00,  3.26it/s]


In [13]:
gpt_4_mini['cosine'] = evaluations

In [14]:
gpt_4_mini['cosine'].describe().loc["75%"]

31.674306869506836

# Q3. Computing the cosine
From Q2, we can see that the results are not within the [0, 1] range. It's because the vectors coming from this model are not normalized.

So we need to normalize them.

To do it, we

Compute the norm of a vector
Divide each element by this norm

In [15]:
import numpy as np

In [16]:
def normalize(vector):
    norm = np.sqrt((vector * vector).sum())
    return vector / norm

In [17]:
def compute_normalize_similarity(model, record):
    answer_orig = record['answer_orig']
    answer_llm = record['answer_llm']
    
    v_llm = normalize(model.encode(answer_llm))
    v_orig = normalize(model.encode(answer_orig))
    
    return v_llm.dot(v_orig)

In [18]:
evaluations_normalize = []
gpt_4_mini_iter = gpt_4_mini.to_dict(orient='records')
for record in tqdm(gpt_4_mini_iter):
    sim = compute_normalize_similarity(embedding_model, record)
    evaluations_normalize.append(sim)

100%|██████████| 300/300 [01:51<00:00,  2.68it/s]


In [19]:
gpt_4_mini['cosine_norm'] = evaluations_normalize

In [20]:
gpt_4_mini['cosine_norm'].describe()["75%"]

0.8362348228693008

# Q4. Rouge
Now we will explore an alternative metric - the ROUGE score.

This is a set of metrics that compares two answers based on the overlap of n-grams, word sequences, and word pairs.

It can give a more nuanced view of text similarity than just cosine similarity alone.

Let's compute the ROUGE score between the answers at the index 10 of our dataframe (doc_id=5170565b)

There are three scores: `rouge-1`, `rouge-2` and `rouge-l`, and precision, recall and F1 score for each.

* `rouge-1` - the overlap of unigrams,
* `rouge-2` - bigrams,
* `rouge-l` - the longest common subsequence
  
What's the F score for `rouge-1`?

In [21]:
from rouge import Rouge

In [22]:
gpt_4_mini[gpt_4_mini["document"] == "5170565b"]

Unnamed: 0,answer_llm,answer_orig,document,question,course,cosine,cosine_norm
10,"Yes, all sessions are recorded, so if you miss...","Everything is recorded, so you won’t miss anyt...",5170565b,Are sessions recorded if I miss one?,machine-learning-zoomcamp,32.344715,0.777956
11,"Yes, you can ask your questions in advance if ...","Everything is recorded, so you won’t miss anyt...",5170565b,Can I ask questions in advance if I can't atte...,machine-learning-zoomcamp,31.441849,0.783566
12,"If you miss a session, don't worry! Everything...","Everything is recorded, so you won’t miss anyt...",5170565b,How will my questions be addressed if I miss a...,machine-learning-zoomcamp,36.380722,0.904688
13,"Yes, there is a way to catch up on a missed se...","Everything is recorded, so you won’t miss anyt...",5170565b,Is there a way to catch up on a missed session?,machine-learning-zoomcamp,33.340508,0.806303
14,"Yes, you can still interact with instructors a...","Everything is recorded, so you won’t miss anyt...",5170565b,Can I still interact with instructors after mi...,machine-learning-zoomcamp,30.606157,0.727596


In [23]:
record_10 = gpt_4_mini.iloc[10]

In [24]:
rouge_scorer = Rouge()
scores = rouge_scorer.get_scores(record_10['answer_llm'], record_10['answer_orig'])[0]

In [25]:
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [26]:
scores["rouge-1"]["f"]

0.45454544954545456

# Q5. Average rouge score
Let's compute the average F-score between `rouge-1`, `rouge-2` and `rouge-l` for the same record from Q4

In [27]:
np.asarray([metrics["f"] for metrics in scores.values()]).mean()

0.35490034990035496

# Q6. Average rouge score for all the data points
Now let's compute the score for all the records and create a dataframe from them.

What's the average rouge_2 across all the records?

In [28]:
def compute_rouge(rouge_scorer, record, score="rouge-2", metric="f"):
    answer_orig = record['answer_orig']
    answer_llm = record['answer_llm']

    scores = rouge_scorer.get_scores(answer_llm, answer_orig)[0]
    return scores[score][metric]

In [29]:
f1_rouge_2_scores = []
gpt_4_mini_iter = gpt_4_mini.to_dict(orient='records')
for record in tqdm(gpt_4_mini_iter):
    score = compute_rouge(rouge_scorer, record, score="rouge-2", metric="f")
    f1_rouge_2_scores.append(score)

100%|██████████| 300/300 [00:00<00:00, 552.54it/s]


In [30]:
np.asarray(f1_rouge_2_scores).mean() 

0.20696501983423318