In [32]:
import pandas as pd
import requests
from anyio import sleep

# Relevance judgements that the scoring cell treats as ground truth.
llm_res = pd.read_csv('../gym/llm_response.csv')
# Relevance data obtained from the Altair API, scored against llm_res.
altair_res = pd.read_csv('../gym/altair_response.csv')


In [33]:


def true_positives(llm, altair):
    """Count the distinct ids that occur in both ``llm`` and ``altair``.

    Both arguments are indexed with ``['id']``; repeated ids within either
    column are collapsed before the two are compared.
    """
    llm_ids = set(llm['id'])
    altair_ids = set(altair['id'])
    return len(llm_ids & altair_ids)

def false_positives(llm, altair):
    """Count the distinct ids in ``altair`` that do not occur in ``llm``.

    The count is taken over unique ids (a set difference), which keeps it
    consistent with the set-based true-positive count. Subtracting the
    true positives from ``len(altair)`` would count every repeated row in
    ``altair`` as an extra false positive.

    Args:
        llm: Object whose ``['id']`` yields the reference ids.
        altair: Object whose ``['id']`` yields the ids being evaluated.

    Returns:
        Number of unique ids present in ``altair`` but absent from ``llm``.
    """
    return len(set(altair['id']) - set(llm['id']))

def false_negatives(llm, altair):
    """Count the distinct ids in ``llm`` that do not occur in ``altair``.

    The count is taken over unique ids (a set difference), which keeps it
    consistent with the set-based true-positive count. Subtracting the
    true positives from ``len(llm)`` would count every repeated row in
    ``llm`` as an extra false negative.

    Args:
        llm: Object whose ``['id']`` yields the reference ids.
        altair: Object whose ``['id']`` yields the ids being evaluated.

    Returns:
        Number of unique ids present in ``llm`` but absent from ``altair``.
    """
    return len(set(llm['id']) - set(altair['id']))

def precision(tp, fp):
    """Return ``tp / (tp + fp)``, or 0.0 when the denominator is zero.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.

    Returns:
        The precision as a float. When ``tp + fp`` is 0 the ratio is
        undefined; 0.0 is returned instead of raising ZeroDivisionError.
    """
    denominator = tp + fp
    if denominator == 0:
        # Undefined ratio: report 0.0 rather than aborting the caller.
        return 0.0
    return tp / denominator

def recall(tp, fn):
    """Return ``tp / (tp + fn)``, or 0.0 when the denominator is zero.

    Args:
        tp: Number of true positives.
        fn: Number of false negatives.

    Returns:
        The recall as a float. When ``tp + fn`` is 0 the ratio is
        undefined; 0.0 is returned instead of raising ZeroDivisionError.
    """
    denominator = tp + fn
    if denominator == 0:
        # Undefined ratio: report 0.0 rather than aborting the caller.
        return 0.0
    return tp / denominator

def f1_score(precision, recall):
    """Return the harmonic mean of ``precision`` and ``recall``.

    Computes ``2 * (precision * recall) / (precision + recall)``.

    Args:
        precision: Precision value.
        recall: Recall value.

    Returns:
        The F1 score as a float. When ``precision + recall`` is 0 the
        ratio is undefined; 0.0 is returned instead of raising
        ZeroDivisionError.
    """
    total = precision + recall
    if total == 0:
        # Both inputs are zero: report 0.0 rather than dividing by zero.
        return 0.0
    return 2 * (precision * recall) / total


In [34]:
# Per-query precision, recall and F1 computed from the relevance data.
#
#   precision = tp / (tp + fp)
#   recall    = tp / (tp + fn)
#   F1        = 2 * (precision * recall) / (precision + recall)
#
# llm_res contains the ground truth relevance data.
# altair_res contains the relevance data from the Altair API.

scores = []

for i in range(1, 11):
    # Ground-truth rows for query i that are flagged as relevant.
    llm = llm_res[(llm_res['query_id'] == i) & (llm_res['relevant'] == True)]
    # Every Altair row for query i.
    altair = altair_res.loc[altair_res['query_id'] == i]

    # Confusion counts between the two id sets.
    tp = true_positives(llm, altair)
    fp = false_positives(llm, altair)
    fn = false_negatives(llm, altair)

    p, r = precision(tp, fp), recall(tp, fn)
    f1 = f1_score(p, r)

    scores.append({
        'query_id': i,
        'f1': f1,
        'precision': p,
        'recall': r,
    })


In [35]:
# Macro-average each metric over the per-query entries collected in `scores`.
n_queries = len(scores)

mean_f1 = sum(entry['f1'] for entry in scores) / n_queries
mean_precision = sum(entry['precision'] for entry in scores) / n_queries
mean_recall = sum(entry['recall'] for entry in scores) / n_queries

print(f'Mean F1 score: {mean_f1}')
print(f'Mean precision: {mean_precision}')
print(f'Mean recall: {mean_recall}')


Mean F1 score: 0.2145115346330196
Mean precision: 0.13105590062111802
Mean recall: 0.9738095238095237
