In [1]:
# pip install -q scikit-learn litellm


In [1]:
# Import TruLens and the MS MARCO context-relevance benchmark helpers
from trulens_eval import Tru
from test_cases import generate_ms_marco_context_relevance_benchmark
from benchmark_frameworks.eval_as_recommendation import score_passages, compute_ndcg, compute_ece, recall_at_k, precision_at_k
# Start from an empty TruLens database before benchmarking.
Tru().reset_database()

# Gather the benchmark examples from the five MS MARCO shards (suffixes 1..5)
# into a single flat list.
benchmark_data = [
    example
    for shard in range(1, 6)
    for example in generate_ms_marco_context_relevance_benchmark(
        f"./datasets/ms_marco/ms_marco_train_v2.1_{shard}.json"
    )
]


🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


In [2]:
import os

# Placeholder credentials for the providers used in this notebook.
# Replace "..." locally with real keys; never commit real secrets.
for _provider_key in (
    "OPENAI_API_KEY",
    "HUGGINGFACE_API_KEY",
    "ANTHROPIC_API_KEY",
    "TOGETHERAI_API_KEY",
):
    os.environ[_provider_key] = "..."

In [3]:
import pandas as pd
import numpy as np
# One row per benchmark example produced by the loader cell.
df = pd.DataFrame(benchmark_data)

# Report how many distinct queries the benchmark contains
# (one row per query_id after grouping).
per_query_counts = df.groupby("query_id").count()
print(len(per_query_counts))

305


In [4]:
# Preview the first five rows of every query group (GroupBy.head defaults to n=5).
df.groupby("query_id").head(n=5)

Unnamed: 0,query_id,query,passage,is_selected,relevant_idx
0,1185869,)what was the immediate impact of the success ...,The presence of communication amid scientific ...,1,0
1,1185869,)what was the immediate impact of the success ...,The Manhattan Project and its atomic bomb help...,0,0
2,1185869,)what was the immediate impact of the success ...,Essay on The Manhattan Project - The Manhattan...,0,0
3,1185869,)what was the immediate impact of the success ...,The Manhattan Project was the name for a proje...,0,0
4,1185869,)what was the immediate impact of the success ...,versions of each volume as well as complementa...,0,0
...,...,...,...,...,...
3032,565901,what are some things you can do to keep your d...,Eating the right foods not only makes it easie...,0,9
3033,565901,what are some things you can do to keep your d...,Eat a healthy diet. Photo Credit Tay Jnr/Digit...,0,9
3034,565901,what are some things you can do to keep your d...,Share. Your digestive system is where it all b...,0,9
3035,565901,what are some things you can do to keep your d...,"Start Slideshow. For some of us, digestive dis...",0,9


#### Define feedback functions for context relevance to be evaluated

In [15]:

# Benchmark run: compute the calibration error (ECE) of every feedback
# function over the benchmark frame `df`.
# NOTE: `feedback_functions` and `backoffs_by_functions` are defined in the
# "Temperature Scaling" setup cell further down; that cell must have been
# executed before this one.
K = 5  # for recall@K

result_columns = ['Model', 'ECE']
results = []
intermediate_results = []

for name, func in feedback_functions.items():
    # Models without an explicit entry fall back to a backoff of 0.5.
    backoff = backoffs_by_functions.get(name, 0.5)
    try:
        score_confidence_pairs, _ = score_passages(df, name, func, backoff, n=1)
        # NOTE(review): element [1] is passed first and element [0] second;
        # confirm this matches compute_ece's expected argument order.
        ece_value = compute_ece(score_confidence_pairs[1], score_confidence_pairs[0])

        results.append((name, ece_value))
        print(f"Finished running feedback function name {name}")

        # Snapshot the results gathered so far after each successful model.
        print("Saving results...")
        tmp_results_df = pd.DataFrame(results, columns=result_columns)
        print(tmp_results_df)
        intermediate_results.append(tmp_results_df)
    except Exception as e:
        # Best effort: report the failure and continue with the next model.
        print(f"Failed to run benchmark for feedback function name {name} due to {e}")

# Convert results to DataFrame for display
results_df = pd.DataFrame(results, columns=result_columns)


Failed to run benchmark for feedback function name GPT-3.5-Turbo due to LLMProvider.qs_relevance() got an unexpected keyword argument 'temperature'
Feedback avg score for query 677 is (0.7, 0.8), is_selected is 0
Feedback avg score for query 677 is (0.2, 0.7), is_selected is 0


KeyboardInterrupt: 

In [22]:
# Display the per-model ECE table built by the benchmark cell.
results_df

Unnamed: 0,Model,ECE
0,GPT-3.5-Turbo,0.21
1,GPT-4-Turbo,0.275
2,Claude-1,0.14
3,Claude-2,0.175


### Visualization

In [10]:
# Persist the ECE table next to the notebook. The DataFrame index is written
# as the first (unnamed) CSV column because to_csv defaults to index=True.
ece_csv_path = "results_verbalized_ece.csv"
results_df.to_csv(ece_csv_path)

In [None]:

import matplotlib.pyplot as plt


# Plot the benchmark metrics that results_df actually contains.
# Requires `results_df` and `K` to be defined by an earlier cell.
#
# The benchmark cell builds results_df with only the 'Model' and 'ECE'
# columns, so unconditionally asking DataFrame.plot for nDCG / Recall@K /
# Precision@1 raises a KeyError. The ranking-metric panel is therefore drawn
# only for the columns that are present.
ranking_metrics = [
    column
    for column in ('nDCG', f'Recall@{K}', 'Precision@1')
    if column in results_df.columns
]

# One panel for ECE, plus one for the ranking metrics when any are available.
n_panels = 2 if ranking_metrics else 1
fig, axes = plt.subplots(n_panels, 1, figsize=(12, 5 * n_panels), squeeze=False)
panels = list(axes[:, 0])

# Graph for nDCG, Recall@K, and Precision@1
if ranking_metrics:
    ax_ranking = panels.pop(0)
    results_df.plot(x='Model', y=ranking_metrics, kind='bar', ax=ax_ranking)
    ax_ranking.set_title('Feedback Function Performance (Higher is Better)')
    ax_ranking.set_ylabel('Score')
    ax_ranking.tick_params(axis='x', labelrotation=45)
    ax_ranking.legend(loc='upper left')

# Graph for ECE
ax_ece = panels.pop(0)
results_df.plot(x='Model', y=['ECE'], kind='bar', ax=ax_ece, color='orange')
ax_ece.set_title('Feedback Function Calibration (Lower is Better)')
ax_ece.set_ylabel('ECE')
ax_ece.tick_params(axis='x', labelrotation=45)

fig.tight_layout()
plt.show()




### Temperature Scaling 

In [10]:
from trulens_eval.feedback import OpenAI, LiteLLM

# Temperature values iterated over by the benchmark loop in the next cell.
temperatures = [0, 0.3, 0.7, 1]
# GPT-3.5 Turbo provider, used by wrapped_relevance_turbo_t below.
turbo = OpenAI(model_engine="gpt-3.5-turbo")

def wrapped_relevance_turbo_t(input, output, temperature):
    """Forward the arguments to the GPT-3.5 Turbo provider's relevance scorer.

    Args:
        input: First argument handed to the scorer (presumably the query —
            confirm against score_passages).
        output: Second argument handed to the scorer (presumably the passage —
            confirm against score_passages).
        temperature: Third argument handed to the scorer.

    Returns:
        Whatever ``turbo.qs_relevance_confidence_verb_2s_top1`` returns,
        unchanged.
    """
    result = turbo.qs_relevance_confidence_verb_2s_top1(input, output, temperature)
    return result
 
# GPT-4 Turbo provider, used by wrapped_relevance_gpt4_t below.
gpt4 = OpenAI(model_engine="gpt-4-1106-preview")

def wrapped_relevance_gpt4_t(input, output, temperature):
    """Forward the arguments to the GPT-4 Turbo provider's relevance scorer.

    Args:
        input: First argument handed to the scorer (presumably the query —
            confirm against score_passages).
        output: Second argument handed to the scorer (presumably the passage —
            confirm against score_passages).
        temperature: Third argument handed to the scorer.

    Returns:
        Whatever ``gpt4.qs_relevance_confidence_verb_2s_top1`` returns,
        unchanged.
    """
    result = gpt4.qs_relevance_confidence_verb_2s_top1(input, output, temperature)
    return result

# Claude Instant 1 provider (via LiteLLM), used by wrapped_relevance_claude1_t below.
claude_1 = LiteLLM(model_engine="claude-instant-1")
def wrapped_relevance_claude1_t(input, output, temperature):
    """Forward the arguments to the Claude Instant 1 provider's relevance scorer.

    The scorer's result is returned to the caller; without the ``return`` the
    wrapper would always yield ``None`` and the score would be lost.

    Args:
        input: First argument handed to the scorer (presumably the query —
            confirm against score_passages).
        output: Second argument handed to the scorer (presumably the passage —
            confirm against score_passages).
        temperature: Third argument handed to the scorer.

    Returns:
        Whatever ``claude_1.qs_relevance_confidence_verb_2s_top1`` returns,
        unchanged.
    """
    return claude_1.qs_relevance_confidence_verb_2s_top1(input, output, temperature)

# Claude 2 provider (via LiteLLM), used by wrapped_relevance_claude2_t below.
claude_2 = LiteLLM(model_engine="claude-2")
def wrapped_relevance_claude2_t(input, output, temperature):
    """Forward the arguments to the Claude 2 provider's relevance scorer.

    The scorer's result is returned to the caller; without the ``return`` the
    wrapper would always yield ``None`` and the score would be lost.

    Args:
        input: First argument handed to the scorer (presumably the query —
            confirm against score_passages).
        output: Second argument handed to the scorer (presumably the passage —
            confirm against score_passages).
        temperature: Third argument handed to the scorer.

    Returns:
        Whatever ``claude_2.qs_relevance_confidence_verb_2s_top1`` returns,
        unchanged.
    """
    return claude_2.qs_relevance_confidence_verb_2s_top1(input, output, temperature)

# Display name -> scoring callable. The benchmark loops iterate over this
# mapping and use the key as the model label in the results table.
feedback_functions = dict(
    [
        ("GPT-3.5-Turbo", wrapped_relevance_turbo_t),
        ("GPT-4-Turbo", wrapped_relevance_gpt4_t),
        ("Claude-1", wrapped_relevance_claude1_t),
        ("Claude-2", wrapped_relevance_claude2_t),
    ]
)

# Per-model backoff value, passed as the fourth positional argument to
# score_passages; models missing from this mapping get 0.5 in the benchmark
# loops. NOTE(review): the unit is presumably seconds — confirm against
# score_passages.
backoffs_by_functions = dict(
    [
        ("GPT-3.5-Turbo", 0),
        ("GPT-4-Turbo", 0),
        ("Claude-1", 1.5),
        ("Claude-2", 1.5),
    ]
)

In [11]:
# Temperature sweep: measure the calibration error (ECE) of every feedback
# function at each value in `temperatures`.
K = 5 # for recall@K

# Results table of every temperature, keyed by temperature. `results_df` is
# rebound on each pass of the loop, so without this mapping only the table of
# the last temperature would survive the sweep.
results_by_temperature = {}

for temp in temperatures:
    # Running the benchmark
    results = []
    intermediate_results = []
    result_columns = [f'Model-t-{temp}', 'ECE']

    for name, func in feedback_functions.items():
        # Models without an explicit entry fall back to a backoff of 0.5.
        backoff = backoffs_by_functions.get(name, 0.5)
        try:
            scores, true_relevance = score_passages(
                df, name, func, backoff, n=1, temperature=temp
            )
            ece_value = compute_ece(scores, true_relevance)

            results.append((name, ece_value))
            print(f"Finished running feedback function name {name}")

            # Snapshot the results gathered so far after each successful model.
            print("Saving results...")
            tmp_results_df = pd.DataFrame(results, columns=result_columns)
            print(tmp_results_df)
            intermediate_results.append(tmp_results_df)
        except Exception as e:
            # Best effort: report the failure and continue with the next model.
            print(f"Failed to run benchmark for feedback function name {name} due to {e}")

    # Convert results to DataFrame for display; after the loop `results_df`
    # still refers to the table of the last temperature.
    results_df = pd.DataFrame(results, columns=result_columns)
    results_by_temperature[temp] = results_df


Feedback avg score for query 677 is 0.2, is_selected is 0
Feedback avg score for query 677 is 0.4, is_selected is 0
Feedback avg score for query 677 is 0.8, is_selected is 0
Feedback avg score for query 677 is 0.8, is_selected is 1
Feedback avg score for query 677 is 0.8, is_selected is 0
Feedback avg score for query 677 is 0.8, is_selected is 0
Feedback avg score for query 677 is 0.8, is_selected is 0
Feedback avg score for query 677 is 0.4, is_selected is 0
Feedback avg score for query 677 is 0.4, is_selected is 0
Feedback avg score for query 677 is 0.5, is_selected is 0
Query 677 scored 10 out of 10 passages.
Feedback avg score for query 18268 is 0.4, is_selected is 0
Feedback avg score for query 18268 is 0.8, is_selected is 1
Feedback avg score for query 18268 is 0.8, is_selected is 0
Feedback avg score for query 18268 is 0.5, is_selected is 0
Feedback avg score for query 18268 is 0.8, is_selected is 0
Feedback avg score for query 18268 is 0.5, is_selected is 0
Feedback avg score f

KeyboardInterrupt: 