In [7]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Environment setup: call load_dotenv() first so the lookups below see any
# variables it adds to the process environment.
load_dotenv()

# API keys for the two providers; each is None when its variable is not set.
OpenAI_key = os.environ.get("OPENAI_API_KEY")
Huggingface_key = os.environ.get("HUGGINGFACE_API_KEY")

# OpenAI SDK client, constructed with no explicit arguments.
client = OpenAI()

In [1]:
import pandas as pd
def _describe_company_pair(row):
    """Build the combined description text for one comparison row.

    Reads `company_1`, `company_2`, `category`, `company_1_details` and
    `company_2_details` from `row` and returns a single string with a header
    line followed by one information section per company.
    """
    first, second = row['company_1'], row['company_2']
    # The literal line breaks and the four leading spaces on the continuation
    # lines are part of the produced text, so the literal is left untouched.
    return f"""{first} and {second} comparison in {row['category']}: \n\n
    {first} information : \n\n {row['company_1_details']}\n\n
    {second} information : \n\n {row['company_2_details']}"""


# Load the comparison data, then attach the per-row description as a new column.
comparison_df = pd.read_csv('data/4_comparison_csv_data/comparison_data.csv')
comparison_df['companies_info'] = comparison_df.apply(_describe_company_pair, axis=1)

In [2]:
# Display the loaded frame, including the newly added `companies_info` column.
comparison_df

Unnamed: 0,company_1,company_2,category,company_1_details,company_2_details,comparison,companies_info
0,arag,axa,Reiseversicherung,Product Name: Top-Schutzbrief\nDetails: Bietet...,Product Name: Auslandskrankenversicherung\nDet...,Here's a comparison table in Markdown format f...,arag and axa comparison in Reiseversicherung: ...
1,arag,axa,Geschäftsinhaltsversicherung,Product Name: Inhaltsversicherung\nDetails: Sc...,Product Name: Inhaltsversicherung für Betriebe...,Here's a comparison table in Markdown format f...,arag and axa comparison in Geschäftsinhaltsver...
2,arag,axa,Elementarversicherung,Product Name: Elementarversicherung - Schutz v...,Product Name: Waldversicherung\nDetails: Versi...,Here's a comparison table in Markdown format f...,arag and axa comparison in Elementarversicheru...
3,arag,axa,Hausratversicherung,Product Name: Hausratversicherung\nDetails: Sc...,Product Name: Hausratversicherung von AXA\nDet...,Here's a comparison table in Markdown format f...,arag and axa comparison in Hausratversicherung...
4,arag,axa,Haftpflichtversicherung,Product Name: Haftpflichtversicherungen\nDetai...,Product Name: Veranstalterhaftpflichtversicher...,Below is a comparison table in Markdown format...,arag and axa comparison in Haftpflichtversiche...


In [4]:
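# Optional TruLens session setup, currently disabled. Uncomment these lines to
# create a session and reset its database before running the evaluations.
# NOTE: the output shown under this cell presumably comes from an earlier run
# with these lines enabled.
# from trulens.core import TruSession
# from trulens.dashboard import run_dashboard

# session = TruSession()
# session.reset_database()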

🦑 Initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `TruSession` to prevent this.
Database schema is ahead of the expected revision. Please update to a later release of `trulens`.


Updating app_name and app_version in apps table: 0it [00:00, ?it/s]
Updating app_id in records table: 0it [00:00, ?it/s]
Updating app_json in apps table: 0it [00:00, ?it/s]


In [20]:
# Ground-truth records: each pairs a company description (`query`) with the
# comparison text stored for it (`expected_response`).
golden_set = [
    {'query': companies_info, 'expected_response': comparison}
    for companies_info, comparison in zip(
        comparison_df['companies_info'], comparison_df['comparison']
    )
]

In [27]:
from trulens.providers.huggingface import Huggingface
from trulens.providers.openai import OpenAI
from trulens.feedback import GroundTruthAgreement


hug_provider = Huggingface()  # for groundedness_measure_with_nli; not used by the loop below

# NOTE: `OpenAI` here is the TruLens provider imported in this cell; it shadows
# the `openai.OpenAI` client class imported in the first cell.
provider = OpenAI(model_engine="gpt-4o")

# A single ground-truth evaluator is sufficient; both names are kept bound to
# it so existing references to either one keep working.
ground_truth_collection = GroundTruthAgreement(golden_set, provider=provider)
gta = ground_truth_collection

# Column order of the resulting frame (also used when there are no rows).
RESULT_COLUMNS = [
    'similarity_score',
    'comprehensiveness_score',
    'groundedness_score',
    'company_1',
    'company_2',
    'category',
]


def _safe_score(label, row_number, score_fn):
    """Run one feedback call and return its score, tolerating failures.

    Args:
        label: Metric name used in the error message (e.g. "similarity").
        row_number: Position of the row being scored, used in the error message.
        score_fn: Zero-argument callable whose result is indexed with `[0]`
            to obtain the score.

    Returns:
        The score, or NaN when the call or the indexing raised. NaN (rather
        than 0) keeps failed evaluations distinguishable from a genuine score
        of 0.0 and lets pandas aggregations skip them.
    """
    try:
        return score_fn()[0]
    except Exception as e:  # best-effort: one failing row must not abort the run
        print(f"An error occurred while processing {label} score at row {row_number}: {e}")
        return float("nan")


score_records = []

# Iterate over row records instead of `comparison_df[col][i]`, which is a
# label lookup and only works while the index is the default 0..n-1 range.
for row_number, row in enumerate(comparison_df.to_dict("records")):
    source_text = row['companies_info']    # the companies' information
    comparison_text = row['comparison']    # the comparison being evaluated

    # NOTE(review): golden_set's `expected_response` is built from the same
    # `comparison` column that is passed as `response` here, so this metric
    # compares the text with itself — confirm this is intended.
    similarity_score = _safe_score(
        "similarity",
        row_number,
        lambda: gta.agreement_measure(prompt=source_text, response=comparison_text),
    )

    # https://www.trulens.org/cookbook/use_cases/summarization_eval/#write-feedback-functions
    comprehensiveness_score = _safe_score(
        "comprehensiveness",
        row_number,
        lambda: provider.comprehensiveness_with_cot_reasons(
            source=source_text,       # the source that should support the statement
            summary=comparison_text,  # the statement (comparison result)
        ),
    )

    # https://www.trulens.org/cookbook/use_cases/summarization_eval/#write-feedback-functions
    groundedness_score = _safe_score(
        "groundedness",
        row_number,
        lambda: provider.groundedness_measure_with_cot_reasons(
            source=source_text,         # the source that should support the statement
            statement=comparison_text,  # the statement (comparison result)
        ),
    )

    print(f'comparison between {row["company_1"]} and {row["company_2"]} in {row["category"]}')
    print(f'similarity_score : {similarity_score}, comprehensiveness_score : {comprehensiveness_score}, groundedness_score : {groundedness_score}')

    score_records.append({
        'similarity_score': similarity_score,
        'comprehensiveness_score': comprehensiveness_score,
        'groundedness_score': groundedness_score,
        'company_1': row['company_1'],
        'company_2': row['company_2'],
        'category': row['category'],
    })

# Build the frame once from the collected records; explicit columns keep the
# schema stable even when no rows were scored.
result_df = pd.DataFrame(score_records, columns=RESULT_COLUMNS)

comparison between arag and axa in Reiseversicherung
similarity_score : 1.0, comprehensiveness_score : 0.3333333333333333, groundedness_score : 0.0
comparison between arag and axa in Geschäftsinhaltsversicherung
similarity_score : 1.0, comprehensiveness_score : 0.8333333333333334, groundedness_score : 0.7368421052631579
comparison between arag and axa in Elementarversicherung
similarity_score : 1.0, comprehensiveness_score : 0.9583333333333334, groundedness_score : 0.0
comparison between arag and axa in Hausratversicherung
similarity_score : 1.0, comprehensiveness_score : 0.6, groundedness_score : 0.6666666666666666
comparison between arag and axa in Haftpflichtversicherung
similarity_score : 1.0, comprehensiveness_score : 0.49999999999999994, groundedness_score : 0.36585365853658536


In [None]:
import matplotlib.pyplot as plt

# Average each score per category. The column names must match the ones
# `result_df` is built with (singular: `similarity_score`, ...).
score_columns = ['similarity_score', 'comprehensiveness_score', 'groundedness_score']
mean_scores = result_df[score_columns + ['category']].groupby('category').mean()

# One bar chart per score, stacked vertically.
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 15))

panels = [
    ('similarity_score', 'Similarity', 'skyblue'),
    ('comprehensiveness_score', 'Comprehensiveness', 'lightgreen'),
    ('groundedness_score', 'Groundedness', 'salmon'),
]
for ax, (column, label, color) in zip(axes, panels):
    mean_scores[column].plot(kind='bar', ax=ax, color=color)
    ax.set_title(f'Mean {label} Score by Category')
    ax.set_ylabel(f'Mean {label} Score')

plt.tight_layout()

# Save with matplotlib directly (no `MatplotlibWriter` is imported or defined
# in this notebook), creating the target directory first, and close the
# figure only after it has been written.
output_path = "data/_poster_draft/7_output_plots/output_plot.png"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
fig.savefig(output_path)
plt.close(fig)


In [28]:
# Display the per-comparison scores together with the company pair and category.
result_df

Unnamed: 0,similarity_score,comprehensiveness_score,groundedness_score,company_1,company_2,category
0,1.0,0.333333,0.0,arag,axa,Reiseversicherung
1,1.0,0.833333,0.736842,arag,axa,Geschäftsinhaltsversicherung
2,1.0,0.958333,0.0,arag,axa,Elementarversicherung
3,1.0,0.6,0.666667,arag,axa,Hausratversicherung
4,1.0,0.5,0.365854,arag,axa,Haftpflichtversicherung


In [None]:
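# Dependencies for this notebook (install with Poetry):
# poetry add trulens trulens-providers-openai trulens-providers-huggingface bert_score evaluate absl-py rouge-score pandas
# NOTE: the notebook also imports `dotenv`, `openai` and `matplotlib`, which are not part of the command above.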