In [1]:
import os
import re

import pandas as pd
from openai import OpenAI

In [2]:
# OpenAI API client. The key is read from the OPENAI_API_KEY environment
# variable; an empty string is passed when the variable is not set.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY", ""))

In [3]:
# Reference answer and the candidate answers that are scored against it.
# NOTE(review): the continuation lines inside the triple-quoted strings are
# indented, so that leading whitespace (and the newlines) are part of each
# string value that gets scored.

# Reference ("expected") answer; every candidate below is compared to this text.
expected_output = """The crocodilian digestive system is highly adapted to their lifestyle. Crocodilians are known to swallow stones, known as gastroliths , 
                     which help digest their prey. The crocodilian stomach is divided into two chambers. The first is powerful and muscular. 
                     The other stomach is the most acidic digestive system of any animal. It can digest mostly everything from their prey, including bones, 
                     feathers, and horns!. Final answer : two"""

# Candidate answer; going by the name, presumably produced before fine-tuning -- TODO confirm.
before_fine_tuning = """The correct answer is: None!
                        Crocodiles, like all reptiles, have a simple, tubular stomach without any chambers. The stomach is a long, 
                        muscular tube that helps to mix and break down food, but it does not have separate chambers like the stomachs 
                        of mammals and some other animals."""

# Candidate answer; going by the name, presumably produced after fine-tuning -- TODO confirm.
after_fine_tuning = """That's a clever question! However, I must clarify that crocodiles do not have stomachs in the classical sense. Instead, 
                       they have a digestive system that involves breaking down food in their mouth and then absorbing nutrients through their skin.
                       So, to answer your question, the stomach of a crocodile does not have any chambers."""

# Candidate answer; presumably from the HybridRAG pipeline without fine-tuning -- TODO confirm.
HybridRAG = """two_chambers"""

# Candidate answer; presumably from the HybridRAG pipeline with a fine-tuned model -- TODO confirm.
HybridRAG_fine_tuned = """Answer: Based on the provided knowledge, I can answer that the stomach of a crocodile has two chambers."""

In [4]:
# Prompt skeleton shared by every metric. `str.format` fills in five named
# placeholders: criteria, steps, expected_output, actual_output, metric_name.
# The section labels use the same "expected output" / "actual output" wording
# as the instructions so the grader can tell which text is the reference.
EVALUATION_PROMPT_TEMPLATE = """
You will be given one expected output and one actual output. Your task is to rate the actual output on one metric.
Please make sure you read and understand these instructions very carefully.
Please keep the expected output open while reviewing, and refer to it as needed.

Evaluation Criteria:

{criteria}

Evaluation Steps:

{steps}

Example:

Expected Output:

{expected_output}

Actual Output:

{actual_output}

Evaluation Form (scores ONLY):

- {metric_name}
"""


# Criteria / step texts that are substituted into the {criteria} and {steps}
# placeholders of the evaluation prompt. All of them use the same vocabulary
# ("expected output" = reference, "actual output" = answer being graded).

# Metric 1: Relevance

RELEVANCY_SCORE_CRITERIA = """
Relevance(1-5) - selection of important content from the expected output. \
The actual output should include only important information from the expected output. \
Annotators were instructed to penalize actual outputs which contained redundancies and excess information.
"""

RELEVANCY_SCORE_STEPS = """
1. Read the actual output and the expected output carefully.
2. Compare the actual output to the expected output and identify the main points of the expected output.
3. Assess how well the actual output covers the main points of the expected output, and how much irrelevant or redundant information it contains.
4. Assign a relevance score from 1 to 5.
"""

# Metric 2: Coherence

COHERENCE_SCORE_CRITERIA = """
Coherence(1-5) - the collective quality of all sentences in the actual output based on the expected output.
"""

COHERENCE_SCORE_STEPS = """
1. Read the expected output carefully and identify the main topic and key points.
2. Read the actual output and compare it to the expected output. Check if the actual output covers the main topic and key points of the expected output, and if it presents them in a clear and logical order.
3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria.
"""

# Metric 3: Correctness

CORRECTNESS_SCORE_CRITERIA = """
Correctness(1-5) - whether the actual output is factually correct based on the expected output.
"""

CORRECTNESS_SCORE_STEPS = """
1. Read the actual output carefully.
2. Compare the actual output to the expected output and identify the main facts stated in the expected output.
3. Assess whether the actual output agrees with those facts, and note any statement that contradicts or is not supported by the expected output.
4. Assign a correctness score from 1 to 5, where 1 is the lowest and 5 is the highest.
"""

In [5]:
def highlight_max(s):
    """Build per-cell CSS that bolds the largest value(s) of ``s``.

    Args:
        s: A pandas Series of scores. Further down this file the function is
            handed to ``Styler.apply`` with ``axis=1``.

    Returns:
        A list with one entry per element of ``s``: ``"font-weight: bold"``
        where the element equals ``s.max()``, otherwise an empty string
        (bold only, no background colour). Ties are all bolded.
    """
    bold, plain = "font-weight: bold", ""
    styles = []
    for at_peak in s.eq(s.max()):
        styles.append(bold if at_peak else plain)
    return styles


def get_geval_score(
    criteria: str, steps: str, expected_output: str, actual_output: str, metric_name: str
):
    """Ask the chat model to grade ``actual_output`` against ``expected_output``.

    The five arguments are substituted into ``EVALUATION_PROMPT_TEMPLATE`` and
    the filled prompt is sent to the module-level ``client`` as a single user
    message.

    Args:
        criteria: Text for the prompt's evaluation-criteria section.
        steps: Text for the prompt's evaluation-steps section.
        expected_output: Reference answer.
        actual_output: Answer being graded.
        metric_name: Name of the metric, placed in the evaluation form.

    Returns:
        The ``message.content`` of the first choice in the completion
        response, returned as-is (no parsing is done here).
    """
    template_fields = {
        "criteria": criteria,
        "steps": steps,
        "expected_output": expected_output,
        "actual_output": actual_output,
        "metric_name": metric_name,
    }
    filled_prompt = EVALUATION_PROMPT_TEMPLATE.format(**template_fields)

    # Fixed request settings; max_tokens=5 keeps the reply to a few tokens.
    request_options = dict(
        model="gpt-4",
        messages=[{"role": "user", "content": filled_prompt}],
        temperature=0,
        max_tokens=5,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    completion = client.chat.completions.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.message.content


def _parse_score(raw_reply):
    """Extract the 1-5 rating from the grader's reply.

    Accepts a bare digit as well as replies that wrap it in text such as
    "Relevance: 5" or "5/5".

    Raises:
        ValueError: If the reply is empty/None or contains no standalone
            digit between 1 and 5.
    """
    found = re.search(r"\b[1-5]\b", raw_reply or "")
    if found is None:
        raise ValueError(f"Could not find a 1-5 score in model reply: {raw_reply!r}")
    return int(found.group())


# Metric name -> (criteria text, steps text) used to fill the evaluation prompt.
evaluation_metrics = {
    "Relevance": (RELEVANCY_SCORE_CRITERIA, RELEVANCY_SCORE_STEPS),
    "Coherence": (COHERENCE_SCORE_CRITERIA, COHERENCE_SCORE_STEPS),
    "Correctness": (CORRECTNESS_SCORE_CRITERIA, CORRECTNESS_SCORE_STEPS),
}

# Candidate label -> candidate answer text to be graded.
summaries = {
    "before_fine_tuning": before_fine_tuning,
    "after_fine_tuning": after_fine_tuning,
    "HybridRAG": HybridRAG,
    "HybridRAG_fine_tuned": HybridRAG_fine_tuned,
}

# Long-format results: one entry per (metric, candidate) pair.
data = {"Evaluation Type": [], "Summary Type": [], "Score": []}

for eval_type, (criteria, steps) in evaluation_metrics.items():
    for summ_type, summary in summaries.items():
        result = get_geval_score(criteria, steps, expected_output, summary, eval_type)
        score_num = _parse_score(result)
        # Append all three columns together, only after the score was obtained,
        # so a failed request or unparsable reply cannot leave the columns
        # with different lengths.
        data["Evaluation Type"].append(eval_type)
        data["Summary Type"].append(summ_type)
        data["Score"].append(score_num)

# Reshape to one row per metric and one column per candidate.
pivot_df = pd.DataFrame(data).pivot(
    index="Evaluation Type", columns="Summary Type", values="Score"
)
# Bold the best score(s) within each metric row.
styled_pivot_df = pivot_df.style.apply(highlight_max, axis=1)
display(styled_pivot_df)

Summary Type,HybridRAG,HybridRAG_fine_tuned,after_fine_tuning,before_fine_tuning
Evaluation Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Coherence,5,5,1,1
Correctness,5,5,1,1
Relevance,5,5,1,1


In [6]:
from bert_score import BERTScorer

# Instantiate the BERTScorer object for English language
scorer = BERTScorer(lang="en")

# Candidate label -> candidate answer; each one is scored against the
# reference answer in `expected_output`.
bert_candidates = {
    "before_fine_tuning": before_fine_tuning,
    "after_fine_tuning": after_fine_tuning,
    "HybridRAG": HybridRAG,
    "HybridRAG_fine_tuned": HybridRAG_fine_tuned,
}

# `scorer.score` returns (precision, recall, F1); each candidate is scored in
# its own call with a single candidate/reference pair, so element 0 of each
# result is that candidate's score.
bert_scores = {}
for candidate_name, candidate_text in bert_candidates.items():
    precision, recall, f1 = scorer.score([candidate_text], [expected_output])
    bert_scores[candidate_name] = {
        "F1 Score": f1.tolist()[0],
        "Precision": precision.tolist()[0],
        "Recall": recall.tolist()[0],
    }

# Report grouped by metric (all F1 scores, then precision, then recall) with
# one consistently formatted "<candidate> <metric>: <value>" line each.
for metric_label in ("F1 Score", "Precision", "Recall"):
    for candidate_name, candidate_scores in bert_scores.items():
        print(f"{candidate_name} {metric_label}:", candidate_scores[metric_label])

  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


before_fine_tuning F1 Score: 0.7981292009353638
after_fine_tuning 2 F1 Score: 0.814254105091095
HybridRAG F1 Score: 0.7817310094833374
HybridRAG_fine_tuned F1 Score: 0.8686626553535461
before_fine_tuning Precision: 0.7413856983184814
after_fine_tuning  Precision: 0.7742841839790344
HybridRAG Precision: 0.7801142930984497
HybridRAG_fine_tuned Precision: 0.8831270933151245
before_fine_tuning Recall: 0.8642784357070923
after_fine_tuning 2 Recall: 0.8585754036903381
HybridRAG Recall: 0.7833545207977295
HybridRAG_fine_tuned Recall: 0.8546644449234009


: 