In [1]:
import os
import re

import pandas as pd
from openai import OpenAI

In [None]:
# Shared OpenAI client used by the LLM-as-judge cell further down.
# The key is read from the OPENAI_API_KEY environment variable; when it is unset
# the placeholder string is passed instead, so a missing key presumably only
# surfaces once a request is actually sent (TODO confirm).
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY", "YOUR_API_KEY_HERE"),
)

In [3]:
# Inputs for every metric in this notebook: one reference text and two model
# responses that are each scored against it.

# Reference text that both responses are compared to.
expected_output = """42.1 Circulatory systems link exchange surfaces with cells throughout the body. Final Answer :circulatory"""

# Response stored under the "before_fine_tuning" label.
# NOTE: the continuation lines of this triple-quoted literal keep their leading
# indentation, so that whitespace is part of the string handed to the scorers.
before_fine_tuning = """The organ systems that link exchange surfaces with cells throughout the body are:
                    Vascular System (Circulatory System): Consists of the heart, arteries, veins, and blood vessels that transport oxygenated blood to body tissues and coordinate the exchange of oxygen, carbon dioxide, and nutrient molecules between the blood and cells.
                    Respiratory System: Comprises the lungs, trachea, and bronchi that facilitate the exchange of oxygen and carbon dioxide through inhalation and exhalation, allowing cells to extract oxygen and eliminate carbon dioxide.
                    Urinary System (Excretory System): Includes the kidneys, ureters, bladder, and urethra that filter and eliminate waste products, excess ions, and water from the body, maintaining a stable environment for cells.
                    Integumentary System: Encompasses the skin, hair, nails, and associated glands that regulate body temperature, protect against external damage, and facilitate the exchange of materials."""

# Response stored under the "after_fine_tuning" label (multiple-choice style
# answer followed by an explanation); the same indentation caveat applies.
after_fine_tuning = """A. circulatory system
                    B. respiratory system
                    C. skeletal system
                    D. integumentary system

                    Answer: A. circulatory system
                    The circulatory system, also known as the cardiovascular system, is responsible for moving blood throughout the body. It links exchange surfaces with cells throughout the body. The main organs of the circulatory system are the heart, arteries, veins, and blood vessels. The heart pumps blood throughout the body. Arteries carry oxygenated blood away from the heart. Veins return deoxygenated blood to the heart. Blood vessels include both arteries and veins. The circulatory system is responsible for delivering oxygen and nutrients to cells and organs throughout the body. It also picks up waste products and returns them to the excretory system for elimination. In addition, the circulatory system plays a key role in maintaining body temperature, as it helps to distribute heat generated by metabolic processes throughout the body."""


In [4]:
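# # Alternative example (disabled): uncomment the assignments in this cell to evaluate the group 13 metals / amalgam question instead of the one defined above.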
# expected_output = """The metals of group 13 (Al, Ga, In, and Tl) are all reactive. However, 
#                     passivation occurs as a tough, hard, thin film of the metal oxide forms 
#                     upon exposure to air. Disruption of this film may counter the passivation, 
#                     allowing the metal to react. One way to disrupt the film is to expose the 
#                     passivated metal to mercury. Some of the metal dissolves in the mercury to 
#                     form an amalgam, which sheds the protective oxide layer to expose the metal 
#                     to further reaction. The formation of an amalgam allows the metal to react 
#                     with air and water. Final answer : air and water"""

# before_fine_tuning = """The correct answer is: mercury.
#                     When a metal reacts with mercury, it forms an amalgam. The reactants typically include a metal 
#                     (such as silver, gold, or tin) and mercury. During this reaction, the metal atoms lose electrons
#                     and form ions, which then combine with mercury atoms to form a new compound with a distinct color,
#                     texture, and properties."""

# after_fine_tuning = """A. Air
#                     B. Water
#                     C. Mercury
#                     D. The surrounding tooth material
                    
#                     Answer: D. The surrounding tooth material
#                     The formation of an amalgam allows the metal to react with the surrounding tooth material, which is 
#                     mostly dentin. The mercury in the amalgam reacts with the copper in the alloy to form a stable compound 
#                     that prevents further corrosion. The resulting amalgam is strong and durable, making it well suited for 
#                     use in dental restorations such as fillings and inlays. In addition, amalgams are also used in crowns, 
#                     bridges, and dentures."""

In [5]:
# Prompt skeleton sent to the judge model for every (metric, response) pair.
# Placeholders filled via str.format:
#   {criteria}        - description of the metric being scored
#   {steps}           - numbered instructions the judge should follow
#   {expected_output} - the reference text
#   {actual_output}   - the model response being rated
#   {metric_name}     - label printed in the evaluation form the judge completes
# The reference section is labelled "Expected Output" so that it matches the
# wording used by the instructions and by the criteria/steps strings.
EVALUATION_PROMPT_TEMPLATE = """
You will be given one expected output and one actual output. Your task is to rate the actual output on one metric.
Please make sure you read and understand these instructions very carefully.
Please keep the expected output open while reviewing, and refer to it as needed.

Evaluation Criteria:

{criteria}

Evaluation Steps:

{steps}

Example:

Expected Output:

{expected_output}

Actual Output:

{actual_output}

Evaluation Form (scores ONLY):

- {metric_name}
"""


# Metric 1: Relevance
# How much of the actual output is important content taken from the expected
# output. The text below speaks only of "expected output" / "actual output",
# the two sections the evaluation prompt actually contains.

# The trailing backslashes join the three sentences into a single line.
RELEVANCY_SCORE_CRITERIA = """
Relevance(1-5) - selection of important content from the expected output. \
The actual output should include only important information from the expected output. \
Annotators were instructed to penalize actual outputs which contained redundancies and excess information.
"""

RELEVANCY_SCORE_STEPS = """
1. Read the actual output and the expected output carefully.
2. Compare the actual output to the expected output and identify the main points of the expected output.
3. Assess how well the actual output covers the main points of the expected output, and how much irrelevant or redundant information it contains.
4. Assign a relevance score from 1 to 5.
"""

# Metric 2: Coherence
# Overall quality of the actual output's sentences, judged against the expected
# output. The scale is stated in the criteria, as it is for relevance, and the
# stray list-separator commas that used to end steps 1 and 2 are removed.

COHERENCE_SCORE_CRITERIA = """
Coherence(1-5) - the collective quality of all sentences in the actual output based on the expected output.
"""

COHERENCE_SCORE_STEPS = """
1. Read the expected output carefully and identify the main topic and key points.
2. Read the actual output and compare it to the expected output. Check if the actual output covers the main topic and key points of the expected output, and if it presents them in a clear and logical order.
3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria.
"""

# Metric 3: Correctness
# Whether the actual output is factually correct when checked against the
# expected output. The steps ask the judge for a correctness score (not a
# relevance score) so that the instructions match the metric being reported.

CORRECTNESS_SCORE_CRITERIA = """
Correctness(1-5) - determine whether the actual output is factually correct based on the expected output.
"""

CORRECTNESS_SCORE_STEPS = """
1. Read the actual output carefully.
2. Compare the actual output to the expected output and identify the main points of the expected output.
3. Assess whether the statements in the actual output agree with the main points of the expected output, and note any statements that contradict it.
4. Assign a correctness score from 1 to 5.
"""

In [6]:
def highlight_max(s):
    """Build per-element CSS that bolds every entry equal to the maximum of ``s``.

    Used with ``Styler.apply(..., axis=1)`` in this notebook, so ``s`` is one
    row at a time.

    Returns:
        A list with one CSS string per element of ``s``: ``"font-weight: bold"``
        where the element equals ``s.max()`` (ties are all bolded), otherwise
        an empty string (no styling, no background).
    """
    mask = s == s.max()
    styles = []
    for flag in mask:
        if flag:
            styles.append("font-weight: bold")
        else:
            styles.append("")
    return styles


def get_geval_score(
    criteria: str, steps: str, expected_output: str, actual_output: str, metric_name: str
):
    """Ask the judge model to rate one actual output on one metric.

    The five arguments are substituted into ``EVALUATION_PROMPT_TEMPLATE`` and
    the result is sent as a single user message through the module-level
    ``client``.

    Args:
        criteria: Description of the metric being scored.
        steps: Numbered evaluation steps for the judge to follow.
        expected_output: Reference text.
        actual_output: Model response to rate.
        metric_name: Label placed in the evaluation form at the end of the prompt.

    Returns:
        The message content of the first completion choice, unparsed; turning
        it into a number is left to the caller.
    """
    template_fields = {
        "criteria": criteria,
        "steps": steps,
        "expected_output": expected_output,
        "actual_output": actual_output,
        "metric_name": metric_name,
    }
    prompt = EVALUATION_PROMPT_TEMPLATE.format(**template_fields)

    # temperature=0 with a 5-token cap: the reply is meant to be just the score.
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=5,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


# Metric name -> (criteria, steps) prompt fragments used to judge that metric.
evaluation_metrics = {
    "Relevance": (RELEVANCY_SCORE_CRITERIA, RELEVANCY_SCORE_STEPS),
    "Coherence": (COHERENCE_SCORE_CRITERIA, COHERENCE_SCORE_STEPS),
    "Correctness": (CORRECTNESS_SCORE_CRITERIA, CORRECTNESS_SCORE_STEPS),
}

# Responses under comparison, keyed by the column label shown in the results table.
summaries = {"before_fine_tuning": before_fine_tuning, "after_fine_tuning": after_fine_tuning}


def _parse_score(raw_reply):
    """Extract the 1-5 score from the judge model's reply.

    The reply is capped at a few tokens but is not guaranteed to be a bare
    digit (for example "Relevance: 4" or "4."), so the first standalone digit
    between 1 and 5 is taken rather than calling ``int()`` on the whole reply.

    Args:
        raw_reply: Text returned by ``get_geval_score``; ``None`` is treated as
            an empty reply.

    Returns:
        The score as an ``int`` in the range 1-5.

    Raises:
        ValueError: If the reply contains no standalone digit from 1 to 5.
    """
    match = re.search(r"\b[1-5]\b", raw_reply or "")
    if match is None:
        raise ValueError(f"No 1-5 score found in model reply: {raw_reply!r}")
    return int(match.group())


# Long-format results: one row per (metric, response) pair.
data = {"Evaluation Type": [], "Summary Type": [], "Score": []}

for eval_type, (criteria, steps) in evaluation_metrics.items():
    for summ_type, summary in summaries.items():
        result = get_geval_score(criteria, steps, expected_output, summary, eval_type)
        score_num = _parse_score(result)
        # Append all three columns only after the score is known, so a failed
        # request or unparsable reply cannot leave the columns at unequal lengths.
        data["Evaluation Type"].append(eval_type)
        data["Summary Type"].append(summ_type)
        data["Score"].append(score_num)

# Reshape to one row per metric and one column per response, bolding each row's best score.
pivot_df = pd.DataFrame(data).pivot(
    index="Evaluation Type", columns="Summary Type", values="Score"
)
styled_pivot_df = pivot_df.style.apply(highlight_max, axis=1)
display(styled_pivot_df)

Summary Type,after_fine_tuning,before_fine_tuning
Evaluation Type,Unnamed: 1_level_1,Unnamed: 2_level_1
Coherence,5,5
Correctness,5,5
Relevance,5,2


In [7]:
from rouge import Rouge
import pandas as pd

# Thin wrapper around the `rouge` package's scorer.
def get_rouge_scores(text1, text2):
    """Score ``text1`` against ``text2`` using a freshly constructed ``Rouge``.

    Both arguments are forwarded, in this order, to ``Rouge.get_scores``. The
    calls in this notebook pass the model response first and the expected
    output second.

    Returns:
        Whatever ``Rouge.get_scores`` returns for the pair, unchanged.
    """
    return Rouge().get_scores(text1, text2)


# ROUGE for each response, with the expected output as the comparison text.
eval_1_rouge = get_rouge_scores(before_fine_tuning, expected_output)
eval_2_rouge = get_rouge_scores(after_fine_tuning, expected_output)

# Only the first entry of each result is read, and only its "f" value per metric.
before_scores, after_scores = eval_1_rouge[0], eval_2_rouge[0]

# One table row per ROUGE variant, with both responses side by side.
rouge_rows = [
    {
        "Metric": f"{metric} (F-Score)",
        "before_fine_tuning": before_scores[metric]["f"],
        "after_fine_tuning": after_scores[metric]["f"],
    }
    for metric in ("rouge-1", "rouge-2", "rouge-l")
]

# Index by metric name and bold the higher score in every row.
rouge_scores_out = pd.DataFrame(rouge_rows).set_index("Metric")
rouge_scores_out_styled = rouge_scores_out.style.apply(highlight_max, axis=1)

rouge_scores_out_styled


Unnamed: 0_level_0,before_fine_tuning,after_fine_tuning
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
rouge-1 (F-Score),0.18,0.142857
rouge-2 (F-Score),0.101449,0.088235
rouge-l (F-Score),0.18,0.142857


In [9]:
from bert_score import BERTScorer
# BERTScorer configured for English text.
scorer = BERTScorer(lang="en")

# Score the before-fine-tuning response against the expected output.
# P1, R1, F1_1 hold Precision, Recall, and F1 Score respectively.
P1, R1, F1_1 = scorer.score([before_fine_tuning], [expected_output])

# Score the after-fine-tuning response against the expected output.
# P2, R2, F2_2 hold Precision, Recall, and F1 Score respectively.
P2, R2, F2_2 = scorer.score([after_fine_tuning], [expected_output])

# Each result covers a single text pair, so only element 0 of each is printed.
# Rows are listed in print order: F1, then Precision, then Recall.
report_rows = (
    ("before_fine_tuning F1 Score:", F1_1),
    ("after_fine_tuning 2 F1 Score:", F2_2),
    ("before_fine_tuning Precision:", P1),
    ("after_fine_tuning 2 Precision:", P2),
    ("before_fine_tuning Recall:", R1),
    ("after_fine_tuning 2 Recall:", R2),
)
for caption, values in report_rows:
    print(caption, values.tolist()[0])

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


before_fine_tuning F1 Score: 0.8097435235977173
after_fine_tuning 2 F1 Score: 0.8154077529907227
before_fine_tuning Precision: 0.7407108545303345
after_fine_tuning 2 Precision: 0.7454288005828857
before_fine_tuning Recall: 0.8929659724235535
after_fine_tuning 2 Recall: 0.8998866677284241
