In [12]:
import pandas as pd
import numpy as np
import nltk
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [10]:
# One-time fetch of the NLTK data used by the METEOR evaluation below:
#   punkt_tab - tokenizer tables (unpacked under tokenizers/) for word_tokenize
#   wordnet   - lexical database NLTK's METEOR implementation consults for synonym matches
# nltk.download is a no-op when the package is already up to date.
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/laksh.jain/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/laksh.jain/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Read the saved responses of both models (base first, tuned second).
# The later cells only rely on the 'description' and 'model_output' columns.
base_df, tuned_df = (
    pd.read_csv(csv_name)
    for csv_name in ("base_model_response.csv", "tuned_model_response.csv")
)

In [4]:
# Visual sanity check of the base-model responses (pandas elides the middle rows).
base_df

Unnamed: 0,image,description,model_output
0,<PIL.JpegImagePlugin.JpegImageFile image mode=...,A beautiful woman holding a tennis racquet sta...,A blind person is guiding a blind person thro...
1,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"A snowboarder stands in an ice-covered area, h...","In a picturesque winter landscape, a blind pe..."
2,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"A left hand holds an iced donut in a bag, its ...",A person is holding a glazed donut in their h...
3,<PIL.JpegImagePlugin.JpegImageFile image mode=...,A baby is eating in a room of the house. She t...,"The boy is sitting in a crib, his head tilted..."
4,<PIL.JpegImagePlugin.JpegImageFile image mode=...,A group of older men enjoy a meal at a wooden ...,"In this image, we see a group of people sitti..."
...,...,...,...
145,<PIL.JpegImagePlugin.JpegImageFile image mode=...,She holds a camera near a large sink and count...,"In a kitchen, a man is taking a picture of th..."
146,<PIL.JpegImagePlugin.JpegImageFile image mode=...,A group of people travel down a street on moto...,"In the bustling street of a bustling city, a ..."
147,<PIL.JpegImagePlugin.JpegImageFile image mode=...,An umbrella sits on top of a sandy beach near ...,"The image depicts a serene beach scene, chara..."
148,<PIL.JpegImagePlugin.JpegImageFile image mode=...,A food worker in a blue apron holds a cell pho...,A blind person is using a bright light to see...


In [17]:
# Visual sanity check of the tuned-model responses (pandas elides the middle rows).
tuned_df

Unnamed: 0,image,description,model_output
0,<PIL.JpegImagePlugin.JpegImageFile image mode=...,A beautiful woman holding a tennis racquet sta...,A woman is playing tennis on a court. She is ...
1,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"A snowboarder stands in an ice-covered area, h...","A person is standing on a snowy hill, wearing..."
2,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"A left hand holds an iced donut in a bag, its ...",A person is holding a chocolate éclair in the...
3,<PIL.JpegImagePlugin.JpegImageFile image mode=...,A baby is eating in a room of the house. She t...,A boy is eating pizza in a crib. He is holdin...
4,<PIL.JpegImagePlugin.JpegImageFile image mode=...,A group of older men enjoy a meal at a wooden ...,A man is drinking a beer from a glass held in...
...,...,...,...
145,<PIL.JpegImagePlugin.JpegImageFile image mode=...,She holds a camera near a large sink and count...,A kitchen sink is being washed by a person wh...
146,<PIL.JpegImagePlugin.JpegImageFile image mode=...,A group of people travel down a street on moto...,"A busy street scene is captured in a vivid, p..."
147,<PIL.JpegImagePlugin.JpegImageFile image mode=...,An umbrella sits on top of a sandy beach near ...,"A person is standing on the beach, holding a ..."
148,<PIL.JpegImagePlugin.JpegImageFile image mode=...,A food worker in a blue apron holds a cell pho...,A blind person looks at the camera through a ...


In [18]:
def compute_meteor_scores(df):
    """Return the mean sentence-level METEOR score over all rows of ``df``.

    For every row, ``description`` is the single reference and
    ``model_output`` is the candidate. Both are tokenised with
    ``word_tokenize`` and scored with ``meteor_score``; the per-row scores
    are then averaged.

    Args:
        df: DataFrame providing ``description`` and ``model_output`` columns.

    Returns:
        float: arithmetic mean of the per-row METEOR scores, or ``nan`` when
        ``df`` has no rows (there is nothing to average).

    Raises:
        KeyError: if either required column is missing.
    """
    def _as_text(value):
        # pd.read_csv represents an empty field as NaN (a float), which the
        # tokeniser cannot handle; treat any missing cell as empty text.
        return "" if pd.isna(value) else str(value)

    # Walk the two columns in lockstep instead of materialising a Series per
    # row with iterrows(); only these two columns are needed.
    row_pairs = zip(df['description'], df['model_output'])
    scores = [
        meteor_score([word_tokenize(_as_text(ref))], word_tokenize(_as_text(cand)))
        for ref, cand in tqdm(row_pairs, total=len(df), desc="Computing METEOR scores")
    ]

    if not scores:
        # Avoid ZeroDivisionError on an empty frame; NaN makes the absence of
        # data visible in the report instead of faking a score.
        return float('nan')
    return sum(scores) / len(scores)

In [19]:
# Mean METEOR per model: the base responses are scored first, then the tuned ones.
base_meteor, tuned_meteor = (
    compute_meteor_scores(responses) for responses in (base_df, tuned_df)
)

Computing METEOR scores:   0%|          | 0/150 [00:00<?, ?it/s]

Computing METEOR scores: 100%|██████████| 150/150 [00:00<00:00, 192.86it/s]
Computing METEOR scores: 100%|██████████| 150/150 [00:00<00:00, 311.45it/s]


In [20]:
# Report both METEOR averages, one per line, rounded to four decimals.
print(
    "Evaluation Results:",
    f"Base Model METEOR Score: {base_meteor:.4f}",
    f"Tuned Model METEOR Score: {tuned_meteor:.4f}",
    sep="\n",
)

Evaluation Results:
Base Model METEOR Score: 0.2521
Tuned Model METEOR Score: 0.2441


In [24]:
from evaluate import load
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
def compute_rouge_scores(df):
    """Compute ROUGE for the model outputs in ``df`` against their descriptions.

    Loads the ``rouge`` metric through ``evaluate.load`` and calls its
    ``compute`` with ``model_output`` as predictions and ``description`` as
    references (same row order as ``df``).

    Note: loading the metric raises ``ImportError`` unless its ``rouge_score``
    and ``absl`` dependencies are installed in the kernel's environment.

    Args:
        df: DataFrame providing ``description`` and ``model_output`` columns.

    Returns:
        Whatever ``rouge.compute`` returns; the reporting cell indexes it
        with ``'rouge1'``, ``'rouge2'``, ``'rougeL'`` and ``'rougeLsum'``.

    Raises:
        KeyError: if either required column is missing.
    """
    def _as_text(value):
        # pd.read_csv represents an empty field as NaN (a float); the metric
        # expects strings, so treat any missing cell as empty text.
        return "" if pd.isna(value) else str(value)

    rouge = evaluate.load('rouge')
    candidates = [_as_text(cell) for cell in df['model_output']]
    references = [_as_text(cell) for cell in df['description']]
    return rouge.compute(predictions=candidates, references=references)

In [28]:
# ROUGE results per model: the base responses are scored first, then the tuned ones.
base_rouge, tuned_rouge = (
    compute_rouge_scores(responses) for responses in (base_df, tuned_df)
)

Downloading builder script: 6.27kB [00:00, 7.69MB/s]


ImportError: To be able to use evaluate-metric/rouge, you need to install the following dependencies['absl', 'rouge_score'] using 'pip install # Here to have a nice missing dependency error message early on rouge_score' for instance'

In [None]:
# Side-by-side ROUGE report: for each variant the base score is printed
# first, then the tuned score, each rounded to four decimals.
print(
    "ROUGE SCORES",
    f"Base Model ROUGE-1: {base_rouge['rouge1']:.4f}",
    f"Tuned Model ROUGE-1: {tuned_rouge['rouge1']:.4f}",
    f"Base Model ROUGE-2: {base_rouge['rouge2']:.4f}",
    f"Tuned Model ROUGE-2: {tuned_rouge['rouge2']:.4f}",
    f"Base Model ROUGE-L: {base_rouge['rougeL']:.4f}",
    f"Tuned Model ROUGE-L: {tuned_rouge['rougeL']:.4f}",
    f"Base Model ROUGE-Lsum: {base_rouge['rougeLsum']:.4f}",
    f"Tuned Model ROUGE-Lsum: {tuned_rouge['rougeLsum']:.4f}",
    sep="\n",
)
