In [1]:
# Step 1: Import Required Libraries
import os
import sys
import logging
import torch
import evaluate
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


  from pandas.core import (


In [2]:
# Step 2: Set Up Logging
# Configure the root logger: INFO and above, with a timestamped
# "time - level - logger name - message" layout.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)

# Logger used by the evaluation helpers defined in this notebook.
logger = logging.getLogger(__name__)

In [3]:
# Step 3: Define ROUGE Evaluation Class
class RougeEvaluation:
    """Small wrapper that holds a loaded "rouge" metric and scores summaries with it."""

    def __init__(self):
        # Load the metric once per instance so repeated scoring reuses it.
        self.rouge_metric = evaluate.load("rouge")

    def compute_rouge_metric(self, generated_summary, reference_summary):
        """Score generated summaries against their references.

        Parameters:
            generated_summary: Predictions forwarded to the metric as `predictions`.
            reference_summary: Ground-truth texts forwarded to the metric as `references`.

        Returns:
            Whatever the metric's `compute` returns, with aggregation and
            stemming both switched on.
        """
        fixed_options = {"use_aggregator": True, "use_stemmer": True}
        return self.rouge_metric.compute(
            predictions=generated_summary,
            references=reference_summary,
            **fixed_options,
        )

In [10]:
# Step 4: Define Evaluation Function
def evaluation_rouge(model, tokenizer, data, generation_config):
    """Generate a summary for every dialogue in `data` and score the results with ROUGE.

    Parameters:
        model: Seq2seq model exposing `.to()`, `.eval()`, `.train()` and `.generate()`.
        tokenizer: Tokenizer paired with `model`; called on the prompt and used to decode.
        data: Object indexable by "dialogue" and "summary", yielding paired sequences of text.
        generation_config (dict): Keyword arguments forwarded to `model.generate`.

    Returns:
        The result of `RougeEvaluation.compute_rouge_metric`, extended with a
        "gen_len" entry: the mean number of whitespace-separated tokens in the
        generated summaries (0 when no summaries were generated).
    """
    # Ensure device compatibility
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Extract dialogue and reference summaries
    dialogues = data["dialogue"]
    human_summaries = data["summary"]

    # Generate summaries
    model_summaries = []
    prefix = "Summarize the following dialogue:\n###\n"
    suffix = "\n### Summary: "

    # Force inference mode so dropout cannot perturb generation when the caller
    # hands in a model that is still in training mode; restore the mode afterwards.
    was_training = getattr(model, "training", False)
    model.eval()
    try:
        for dialogue in dialogues:
            input_text = prefix + dialogue + suffix
            inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
            outputs = model.generate(**inputs, **generation_config)
            # Only the first returned sequence is kept for each dialogue.
            output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            model_summaries.append(output_text)
    finally:
        if was_training:
            model.train()

    # Log progress
    logger.info("Evaluating summaries...")

    # Compute ROUGE scores
    rouge_evaluator = RougeEvaluation()
    results = rouge_evaluator.compute_rouge_metric(model_summaries, human_summaries)

    # Calculate average length of generated summaries (whitespace tokens)
    generated_lengths = [len(summary.split()) for summary in model_summaries]
    average_gen_len = sum(generated_lengths) / len(generated_lengths) if generated_lengths else 0
    results["gen_len"] = average_gen_len

    return results


In [11]:
import pandas as pd

def evaluate_models(model_names, data, generation_config):
    """
    Evaluate multiple models on the same dataset and return a DataFrame with ROUGE scores.

    Parameters:
        model_names (list): List of model names to evaluate; each is passed to
            `from_pretrained` for both the model and its tokenizer.
        data (Dataset): HuggingFace Dataset object containing 'dialogue' and 'summary'.
        generation_config (dict): Configuration for text generation.

    Returns:
        pd.DataFrame: One row per model, indexed by "model_name", holding whatever
        `evaluation_rouge` returned for it. When `model_names` is empty, an empty
        DataFrame whose index is named "model_name" is returned.
    """
    results_list = []

    for model_name in model_names:
        logger.info(f"Evaluating model: {model_name}")

        # Load model and tokenizer
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Run evaluation
        results = evaluation_rouge(model, tokenizer, data, generation_config)
        results["model_name"] = model_name  # Add model name to the results

        # Append results to list
        results_list.append(results)

        # Release this checkpoint before the next one is loaded so two models
        # are not held in memory at the same time.
        del model, tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Nothing evaluated: there is no "model_name" column to promote to the index,
    # so build the empty, correctly-indexed frame directly.
    if not results_list:
        return pd.DataFrame(index=pd.Index([], name="model_name"))

    # Convert results to a DataFrame, using model names as the index
    return pd.DataFrame(results_list).set_index("model_name")


In [16]:
# Checkpoint identifiers to compare; each one is handed to `from_pretrained`
# for both the model and its tokenizer.
model_names = [
    "facebook/bart-large-cnn",
    "google/pegasus-xsum",
    "Mia2024/CS5100TextSummarization",
]


In [17]:
# Toy evaluation set: two dialogues, each paired positionally with its
# reference summary.
data = Dataset.from_dict(
    dict(
        dialogue=[
            "Hello, how can I help you today? Sure, I can help you book a flight to New York.",
            "I would like to schedule a doctor's appointment. Is there availability tomorrow morning?",
        ],
        summary=[
            "Customer requested help booking a flight to New York.",
            "User wants to schedule a doctor's appointment for tomorrow morning.",
        ],
    )
)


In [18]:
# Keyword arguments forwarded unchanged to `model.generate` for every model.
generation_config = dict(
    max_length=50,    # Maximum length of the generated summary
    num_beams=4,      # Use beam search
    do_sample=False,  # Deterministic output
)


In [19]:
# Run every listed model over the example data and collect one row of
# scores per model, then print the resulting table.
results_df = evaluate_models(
    model_names=model_names,
    data=data,
    generation_config=generation_config,
)
print(results_df)


12/05/2024 11:21:07 - INFO - __main__ - Evaluating model: facebook/bart-large-cnn
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
12/05/2024 11:21:20 - INFO - __main__ - Evaluating summaries...
12/05/2024 11:21:20 - INFO - absl - Using default tokenizer.
12/05/2024 11:21:20 - INFO - __main__ - Evaluating model: google/pegasus-xsum
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
12/05/2024 11:21:30 - INFO - __main__ - Evaluating summaries...
12/05/2024 11:21:30 - INFO - absl - Using default tokenizer.
12/05/2024 11:21:30 - INFO - __main__ - Evaluating model: Mia2024/CS5100TextSummarization
  warn("The installed 

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
12/05/2024 11:21:38 - INFO - __main__ - Evaluating summaries...
12/05/2024 11:21:38 - INFO - absl - Using default tokenizer.


                                   rouge1    rouge2    rougeL  rougeLsum  \
model_name                                                                 
facebook/bart-large-cnn          0.359091  0.305094  0.359091   0.359091   
google/pegasus-xsum              0.109788  0.000000  0.072751   0.072751   
Mia2024/CS5100TextSummarization  0.470186  0.368187  0.470186   0.470186   

                                 gen_len  
model_name                                
facebook/bart-large-cnn             34.0  
google/pegasus-xsum                 16.5  
Mia2024/CS5100TextSummarization     26.5  
