In [1]:
!pip install -q nltk bert-score
!pip install -q rouge-metric

In [None]:
import pandas as pd

# Read the previously saved summaries from "summarized_output.csv" into a
# DataFrame.
generated_summaries_soap_complete = pd.read_csv("summarized_output.csv")

# Print the first rows (DataFrame.head) as a quick sanity check that the file
# loaded as expected.
print(generated_summaries_soap_complete.head())

In [None]:
import pandas as pd
import time
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score
from rouge_metric import PyRouge
import numpy as np

# -----------------------------------------------------
# 1. Text Cleaning
# -----------------------------------------------------
def clean_text(text):
    """Normalize a text cell before it is scored.

    The text is lower-cased and every run of whitespace (including leading
    and trailing whitespace) is collapsed to a single space.

    Args:
        text: Value to normalize. Anything that is not a ``str`` -- ``None``,
            ``NaN``, numbers, list-likes, ... -- is treated as missing.

    Returns:
        The normalized string, or ``""`` when ``text`` is not a string.
    """
    # Test the type first. A missing-value check such as ``pd.isna`` returns
    # an array for list-like input, and using that array in a boolean
    # expression raises ValueError instead of yielding "".
    if not isinstance(text, str):
        return ""
    # ``split()`` without arguments already discards leading/trailing
    # whitespace, so no separate ``strip()`` is needed.
    return " ".join(text.lower().split())

# -----------------------------------------------------
# 2. BLEU Computation
# -----------------------------------------------------
def compute_bleu_scores(reference, candidate):
    """Return sentence-level BLEU-1 and BLEU-2 for one reference/candidate pair.

    Both strings are tokenized by splitting on whitespace, and ``method1``
    of NLTK's ``SmoothingFunction`` is passed as the smoothing function.

    Args:
        reference: Reference text.
        candidate: Generated text to score against ``reference``.

    Returns:
        Tuple ``(bleu1, bleu2)``, each score multiplied by 100.
    """
    # sentence_bleu expects a list of tokenized references and one tokenized
    # hypothesis; tokenize once and reuse for both n-gram settings.
    reference_tokens = [reference.split()]
    candidate_tokens = candidate.split()
    smoother = SmoothingFunction().method1

    # Unigram-only weights, then equal unigram/bigram weights.
    weight_sets = ((1.0, 0, 0, 0), (0.5, 0.5, 0, 0))
    bleu1, bleu2 = (
        sentence_bleu(
            reference_tokens,
            candidate_tokens,
            weights=weights,
            smoothing_function=smoother,
        )
        for weights in weight_sets
    )
    return bleu1 * 100, bleu2 * 100

# -----------------------------------------------------
# 3. ROUGE-L Computation
# -----------------------------------------------------
def compute_rouge_l(reference, candidate):
    """Return the ROUGE-L ``f`` score for one reference/candidate pair.

    Args:
        reference: Reference text.
        candidate: Generated text to score against ``reference``.

    Returns:
        The ``'f'`` entry of the ``'rouge-l'`` result, multiplied by 100.
    """
    scorer = PyRouge(rouge_n=(1, 2), rouge_l=True, skip_gap=4)
    # evaluate() is given one hypothesis and, for it, a list holding the
    # single reference.
    hypotheses = [candidate]
    reference_sets = [[reference]]
    results = scorer.evaluate(hypotheses, reference_sets)
    rouge_l_f = results['rouge-l']['f']
    return rouge_l_f * 100

# -----------------------------------------------------
# 4. BERTScore in Batches
# -----------------------------------------------------
def compute_bert_score_batched(references, candidates, batch_size=32):
    """Compute BERTScore precision, recall and F1 for aligned text pairs.

    The pairs are scored in chunks of ``batch_size`` so only one chunk is
    handed to the scorer at a time. A single ``BERTScorer`` is created up
    front and reused for every chunk; calling the module-level
    ``bert_score.score`` once per chunk would set the model up again on each
    call.

    Args:
        references: Sequence of reference texts.
        candidates: Sequence of generated texts, aligned with ``references``.
        batch_size: Number of pairs passed to the scorer per chunk.

    Returns:
        Tuple ``(precisions, recalls, f1s)`` of lists with one value per
        pair, each multiplied by 100. Three empty lists for empty input.

    Raises:
        ValueError: If ``references`` and ``candidates`` differ in length.
    """
    if len(references) != len(candidates):
        # Slicing both sequences with the same indices would otherwise pair
        # texts incorrectly or silently drop the surplus candidates.
        raise ValueError(
            "references and candidates must have the same length "
            f"(got {len(references)} and {len(candidates)})"
        )
    if len(references) == 0:
        # Nothing to score; skip creating the scorer altogether.
        return [], [], []

    # Imported here so the function carries its own dependency; the package
    # is the same one the file already imports ``score`` from.
    from bert_score import BERTScorer

    # Same language setting the per-chunk ``score(..., lang="en")`` call used.
    scorer = BERTScorer(lang="en")

    all_P, all_R, all_F1 = [], [], []
    for start in range(0, len(references), batch_size):
        batch_refs = references[start:start + batch_size]
        batch_cands = candidates[start:start + batch_size]
        P, R, F1 = scorer.score(batch_cands, batch_refs, verbose=False)
        all_P.extend(p * 100 for p in P.tolist())
        all_R.extend(r * 100 for r in R.tolist())
        all_F1.extend(f * 100 for f in F1.tolist())
    return all_P, all_R, all_F1

# -----------------------------------------------------
# 5. Main Evaluation Function
# -----------------------------------------------------
def evaluate_summaries(df):
    """Score each generated summary in ``df`` against its reference text.

    Reads the ``output`` (reference) and ``generated_summary`` (candidate)
    columns, cleans both with ``clean_text`` and computes BLEU-1, BLEU-2,
    ROUGE-L and BERTScore precision/recall/F1 per row. Rows where either
    cleaned text is empty are reported and get 0.0 for BLEU and ROUGE-L.

    The per-row scores are written to ``df`` in place as the columns
    ``bleu1``, ``bleu2``, ``rouge_l``, ``bert_p``, ``bert_r`` and
    ``bert_f1``; the mean and standard deviation of every metric are printed.

    Args:
        df: DataFrame with ``output`` and ``generated_summary`` columns.

    Returns:
        The same ``df`` object, with the metric columns added.
    """
    bleu1_scores, bleu2_scores, rouge_l_scores = [], [], []
    # Cleaned texts are kept so the BERTScore pass can reuse them.
    references, candidates = [], []
    print("Computing BLEU and ROUGE-L scores...")

    text_pairs = zip(df['output'], df['generated_summary'])
    for raw_reference, raw_candidate in tqdm(
        text_pairs, total=len(df), desc="Processing Rows", unit="row"
    ):
        reference = clean_text(raw_reference)
        candidate = clean_text(raw_candidate)
        references.append(reference)
        candidates.append(candidate)

        if reference and candidate:
            bleu1, bleu2 = compute_bleu_scores(reference, candidate)
            rouge_l = compute_rouge_l(reference, candidate)
        else:
            # Either side is empty after cleaning: report it and score zero.
            print(f"Empty text - Reference: '{reference}', Candidate: '{candidate}'")
            bleu1 = bleu2 = rouge_l = 0.0

        bleu1_scores.append(bleu1)
        bleu2_scores.append(bleu2)
        rouge_l_scores.append(rouge_l)

    print("\nComputing BERTScore...")
    bert_p, bert_r, bert_f1 = compute_bert_score_batched(references, candidates)

    # Attach the per-row scores to the caller's DataFrame (mutated in place).
    metric_columns = {
        'bleu1': bleu1_scores,
        'bleu2': bleu2_scores,
        'rouge_l': rouge_l_scores,
        'bert_p': bert_p,
        'bert_r': bert_r,
        'bert_f1': bert_f1,
    }
    for column, values in metric_columns.items():
        df[column] = values

    # (mean label, std label, values) for every reported metric, in print order.
    report = [
        ("Average BLEU-1:", "BLEU-1 Std:", bleu1_scores),
        ("Average BLEU-2:", "BLEU-2 Std:", bleu2_scores),
        ("Average ROUGE-L:", "ROUGE-L Std:", rouge_l_scores),
        ("Average BERT P:", "BERT P Std:", bert_p),
        ("Average BERT R:", "BERT R Std:", bert_r),
        ("Average BERT F1:", "BERT F1 Std:", bert_f1),
    ]

    print("\nEvaluation Metrics (in percentages):")
    for mean_label, _, values in report:
        print(mean_label, np.mean(values), "%")

    print("\nStandard Deviations (in percentages):")
    for _, std_label, values in report:
        print(std_label, np.std(values), "%")

    return df

# -----------------------------------------------------
# 6. Load CSV & Evaluate
# -----------------------------------------------------
if __name__ == "__main__":
    # Input must provide the 'output' and 'generated_summary' columns that
    # evaluate_summaries reads.
    input_path = "summarized_output.csv"
    output_path = "soap_evaluation_results.csv"

    df_summaries = pd.read_csv(input_path)
    # Show the first rows as a quick look at what was loaded.
    print(df_summaries.head())

    # Compute all metrics; the returned frame carries the per-row scores.
    results_df = evaluate_summaries(df_summaries)

    # Persist the scored rows without the index column.
    results_df.to_csv(output_path, index=False)
    print("\nResults saved to 'soap_evaluation_results.csv'")
