In [1]:
!pip install nltk bert-score
!pip install rouge-metric

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
Installing collected packages: nltk, bert-score
Successfully installed bert-score-0.3.13 nltk-3.9.1
Collecting rouge-metric
  Downloading rouge_metric-1.0.1-py3-none-any.whl.metadata (9.5 kB)
Downloading rouge_metric-1.0.1-py3-none-any.whl (151 kB)
Installing collected packages: rouge-metric
Successfully installed rouge-metric-1.0.1


In [None]:
import pandas as pd

# Read the saved generated-summaries CSV into a DataFrame
summaries_csv = "Biobart_soap_generated_summaries.csv"
Biobart_generated_summaries = pd.read_csv(summaries_csv)

# Sanity check: show the first two rows
first_rows = Biobart_generated_summaries.head(2)
print(first_rows)

In [None]:
# Spot-check one example: the 'input' text of the row at position 50
sample_input = Biobart_generated_summaries['input'].iloc[50]
print(sample_input)

In [None]:
# Spot-check the same example: the generated summary of the row at position 50
sample_summary = Biobart_generated_summaries['generated_summary'].iloc[50]
print(sample_summary)

In [None]:
from tqdm import tqdm
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score
from rouge_metric import PyRouge

def clean_text(text):
    """Normalize a text field before it is handed to the metrics.

    The value is lowercased and every run of whitespace (including leading
    and trailing whitespace) is collapsed to a single space.

    Args:
        text: The raw cell value. Anything that is not a ``str`` (``None``,
            ``NaN``, numbers, lists, arrays, ...) is treated as missing.

    Returns:
        str: The normalized text, or ``""`` when ``text`` is not a string.
    """
    # The isinstance check alone covers None/NaN/pd.NA (none of them is a str).
    # Calling pd.isna() first is unsafe: for list-like values it returns an
    # array, and using that array in a boolean context raises ValueError.
    if not isinstance(text, str):
        return ""
    # str.split() with no arguments already discards leading/trailing whitespace.
    return ' '.join(text.lower().split())

def compute_bleu_scores(reference, candidate):
    """Score one candidate against one reference with smoothed BLEU-1 and BLEU-2.

    Both strings are tokenized by splitting on whitespace, and NLTK's
    ``SmoothingFunction().method1`` is applied to both scores.

    Args:
        reference: Reference text.
        candidate: Candidate (generated) text.

    Returns:
        tuple: ``(bleu1, bleu2)`` expressed as percentages. If anything raises
        while scoring, the error and the first 50 characters of each text are
        printed and ``(0.0, 0.0)`` is returned.
    """
    # n-gram weight vectors: unigrams only, then unigrams and bigrams equally.
    weight_schemes = ((1.0, 0, 0, 0), (0.5, 0.5, 0, 0))
    try:
        smoother = SmoothingFunction().method1
        ref_tokens = reference.split()
        cand_tokens = candidate.split()
        unigram_bleu, bigram_bleu = [
            sentence_bleu(
                [ref_tokens],
                cand_tokens,
                weights=ngram_weights,
                smoothing_function=smoother,
            )
            for ngram_weights in weight_schemes
        ]
        # Scale the 0-1 scores to percentages.
        return unigram_bleu * 100, bigram_bleu * 100
    except Exception as e:
        print(f"BLEU Error: {e}")
        print(f"Reference: '{reference[:50]}...'")
        print(f"Candidate: '{candidate[:50]}...'")
        return 0.0, 0.0

def compute_rouge_l(reference, candidate):
    """Return the ROUGE-L F-measure of a candidate against a single reference.

    Args:
        reference: Reference text.
        candidate: Candidate (generated) text.

    Returns:
        float: The ``'f'`` value of the ``'rouge-l'`` entry, as a percentage.
        If evaluation raises, the error and the first 50 characters of each
        text are printed and ``0.0`` is returned.
    """
    scorer = PyRouge(
        rouge_n=(1, 2),
        rouge_l=True,
        rouge_w=False,
        rouge_w_weight=1.2,
        rouge_s=False,
        rouge_su=False,
        skip_gap=4,
    )
    try:
        # One hypothesis, scored against a one-element list of references.
        result = scorer.evaluate([candidate], [[reference]])
        rouge_l_pct = result['rouge-l']['f'] * 100
    except Exception as e:
        print(f"ROUGE-L Error: {e}")
        print(f"Reference: '{reference[:50]}...'")
        print(f"Candidate: '{candidate[:50]}...'")
        rouge_l_pct = 0.0
    return rouge_l_pct

def compute_bert_score_batched(references, candidates, batch_size=32):
    """Compute BERTScore precision, recall and F1 for aligned text pairs.

    The pairs are scored in slices of ``batch_size`` so that a failure in one
    slice only zeroes out that slice instead of the whole run.

    Args:
        references: Sequence of reference strings.
        candidates: Sequence of candidate strings, aligned with ``references``.
        batch_size: Number of pairs handed to the scorer per call.

    Returns:
        tuple: Three lists ``(P, R, F1)`` of percentages, one entry per
        reference. Pairs that could not be scored are reported as ``0.0``.
    """
    all_P, all_R, all_F1 = [], [], []
    total = len(references)
    if total == 0:
        # Nothing to score: do not load the model at all.
        return all_P, all_R, all_F1

    try:
        # bert_score.score() rebuilds the tokenizer and model on every call,
        # which would reload the model once per batch. A BERTScorer instance
        # loads them once and is reused for all batches.
        from bert_score import BERTScorer
        scorer = BERTScorer(lang="en")
    except Exception as e:
        # Same best-effort policy as the per-batch handler below: report the
        # problem and fall back to zeros rather than aborting the evaluation.
        print(f"BERTScore Error while loading the model: {e}")
        return [0.0] * total, [0.0] * total, [0.0] * total

    for i in range(0, total, batch_size):
        batch_refs = references[i:i + batch_size]
        batch_cands = candidates[i:i + batch_size]
        try:
            P, R, F1 = scorer.score(batch_cands, batch_refs, verbose=False)
            # Scale the 0-1 scores to percentages.
            all_P.extend(p * 100 for p in P.tolist())
            all_R.extend(r * 100 for r in R.tolist())
            all_F1.extend(f * 100 for f in F1.tolist())
        except Exception as e:
            print(f"BERTScore Error in batch {i}: {e}")
            # Keep the output aligned with the input by zero-filling the batch.
            batch_len = len(batch_refs)
            all_P.extend([0.0] * batch_len)
            all_R.extend([0.0] * batch_len)
            all_F1.extend([0.0] * batch_len)
    return all_P, all_R, all_F1

def evaluate_summaries(df):
    """Score generated summaries against their references and report the results.

    For every row, the ``'output'`` column is used as the reference and the
    ``'generated_summary'`` column as the candidate. BLEU-1, BLEU-2 and
    ROUGE-L are computed per row (rows where either cleaned text is empty get
    0.0 for all three), then BERTScore precision/recall/F1 is computed for
    all rows. The mean and standard deviation of each metric are printed.

    Args:
        df: DataFrame with ``'output'`` and ``'generated_summary'`` columns.

    Returns:
        The same DataFrame object, with the columns ``bleu1``, ``bleu2``,
        ``rouge_l``, ``bert_p``, ``bert_r`` and ``bert_f1`` added in place
        (all values are percentages).

    Raises:
        KeyError: If ``df`` lacks one of the two required columns.
    """
    print("Computing BLEU and ROUGE-L scores...")

    # Clean every text exactly once and reuse the result for all metrics, so
    # BLEU, ROUGE-L and BERTScore are guaranteed to see identical strings.
    references = [clean_text(text) for text in df['output'].tolist()]
    candidates = [clean_text(text) for text in df['generated_summary'].tolist()]

    bleu1_scores, bleu2_scores, rouge_l_scores = [], [], []
    text_pairs = zip(references, candidates)
    for reference, candidate in tqdm(text_pairs, total=len(df),
                                     desc="Processing Rows", unit="row"):
        if reference and candidate:
            bleu1, bleu2 = compute_bleu_scores(reference, candidate)
            rouge_l = compute_rouge_l(reference, candidate)
        else:
            # The n-gram metrics are undefined for empty text; score it as 0.
            print(f"Empty text - Reference: '{reference}', Candidate: '{candidate}'")
            bleu1, bleu2, rouge_l = 0.0, 0.0, 0.0
        bleu1_scores.append(bleu1)
        bleu2_scores.append(bleu2)
        rouge_l_scores.append(rouge_l)

    print("\nComputing BERTScore...")
    bert_p, bert_r, bert_f1 = compute_bert_score_batched(references, candidates)

    # Add all scores to the DataFrame (list assignment is positional, so this
    # works for any index).
    df['bleu1'] = bleu1_scores
    df['bleu2'] = bleu2_scores
    df['rouge_l'] = rouge_l_scores
    df['bert_p'] = bert_p
    df['bert_r'] = bert_r
    df['bert_f1'] = bert_f1

    labels = {
        'bleu1': "BLEU-1",
        'bleu2': "BLEU-2",
        'rouge_l': "ROUGE-L",
        'bert_p': "BERT P",
        'bert_r': "BERT R",
        'bert_f1': "BERT F1",
    }

    # Print evaluation metrics
    print("\nEvaluation Metrics (in percentages):")
    for column in ('bleu1', 'bleu2', 'rouge_l', 'bert_p', 'bert_r', 'bert_f1'):
        print(f"Average {labels[column]}:", df[column].mean(), "%")

    # Print standard deviations (F1 is reported before P and R here)
    print("\nStandard Deviations (in percentages):")
    for column in ('bleu1', 'bleu2', 'rouge_l', 'bert_f1', 'bert_p', 'bert_r'):
        print(f"{labels[column]} Std:", df[column].std(), "%")

    return df

# Score every row, then write the DataFrame (now including the metric columns) to disk
Biobart_generated_summaries = evaluate_summaries(Biobart_generated_summaries)

results_csv = "Biobart_evaluation_results.csv"
Biobart_generated_summaries.to_csv(results_csv, index=False)
print("\nResults saved to 'Biobart_evaluation_results.csv'")
