In [1]:
!pip install -q nltk bert-score
!pip install -q rouge-metric

In [6]:
import pandas as pd

# Read the previously saved summaries from the working directory into a DataFrame.
generated_summaries_soap = pd.read_csv("soap_generated_summaries.csv")

# Sanity check: print the first rows (plain-text rendering via print()).
print(generated_summaries_soap.head())

                                               input  \
0  Good afternoon, champ, how you holding up? Goo...   
1  What brings you in here today? Hi, I'm um, I'm...   
2  Do you have any known allergies to medications...   
3  How may I help you today? Yeah I've had, a fev...   
4  It sounds like that you're experiencing some c...   

                                              output  \
0  Subjective:\n- Symptoms: Lower back pain, radi...   
1  Subjective:\n- Presenting with dry cough for 1...   
2  Subjective:\n- No known allergies to medicatio...   
3  Subjective:\n- Fever and dry cough started 4 d...   
4  Subjective:\n- Presenting with chest pain for ...   

                                   generated_summary  
0  The 75-year-old man has been experiencing lowe...  
1  , but as it went along, the smell started comi...  
2  The defendant is charged with murder in connec...  
3  that you could have contracted something, poss...  
4  into this further and try to find out what's g..

In [None]:
import pandas as pd
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score
from rouge_metric import PyRouge

# -----------------------------
# 1. Text Cleaning
# -----------------------------
def clean_text(text):
    """Normalize a text value before metric computation.

    Any value that is not a ``str`` (``None``, ``NaN``, ``pd.NA``, numbers,
    lists, arrays, ...) is treated as missing and mapped to the empty string.

    Args:
        text: Value to clean; normally a string read from a DataFrame cell.

    Returns:
        str: Lower-cased text with leading/trailing whitespace removed and
        every internal run of whitespace collapsed to a single space, or
        ``""`` when ``text`` is not a string.
    """
    # Type check first: every missing-value marker is a non-str, and calling
    # pd.isna() on a list/array returns an array whose truth value is
    # ambiguous (it would raise ValueError inside an `or` expression).
    if not isinstance(text, str):
        return ""
    # split() with no argument already discards leading/trailing whitespace
    # and splits on whitespace runs, so no separate strip() is needed.
    return ' '.join(text.lower().split())

# -----------------------------
# 2. BLEU Computation
# -----------------------------
def compute_bleu_scores(reference, candidate):
    """Return sentence-level BLEU-1 and BLEU-2 for one reference/candidate pair.

    Both texts are tokenized by whitespace. Scores are computed with NLTK's
    ``sentence_bleu`` using ``SmoothingFunction().method1`` and scaled to
    percentages.

    Args:
        reference: Ground-truth text.
        candidate: Generated text to evaluate.

    Returns:
        tuple: ``(bleu1, bleu2)`` multiplied by 100. If anything raises while
        scoring, the error and the first 50 characters of both texts are
        printed and ``(0.0, 0.0)`` is returned.
    """
    try:
        smoother = SmoothingFunction().method1
        # sentence_bleu expects a list of tokenized references
        ref_token_lists = [reference.split()]
        cand_tokens = candidate.split()

        # First weights tuple: unigrams only (BLEU-1).
        # Second weights tuple: unigrams + bigrams, equally weighted (BLEU-2).
        weight_schemes = ((1.0, 0, 0, 0), (0.5, 0.5, 0, 0))
        bleu1, bleu2 = [
            sentence_bleu(
                ref_token_lists,
                cand_tokens,
                weights=scheme,
                smoothing_function=smoother
            )
            for scheme in weight_schemes
        ]

        # Convert to percentages
        return bleu1 * 100, bleu2 * 100
    except Exception as err:
        print(f"BLEU Error: {err}")
        print(f"Reference: '{reference[:50]}...'")
        print(f"Candidate: '{candidate[:50]}...'")
        return 0.0, 0.0

# -----------------------------
# 3. ROUGE-L Computation
# -----------------------------
def compute_rouge_l(reference, candidate):
    """Return the ROUGE-L F-score of ``candidate`` against ``reference``.

    A ``PyRouge`` scorer is built on every call (ROUGE-1/2 and ROUGE-L
    enabled; ROUGE-W, ROUGE-S and ROUGE-SU disabled) and only the
    ``'rouge-l'`` F value of its result is used.

    Args:
        reference: Ground-truth text.
        candidate: Generated text to evaluate.

    Returns:
        float: ROUGE-L F-score multiplied by 100. If evaluation raises, the
        error and the first 50 characters of both texts are printed and
        ``0.0`` is returned.
    """
    scorer_options = {
        'rouge_n': (1, 2),
        'rouge_l': True,
        'rouge_w': False,
        'rouge_w_weight': 1.2,
        'rouge_s': False,
        'rouge_su': False,
        'skip_gap': 4,
    }
    scorer = PyRouge(**scorer_options)
    try:
        # One hypothesis, paired with a list holding its single reference
        result = scorer.evaluate([candidate], [[reference]])
        f_measure = result['rouge-l']['f']
        return f_measure * 100  # Convert to percentage
    except Exception as err:
        print(f"ROUGE-L Error: {err}")
        print(f"Reference: '{reference[:50]}...'")
        print(f"Candidate: '{candidate[:50]}...'")
        return 0.0

# -----------------------------
# 4. BERTScore in Batches
# -----------------------------
def compute_bert_score_batched(references, candidates, batch_size=32):
    """Compute BERTScore precision/recall/F1 in batches, as percentages.

    The scoring model is loaded once and reused for every batch, instead of
    being reloaded by a fresh ``bert_score.score`` call per batch.

    Args:
        references: List of ground-truth texts.
        candidates: List of generated texts, aligned with ``references``.
        batch_size: Number of pairs sent to the scorer at a time.

    Returns:
        tuple: ``(all_P, all_R, all_F1)`` - three lists of floats (scores
        multiplied by 100) with one entry per reference. Pairs belonging to
        a batch that fails are filled with ``0.0``; if the model itself
        cannot be loaded, every entry is ``0.0``. Errors are printed, not
        raised.
    """
    all_P, all_R, all_F1 = [], [], []
    total = len(references)
    if total == 0:
        # Nothing to score: skip loading the model entirely.
        return all_P, all_R, all_F1

    # Local import keeps this block self-contained; bert_score is already a
    # dependency of this notebook.
    from bert_score import BERTScorer

    try:
        # Build the scorer once so the model is loaded a single time.
        scorer = BERTScorer(lang="en")
    except Exception as e:
        # Same best-effort policy as per-batch failures: report and zero-fill.
        print(f"BERTScore Error while loading model: {e}")
        return [0.0] * total, [0.0] * total, [0.0] * total

    for i in range(0, total, batch_size):
        batch_refs = references[i:i + batch_size]
        batch_cands = candidates[i:i + batch_size]
        try:
            P, R, F1 = scorer.score(batch_cands, batch_refs, verbose=False)
            # Convert to percentages
            all_P.extend([p * 100 for p in P.tolist()])
            all_R.extend([r * 100 for r in R.tolist()])
            all_F1.extend([f * 100 for f in F1.tolist()])
        except Exception as e:
            print(f"BERTScore Error in batch {i}: {e}")
            # Keep output aligned with the inputs by zero-filling the batch.
            batch_len = len(batch_refs)
            all_P.extend([0.0] * batch_len)
            all_R.extend([0.0] * batch_len)
            all_F1.extend([0.0] * batch_len)
    return all_P, all_R, all_F1

# -----------------------------
# 5. Main Evaluation Function
# -----------------------------
def evaluate_summaries(df):
    """Score every row of ``df`` and attach the metrics as new columns.

    The reference text is read from the ``'output'`` column and the generated
    text from ``'generated_summary'``; both are passed through ``clean_text``.
    Rows where either cleaned text is empty get ``0.0`` for BLEU-1, BLEU-2 and
    ROUGE-L. BERTScore is computed over all rows in batches.

    Args:
        df: DataFrame with ``'output'`` and ``'generated_summary'`` columns.

    Returns:
        The same ``df`` object, modified in place with the added columns
        ``bleu1``, ``bleu2``, ``rouge_l``, ``bert_p``, ``bert_r`` and
        ``bert_f1`` (all in percent). Means and standard deviations are
        printed as a side effect.
    """
    print("Computing BLEU and ROUGE-L scores...")

    # Clean each column once; the same lists feed the per-row metrics and BERTScore.
    references = [clean_text(text) for text in df['output'].tolist()]
    candidates = [clean_text(text) for text in df['generated_summary'].tolist()]

    bleu1_scores, bleu2_scores, rouge_l_scores = [], [], []
    row_pairs = zip(references, candidates)
    for reference, candidate in tqdm(row_pairs, total=len(df), desc="Processing Rows", unit="row"):
        if reference and candidate:
            bleu1, bleu2 = compute_bleu_scores(reference, candidate)
            rouge_l = compute_rouge_l(reference, candidate)
        else:
            # Nothing to compare against: report the row and score it as zero
            print(f"Empty text - Reference: '{reference}', Candidate: '{candidate}'")
            bleu1, bleu2, rouge_l = 0.0, 0.0, 0.0
        bleu1_scores.append(bleu1)
        bleu2_scores.append(bleu2)
        rouge_l_scores.append(rouge_l)

    print("\nComputing BERTScore...")
    bert_p, bert_r, bert_f1 = compute_bert_score_batched(references, candidates)

    # Attach all scores to the DataFrame (insertion order fixes column order)
    metric_columns = {
        'bleu1': bleu1_scores,
        'bleu2': bleu2_scores,
        'rouge_l': rouge_l_scores,
        'bert_p': bert_p,
        'bert_r': bert_r,
        'bert_f1': bert_f1,
    }
    for column, values in metric_columns.items():
        df[column] = values

    # Report the mean of every metric
    print("\nEvaluation Metrics (in percentages):")
    mean_report = (
        ("Average BLEU-1:", 'bleu1'),
        ("Average BLEU-2:", 'bleu2'),
        ("Average ROUGE-L:", 'rouge_l'),
        ("Average BERT P:", 'bert_p'),
        ("Average BERT R:", 'bert_r'),
        ("Average BERT F1:", 'bert_f1'),
    )
    for label, column in mean_report:
        print(label, df[column].mean(), "%")

    # Report the standard deviation of every metric
    print("\nStandard Deviations (in percentages):")
    std_report = (
        ("BLEU-1 Std:", 'bleu1'),
        ("BLEU-2 Std:", 'bleu2'),
        ("ROUGE-L Std:", 'rouge_l'),
        ("BERT F1 Std:", 'bert_f1'),
        ("BERT P Std:", 'bert_p'),
        ("BERT R Std:", 'bert_r'),
    )
    for label, column in std_report:
        print(label, df[column].std(), "%")

    return df

# -----------------------------
# 6. Load CSV & Evaluate
# -----------------------------
# Driver: load the summaries, score them, and persist the scored table.
if __name__ == "__main__":
    # Read the summaries CSV from the working directory; evaluate_summaries
    # reads its 'output' (reference) and 'generated_summary' (candidate) columns.
    df_summaries = pd.read_csv("soap_generated_summaries.csv")
    print(df_summaries.head())

    # Compute all metrics. evaluate_summaries adds the metric columns to the
    # frame it is given and returns that same frame.
    results_df = evaluate_summaries(df_summaries)

    # Write the frame, including the metric columns, without the index column.
    results_df.to_csv("soap_evaluation_results.csv", index=False)
    print("\nResults saved to 'soap_evaluation_results.csv'")
