In [1]:
# Install the metric libraries used below.
# `%pip` (not `!pip`) installs into the environment of the running kernel, and
# the versions are pinned to the ones this notebook was last executed with so
# that a fresh "Restart & Run All" reproduces the same scores.
%pip install nltk==3.9.1 bert-score==0.3.13
%pip install rouge-metric==1.0.1

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
Installing collected packages: nltk, bert-score
Successfully installed bert-score-0.3.13 nltk-3.9.1
Collecting rouge-metric
  Downloading rouge_metric-1.0.1-py3-none-any.whl.metadata (9.5 kB)
Downloading rouge_metric-1.0.1-py3-none-any.whl (151 kB)
Installing collected packages: rouge-metric
Successfully installed rouge-metric-1.0.1


In [2]:
import pandas as pd

# Read the previously saved model outputs from the working directory.
# Per the recorded output, the file holds one row per clinical note with the
# source text (`input`), the reference summary (`target`), token counts, and
# the model's summary (`generated_summary_gemma_3_1b`).
gemma_3_1b_generated_summaries = pd.read_csv("gemma_3_1b_generated_summaries.csv")

# Sanity check: show the first two rows to confirm the expected columns loaded.
print(gemma_3_1b_generated_summaries.head(2))

          note_id                                              input  \
0  16002318-DS-17  <SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...   
1   15638884-DS-4  <SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...   

                                              target  input_tokens  \
0  This is a ___ yo F admitted to the hospital af...          1195   
1  Mr. ___ is a ___ yo man with CAD with prior MI...          3496   

   target_tokens                       generated_summary_gemma_3_1b  
0             75  A ___ woman with a history of obesity, multipl...  
1           1143  A 55-year-old male with a past myocardial infa...  


In [3]:
from tqdm import tqdm
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score
from rouge_metric import PyRouge  # Or use HuggingFace version if needed

def clean_text(text):
    """Normalise a summary string before computing text-overlap metrics.

    The text is lower-cased and every run of whitespace (including leading and
    trailing whitespace, tabs and newlines) is collapsed to a single space.

    Args:
        text: The value to clean. Anything that is not a ``str`` (``None``,
            ``NaN``, ``pd.NA``, numbers, lists, arrays, ...) is treated as
            missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    # One isinstance check covers None/NaN/NA as well (none of them are str),
    # and - unlike evaluating pd.isna() first - it cannot raise
    # "truth value of an array is ambiguous" when a list or array slips in.
    if not isinstance(text, str):
        return ""
    # str.split() with no arguments already ignores leading/trailing whitespace,
    # so no separate strip() is needed.
    return ' '.join(text.lower().split())

def compute_bleu_scores(reference, candidate):
    """Compute smoothed sentence-level BLEU-1 and BLEU-2 as percentages.

    Both texts are tokenised on whitespace. BLEU-1 puts all weight on
    unigrams; BLEU-2 weights unigrams and bigrams equally. NLTK's ``method1``
    smoothing is applied to both.

    Args:
        reference: Reference summary text.
        candidate: Generated summary text.

    Returns:
        A ``(bleu1, bleu2)`` tuple scaled to 0-100. If anything goes wrong the
        error is printed and ``(0.0, 0.0)`` is returned instead of raising.
    """
    ngram_weights = ((1.0, 0, 0, 0), (0.5, 0.5, 0, 0))
    try:
        smoother = SmoothingFunction().method1
        # sentence_bleu expects a list of tokenised references and one
        # tokenised hypothesis.
        ref_tokens = [reference.split()]
        cand_tokens = candidate.split()
        bleu1, bleu2 = (
            sentence_bleu(ref_tokens, cand_tokens, weights=w, smoothing_function=smoother) * 100
            for w in ngram_weights
        )
        return bleu1, bleu2
    except Exception as err:
        print(f"BLEU Error: {err}")
        return 0.0, 0.0

def compute_rouge_l(reference, candidate):
    """Compute the ROUGE-L F-score between one candidate and one reference.

    Args:
        reference: Reference summary text.
        candidate: Generated summary text.

    Returns:
        The ROUGE-L F-score scaled to 0-100. If evaluation fails the error is
        printed and ``0.0`` is returned instead of raising.
    """
    # The scorer is built outside the try block, so a construction failure
    # propagates to the caller rather than being reported as a 0.0 score.
    scorer = PyRouge(
        rouge_n=(1, 2),
        rouge_l=True,
        rouge_w=False,
        rouge_w_weight=1.2,
        rouge_s=False,
        rouge_su=False,
        skip_gap=4,
    )
    try:
        # One hypothesis, paired with a list holding its single reference.
        result = scorer.evaluate([candidate], [[reference]])
        rouge_l_f = result['rouge-l']['f']
        return rouge_l_f * 100
    except Exception as err:
        print(f"ROUGE-L Error: {err}")
        return 0.0

def compute_bert_score_batched(references, candidates, batch_size=32):
    """Compute BERTScore precision/recall/F1 for aligned text pairs, in chunks.

    The English default model is loaded once and reused for every chunk.
    Calling ``bert_score.score`` per chunk would reload the model each time.

    Args:
        references: Sequence of reference texts.
        candidates: Sequence of generated texts, aligned with ``references``.
        batch_size: Number of pairs scored per chunk. A failure only zeroes
            the chunk it occurred in.

    Returns:
        A ``(precision, recall, f1)`` tuple of lists, one value per pair,
        scaled to 0-100. Pairs in a chunk that failed to score (or all pairs,
        if the model could not be loaded) are reported as ``0.0``; the error
        is printed rather than raised.
    """
    total = len(references)
    if total == 0:
        # Nothing to score - avoid loading the model at all.
        return [], [], []

    try:
        # Imported locally so this function is self-contained; bert_score is
        # already a dependency of this notebook.
        from bert_score import BERTScorer
        scorer = BERTScorer(lang="en")
    except Exception as e:
        print(f"BERTScore Error while loading model: {e}")
        return [0.0] * total, [0.0] * total, [0.0] * total

    all_P, all_R, all_F1 = [], [], []
    for i in range(0, total, batch_size):
        refs = references[i:i + batch_size]
        cands = candidates[i:i + batch_size]
        try:
            P, R, F1 = scorer.score(cands, refs, verbose=False)
            # Convert all three tensors before touching the result lists so a
            # failure here cannot leave the lists with mismatched lengths.
            p_pct = [p * 100 for p in P.tolist()]
            r_pct = [r * 100 for r in R.tolist()]
            f_pct = [f * 100 for f in F1.tolist()]
        except Exception as e:
            print(f"BERTScore Error in batch {i}: {e}")
            p_pct = [0.0] * len(refs)
            r_pct = [0.0] * len(refs)
            f_pct = [0.0] * len(refs)
        all_P.extend(p_pct)
        all_R.extend(r_pct)
        all_F1.extend(f_pct)
    return all_P, all_R, all_F1

def evaluate_summaries(df, reference_col='target',
                       candidate_col='generated_summary_gemma_3_1b'):
    """Score generated summaries against references and report the averages.

    For every row, BLEU-1, BLEU-2 and ROUGE-L are computed on the cleaned
    texts; rows where either cleaned text is empty get ``0.0`` for these
    three metrics. BERTScore precision/recall/F1 is then computed for all
    rows. Average BLEU-1, BLEU-2, ROUGE-L and BERT F1 are printed.

    Args:
        df: DataFrame holding the reference and generated summaries.
        reference_col: Name of the column with reference summaries.
        candidate_col: Name of the column with generated summaries.

    Returns:
        The same ``df`` object, modified in place with the added columns
        ``bleu1``, ``bleu2``, ``rouge_l``, ``bert_p``, ``bert_r`` and
        ``bert_f1`` (all in percent).

    Raises:
        KeyError: If ``reference_col`` or ``candidate_col`` is not in ``df``.
    """
    missing = [col for col in (reference_col, candidate_col) if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Clean each text once and reuse the result for every metric.
    references = [clean_text(text) for text in df[reference_col]]
    candidates = [clean_text(text) for text in df[candidate_col]]

    bleu1_scores, bleu2_scores, rouge_l_scores = [], [], []
    print("Computing BLEU and ROUGE-L scores...")

    pairs = zip(references, candidates)
    for reference, candidate in tqdm(pairs, total=len(df), desc="Processing Rows"):
        if not reference or not candidate:
            # A missing/blank text cannot be scored; count it as zero overlap.
            bleu1, bleu2, rouge_l = 0.0, 0.0, 0.0
        else:
            bleu1, bleu2 = compute_bleu_scores(reference, candidate)
            rouge_l = compute_rouge_l(reference, candidate)
        bleu1_scores.append(bleu1)
        bleu2_scores.append(bleu2)
        rouge_l_scores.append(rouge_l)

    print("\nComputing BERTScore...")
    bert_p, bert_r, bert_f1 = compute_bert_score_batched(references, candidates)

    # NOTE: the metric columns are written onto the caller's DataFrame.
    df['bleu1'] = bleu1_scores
    df['bleu2'] = bleu2_scores
    df['rouge_l'] = rouge_l_scores
    df['bert_p'] = bert_p
    df['bert_r'] = bert_r
    df['bert_f1'] = bert_f1

    print("\nEvaluation Metrics (in percentages):")
    print("BLEU-1 Avg:", df['bleu1'].mean(), "%")
    print("BLEU-2 Avg:", df['bleu2'].mean(), "%")
    print("ROUGE-L Avg:", df['rouge_l'].mean(), "%")
    print("BERT F1 Avg:", df['bert_f1'].mean(), "%")

    return df

# Load a fresh copy of the generated summaries from disk and score it.
# evaluate_summaries() adds the metric columns to the frame it is given and
# returns that same frame.
# NOTE(review): this variable is spelled "lb" (letter l) while the rest of the
# notebook uses "1b" (digit one) - looks like a typo; confirm nothing else
# references this name before renaming.
gemma_3_lb_generated_summaries = pd.read_csv("gemma_3_1b_generated_summaries.csv")
gemma_3_lb_generated_summaries = evaluate_summaries(gemma_3_lb_generated_summaries)

# Persist the per-row scores next to the original columns (row index omitted).
gemma_3_lb_generated_summaries.to_csv("gemma_3_1b_evaluation_results.csv", index=False)
print("Saved to 'gemma_3_1b_evaluation_results.csv'")


Matplotlib is building the font cache; this may take a moment.


Computing BLEU and ROUGE-L scores...


Processing Rows: 100%|██████████| 100/100 [00:02<00:00, 38.82it/s]



Computing BERTScore...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho


Evaluation Metrics (in percentages):
BLEU-1 Avg: 7.887237138640634 %
BLEU-2 Avg: 3.199287568537055 %
ROUGE-L Avg: 9.84704143609692 %
BERT F1 Avg: 79.77958482503891 %
Saved to 'gemma_3_1b_evaluation_results.csv'
