In [1]:
import pandas as pd

# Load the saved CSV file
generated_summaries = pd.read_csv("MimicCPTFsummarization_results.csv")

# Verify the data
print(generated_summaries.head(2))

          note_id                                              input  \
0  16002318-DS-17  <SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...   
1   15638884-DS-4  <SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...   

                                              target  input_tokens  \
0  This is a ___ yo F admitted to the hospital af...          1195   
1  Mr. ___ is a ___ yo man with CAD with prior MI...          3496   

   target_tokens                                       reduced_text  \
0             75  <|begin_of_text|><SEX> F <SERVICE> SURGERY <AL...   
1           1143  <|begin_of_text|><SEX> M <SERVICE> MEDICINE <A...   

                                   importance_scores  \
0  [0.010251283645629883, 0.010246990248560905, 0...   
1  [0.0059814453125, 0.005979984533041716, 0.0059...   

                                   generated_summary  
0  Follow up within 30 days after discharge via p...  
1  0.42 FETAL MEASURES - Fetal heart rate normal,...  


In [2]:
# Take a random sample of 25 rows
sampled_20 = generated_summaries.sample(n=25, random_state=42).reset_index(drop=True)


In [3]:
from huggingface_hub import login

# Use your Hugging Face token
login("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")

In [4]:
# Install Hugging Face Transformers
!pip install transformers
!pip install sacremoses
!pip install bitsandbytes accelerate



In [5]:
import torch
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# ---------------------------------------
# 1. Load your sampled dataset
# ---------------------------------------
df = sampled_20.copy()

# ---------------------------------------
# 2. Load Gemma 3 1B Instruction-Tuned
# ---------------------------------------
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.float16,
    llm_int8_enable_fp32_cpu_offload=True
)

model_id = "google/gemma-3-1b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
).eval()


In [6]:
def build_prompt(reference, generated):
    return f"""
You are a clinical expert. Evaluate the quality of the generated summary by comparing it to the reference summary. Rate each criterion from 1 (poor) to 5 (excellent):

1. Main Idea Retention: Does the generated summary capture the main ideas of the reference?
2. Coherence: Is the generated summary logically and grammatically coherent?
3. Factual Consistency: Is the generated summary factually correct compared to the reference?

---

Reference Summary:
{reference}

Generated Summary:
{generated}

Respond in this format:
Main Idea Retention: #
Coherence: #
Factual Consistency: #
Overall Comments: ...
"""

import re

# Store results
results = []

# Inference loop
for _, row in tqdm(df.iterrows(), total=len(df)):
    prompt = build_prompt(row["target"], row["generated_summary"])
    
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=256)
    
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Parse numeric scores using regex
    matches = re.findall(r'(?i)(Main Idea Retention|Coherence|Factual Consistency):\s*(\d)', decoded_output)
    scores = {k.strip().lower().replace(" ", "_"): int(v) for k, v in matches}
    
    # Append the original fields and scores
    scores.update({
        "note_id": row.get("note_id", ""),
        "input": row["input"],
        "target": row["target"],
        "generated_summary": row["generated_summary"]
    })
    results.append(scores)

# Convert to DataFrame
judge_df = pd.DataFrame(results)

# Add average score column
judge_df["avg_score"] = judge_df[
    ["main_idea_retention", "coherence", "factual_consistency"]
].mean(axis=1)

# Save results if needed
judge_df.to_csv("llm_judge_scores_gemma.csv", index=False)


100%|██████████| 25/25 [14:20<00:00, 34.43s/it]


In [7]:
# Formatting function
def fmt(x):
    return f"{x.mean():.2f} $\\pm$ {x.std():.2f}"

# Just print the formatted values only (no model/dataset name)
def print_latex_values_only(df):
    mi = fmt(df["main_idea_retention"])
    co = fmt(df["coherence"])
    fa = fmt(df["factual_consistency"])
    avg = fmt(df["avg_score"])
    print(f"{mi} & {co} & {fa} & {avg} \\\\")

# 🟢 Run it on your current results
print_latex_values_only(judge_df)


3.93 $\pm$ 1.33 & 3.38 $\pm$ 0.77 & 3.77 $\pm$ 1.09 & 3.73 $\pm$ 0.87 \\
