In [1]:
import pandas as pd

# S3 location of the sample CSV
bucket_name = 'mimicivliza'  # Replace with your bucket name
mimic_iv_bhc = 's3://' + bucket_name + '/sample_data_100.csv'

# Read the CSV straight from the S3 URL into a DataFrame
mimic_iv_bhc_100 = pd.read_csv(mimic_iv_bhc)

# Preview the first two rows
mimic_iv_bhc_100.head(2)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Unnamed: 0,note_id,input,target,input_tokens,target_tokens
0,16002318-DS-17,<SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...,This is a ___ yo F admitted to the hospital af...,1195,75
1,15638884-DS-4,<SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...,Mr. ___ is a ___ yo man with CAD with prior MI...,3496,1143


In [2]:
import pandas as pd

# Load the previously saved summarization output
generated_summaries = pd.read_csv("summarization_output.csv")

# Verify the data. A bare last expression uses the notebook's rich table
# rendering; print() on a wide frame wraps columns into hard-to-read
# backslash-continued text chunks.
generated_summaries.head(2)

          note_id                                              input  \
0  16002318-DS-17  <SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...   
1   15638884-DS-4  <SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...   

                                        reduced_text  \
0  <|begin_of_text|><SEX> F <SERVICE> SURGERY <AL...   
1  <|begin_of_text|><SEX> M <SERVICE> MEDICINE <A...   

                                            entities  \
0  {'PROBLEM': ['101', '7 pound weight loss', 'a ...   
1  {'PROBLEM': ['+', '-', '1 cm area', 'a " cyst ...   

                                            problems  \
0  ['101', '7 pound weight loss', 'a fever', 'a l...   
1  ['+', '-', '1 cm area', 'a " cyst "', 'a 2cm d...   

                                          treatments  \
0  ['abdominal exercises', 'albuterol sulfate', '...   
1  ['a bankart repair', 'a nicotine patch', 'a st...   

                                               tests  \
0         ['b12', 'bmi', 'calcium', 'physical exam'

In [3]:
# Merge on 'note_id' to bring in the 'target' column from mimic_iv_bhc_100.
# Any 'target' column left over from a previous run of this cell is dropped
# first: without that, re-running the cell would yield 'target_x'/'target_y'
# and the column selection below would raise a KeyError.
generated_summaries = generated_summaries.drop(columns=['target'], errors='ignore').merge(
    mimic_iv_bhc_100[['note_id', 'target']],
    on='note_id',
    how='left',
    # Fail loudly if 'note_id' is duplicated in the reference table instead of
    # silently duplicating rows in the merged frame.
    validate='many_to_one',
)

# Keep only the desired columns
generated_summaries = generated_summaries[['note_id', 'input', 'generated_summary', 'target']]

# Display the result
generated_summaries.head()


Unnamed: 0,note_id,input,generated_summary,target
0,16002318-DS-17,<SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...,For a 51-year-old female patient who underwent...,This is a ___ yo F admitted to the hospital af...
1,15638884-DS-4,<SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...,"The patient presents with painless jaundice, i...",Mr. ___ is a ___ yo man with CAD with prior MI...
2,12435705-DS-14,<SEX> M <SERVICE> MEDICINE <ALLERGIES> ibuprof...,A 25-year-old male patient presents with sympt...,Mr. ___ is a ___ w/ Ph+ve ALL on dasatanib and...
3,12413577-DS-4,<SEX> F <SERVICE> OBSTETRICS/GYNECOLOGY <ALLER...,The patient presents with symptoms of vulvovag...,"On ___, Ms. ___ was admitted to the gynecology..."
4,17967161-DS-29,<SEX> M <SERVICE> SURGERY <ALLERGIES> lisinopr...,The patient presents with symptoms of chronic ...,Mr. ___ underwent an angiogram on ___ which sh...


In [14]:
# Take a reproducible random sample of 20 rows (fixed seed) and renumber the index from 0
sampled_20 = generated_summaries.sample(n=20, random_state=42).reset_index(drop=True)

In [5]:
import os

from huggingface_hub import login

# Never hardcode the Hugging Face token in the notebook: read it from the
# HF_TOKEN environment variable. When the variable is unset (or empty) the
# token is passed as None, which makes login() ask for it interactively.
login(token=os.environ.get("HF_TOKEN") or None)

In [6]:
# Install Hugging Face Transformers
!pip install -q transformers
!pip install -q sacremoses
!pip install -q bitsandbytes accelerate

In [12]:
!pip install tqdm
from tqdm import tqdm




In [15]:
import torch
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# ---------------------------------------
# 1. Load your sampled dataset
# ---------------------------------------
# Work on a copy so the sampled frame itself is never modified.
df = sampled_20.copy()

# ---------------------------------------
# 2. Load Gemma 3 1B Instruction-Tuned
# ---------------------------------------
# 8-bit (LLM.int8) quantization has no compute-dtype setting; only the 4-bit
# path has one (`bnb_4bit_compute_dtype`). The former `bnb_8bit_compute_dtype`
# argument is not a BitsAndBytesConfig option and had no effect, so it is
# dropped here.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

model_id = "google/gemma-3-1b-it"

# Gemma 3 is implemented inside the transformers library, so there is no need
# to pass trust_remote_code=True (which would allow arbitrary Python shipped in
# the model repository to run locally).
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
).eval()


In [16]:
def build_prompt(reference, generated):
    """Assemble the LLM-as-judge prompt for one summary pair.

    Args:
        reference: Reference summary text, inserted under "Reference Summary:".
        generated: Generated summary text, inserted under "Generated Summary:".

    Returns:
        The prompt string. It begins and ends with a newline and asks for
        1-5 ratings of main idea retention, coherence and factual consistency
        in a fixed "Label: #" response format.
    """
    prompt_lines = [
        "",  # the prompt starts with a blank line
        "You are a clinical expert. Evaluate the quality of the generated summary by comparing it to the reference summary. Rate each criterion from 1 (poor) to 5 (excellent):",
        "",
        "1. Main Idea Retention: Does the generated summary capture the main ideas of the reference?",
        "2. Coherence: Is the generated summary logically and grammatically coherent?",
        "3. Factual Consistency: Is the generated summary factually correct compared to the reference?",
        "",
        "---",
        "",
        "Reference Summary:",
        f"{reference}",
        "",
        "Generated Summary:",
        f"{generated}",
        "",
        "Respond in this format:",
        "Main Idea Retention: #",
        "Coherence: #",
        "Factual Consistency: #",
        "Overall Comments: ...",
        "",  # ... and ends with a trailing newline
    ]
    return "\n".join(prompt_lines)

import re

# Column names of the three rubric scores requested from the judge model.
SCORE_COLUMNS = ["main_idea_retention", "coherence", "factual_consistency"]

# Matches "Coherence: 4" as well as markdown variants such as "**Coherence:** 4".
# Only a single digit 1-5 is accepted (the rubric range); the lookahead rejects
# the leading digit of a longer number such as "10".
SCORE_PATTERN = re.compile(
    r'(?i)(Main Idea Retention|Coherence|Factual Consistency)\**\s*:\s*\**\s*([1-5])(?!\d)'
)


def _parse_scores(text):
    """Extract the rubric scores from the judge's answer.

    Args:
        text: Text generated by the judge model.

    Returns:
        Dict mapping snake_case criterion names (a subset of SCORE_COLUMNS) to
        int scores. Criteria that are absent from the text are omitted. If a
        criterion appears more than once, its first occurrence is kept.
    """
    parsed = {}
    for label, digit in SCORE_PATTERN.findall(text):
        parsed.setdefault(label.strip().lower().replace(" ", "_"), int(digit))
    return parsed


# Store results
results = []

# Seed torch so that a re-run yields the same judgements if generation samples.
torch.manual_seed(42)

# Inference loop
for _, row in tqdm(df.iterrows(), total=len(df)):
    prompt = build_prompt(row["target"], row["generated_summary"])

    # Put the inputs on the model's device instead of hard-coding "cuda";
    # the model was loaded with device_map="auto".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=256)

    # generate() returns prompt + continuation; decode only the continuation so
    # the score parser never sees the prompt (which embeds both summaries).
    prompt_len = inputs["input_ids"].shape[-1]
    decoded_output = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    # Parse numeric scores using regex
    scores = _parse_scores(decoded_output)

    # Append the original fields and scores
    scores.update({
        "note_id": row.get("note_id", ""),
        "input": row["input"],
        "target": row["target"],
        "generated_summary": row["generated_summary"]
    })
    results.append(scores)

# Convert to DataFrame
judge_df = pd.DataFrame(results)

# A criterion the judge never produced in any row would otherwise be missing
# as a column and make the average below raise a KeyError; add it as all-NaN.
for col in SCORE_COLUMNS:
    if col not in judge_df.columns:
        judge_df[col] = float("nan")

# Add average score column (row-wise mean over the scores that were parsed)
judge_df["avg_score"] = judge_df[SCORE_COLUMNS].mean(axis=1)

# Save results if needed
judge_df.to_csv("llm_judge_scores_gemma.csv", index=False)


100%|██████████| 20/20 [12:06<00:00, 36.32s/it]


In [None]:
# Formatting helper for LaTeX table cells
def fmt(x):
    """Format `x` as "mean $\\pm$ std" with two decimals each.

    Args:
        x: Object providing `.mean()` and `.std()` (e.g. a pandas Series).

    Returns:
        A string such as "3.50 $\\pm$ 0.71", ready to paste into a LaTeX table.
    """
    center = x.mean()
    spread = x.std()
    return f"{center:.2f} $\\pm$ {spread:.2f}"

# Emit only the formatted cells of a table row (no model/dataset name)
def print_latex_values_only(df):
    """Print one LaTeX table row with the formatted score statistics.

    The row holds, in order, the `fmt` output for the columns
    "main_idea_retention", "coherence", "factual_consistency" and
    "avg_score", separated by " & " and terminated by a LaTeX row break.

    Args:
        df: DataFrame containing the four score columns listed above.
    """
    ordered_columns = (
        "main_idea_retention",
        "coherence",
        "factual_consistency",
        "avg_score",
    )
    cells = [fmt(df[name]) for name in ordered_columns]
    print(" & ".join(cells) + " \\\\")

# Print the LaTeX row for the judge scores collected in judge_df
print_latex_values_only(judge_df)