In [1]:
import pandas as pd

# Read the previously generated summaries back from disk
generated_summaries = pd.read_csv(filepath_or_buffer="generated_summaries_100.csv")

# Sanity check: show the first two rows
print(generated_summaries.iloc[:2])

          note_id                                              input  \
0  16002318-DS-17  <SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...   
1   15638884-DS-4  <SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...   

                                              target  input_tokens  \
0  This is a ___ yo F admitted to the hospital af...          1195   
1  Mr. ___ is a ___ yo man with CAD with prior MI...          3496   

   target_tokens                                  generated_summary  
0             75  A 60-year-old male with a history of hypertens...  
1           1143  A 45-year-old female with metastatic breast ca...  


In [2]:
# Draw a reproducible (seeded) random subset of 50 rows and renumber it from 0
sampled_50 = (
    generated_summaries
    .sample(n=50, random_state=42)
    .reset_index(drop=True)
)


In [3]:
import os
from getpass import getpass

from huggingface_hub import login

# Never embed an access token in the notebook: anyone who can read the file
# (or its git history) can use it. The token that used to be hardcoded here
# must be treated as leaked and revoked in the Hugging Face account settings.
#
# The token is read from the HF_TOKEN environment variable; when that is not
# set, it is requested interactively without echoing it to the cell output.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [4]:
# Install Hugging Face Transformers
!pip install transformers
!pip install sacremoses
!pip install bitsandbytes accelerate



In [None]:
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# ---------------------------------------
# 1. Load your sampled dataset
# ---------------------------------------

# Work on a copy so the evaluation columns added later do not leak into `sampled_50`.
df = sampled_50.copy()

# ---------------------------------------
# 2. Load Gemma 3 1B Instruction-Tuned
# ---------------------------------------
# 8-bit (LLM.int8) weight quantization through bitsandbytes.
# Only real BitsAndBytesConfig options are passed: the previous
# `bnb_8bit_compute_dtype` argument does not exist (the compute-dtype option is
# 4-bit only, `bnb_4bit_compute_dtype`) and was silently ignored.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    # Lets modules that `device_map="auto"` places on the CPU stay in fp32.
    llm_int8_enable_fp32_cpu_offload=True
)

model_id = "google/gemma-3-1b-it"

# `trust_remote_code=True` was dropped: it allows arbitrary Python shipped in
# the Hub repository to run locally and is not needed for an architecture that
# transformers implements natively.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
).eval()  # inference only: disables dropout and other train-time behaviour

# ---------------------------------------
# 3. Define prompt
# ---------------------------------------
def create_prompt(input_text, reference_summary, generated_summary):
    """Build the judge prompt for one example.

    The prompt shows the source text, the reference summary and the generated
    summary, lists four evaluation criteria, and ends with the answer template
    the judge is asked to fill in (main ideas, coherence, issues, score).

    Args:
        input_text: Source document the summaries are based on.
        reference_summary: Ground-truth summary.
        generated_summary: Summary under evaluation.

    Returns:
        The complete prompt as a single newline-terminated string.
    """
    prompt_lines = (
        "You are a helpful clinical NLP evaluation assistant.",
        "",
        "Input Text:",
        f"{input_text}",
        "",
        "Reference Summary:",
        f"{reference_summary}",
        "",
        "Generated Summary:",
        f"{generated_summary}",
        "",
        "Evaluate the generated summary using the following criteria:",
        "1. Does it capture the main ideas of the reference summary? (Yes/No)",
        "2. Is it coherent and logically structured? (Yes/No)",
        "3. Are there factual inaccuracies or important omissions? (List any)",
        "4. Rate the summary from 1 to 5 based on how well it captures the reference summary.",
        "",
        "Please give your evaluation in this format:",
        "- Captures main ideas: [Yes/No]",
        "- Coherence: [Yes/No]",
        "- Issues: [Write here or 'None']",
        "- Score: [1-5]",
        "",  # trailing empty entry -> the prompt ends with a newline
    )
    return "\n".join(prompt_lines)

# ---------------------------------------
# 4. Evaluate each example
# ---------------------------------------
# Token budget for the prompt. The previous 1024-token cap was far below the
# size of many prompts (the source notes alone reach several thousand tokens,
# see the `input_tokens` column), and truncation removes tokens from the END of
# the prompt, i.e. exactly the summaries and the answer-format instructions.
# NOTE(review): confirm 8192 fits the model's context window and GPU memory.
MAX_PROMPT_TOKENS = 8192
MAX_NEW_TOKENS = 150

# NOTE(review): the model is instruction-tuned; wrapping the prompt with
# `tokenizer.apply_chat_template` would probably improve format adherence.

results = []
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    prompt = create_prompt(row['input'], row['target'], row['generated_summary'])
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_PROMPT_TOKENS,
    ).to(model.device)

    prompt_len = inputs["input_ids"].shape[1]
    if prompt_len >= MAX_PROMPT_TOKENS:
        # The tail of the prompt (summaries / instructions) may have been cut.
        tqdm.write(f"Warning: prompt for {row['note_id']} hit the {MAX_PROMPT_TOKENS}-token limit")

    with torch.inference_mode():
        # Greedy decoding so the judge's verdicts are reproducible run to run.
        outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, do_sample=False)

    # For a causal LM the returned sequence starts with the prompt tokens.
    # Keep only the newly generated part; otherwise the stored "evaluation"
    # begins with the echoed answer template ("- Captures main ideas: [Yes/No]")
    # and downstream parsing reads the template instead of the judgment.
    generated_ids = outputs[0][prompt_len:]
    eval_response = tokenizer.decode(generated_ids, skip_special_tokens=True)
    results.append(eval_response)

df['evaluation_gemma'] = results

# ---------------------------------------
# 5. Parse LLM judgments into structured flags
# ---------------------------------------
import re


def _parse_yes_no(line):
    """Return True/False for an unambiguous yes/no answer, None otherwise.

    A line holding both words (the "[yes/no]" placeholder of the answer
    template) or neither is treated as unparsable.
    """
    has_yes = re.search(r"\byes\b", line) is not None
    has_no = re.search(r"\bno\b", line) is not None
    if has_yes == has_no:
        return None
    return has_yes


def _parse_issues(line):
    """Return True when the judge wrote 'none', False when it listed issues.

    Returns None for the untouched template placeholder ("[write here or 'none']").
    """
    if "write here" in line:
        return None
    return "none" in line


def _parse_score(line):
    """Return the 1-5 score that opens the line, or None.

    Leading punctuation/whitespace is skipped so "[4]", "**4**" and "4/5" all
    yield 4. A range such as the "[1-5]" placeholder is rejected.
    """
    match = re.match(r"\W*([1-5])\b(?!\s*-\s*\d)", line)
    return int(match.group(1)) if match else None


def _first_valid(text, label, parse):
    """Parse the first occurrence of `label` in `text` whose line is a real answer.

    Scanning every occurrence (instead of only the first) skips a copy of the
    answer template that may precede the actual judgment.

    Raises:
        ValueError: if no occurrence of `label` carries a parsable value.
    """
    for match in re.finditer(re.escape(label) + r"([^\n]*)", text):
        value = parse(match.group(1))
        if value is not None:
            return value
    raise ValueError(f"no parsable '{label}' field")


main_ideas, coherence, factuality, scores = [], [], [], []

# `evaluation` rather than `eval`, so the builtin is not shadowed for the rest
# of the notebook.
for evaluation in df['evaluation_gemma']:
    try:
        text = evaluation.lower()
        main_idea = _first_valid(text, "captures main ideas:", _parse_yes_no)
        coherence_flag = _first_valid(text, "coherence:", _parse_yes_no)
        factual = _first_valid(text, "issues:", _parse_issues)
        score_val = _first_valid(text, "score:", _parse_score)
    except (AttributeError, ValueError):
        # Best effort, as before: an unparsable response is reported and
        # recorded as all-negative (these rows therefore pull the metrics down).
        print("Error parsing:", evaluation)
        main_idea, coherence_flag, factual, score_val = False, False, False, 0

    main_ideas.append(main_idea)
    coherence.append(coherence_flag)
    factuality.append(factual)
    scores.append(score_val)

df['main_idea'] = main_ideas
df['coherence'] = coherence
df['factuality'] = factuality
df['score'] = scores

# ---------------------------------------
# 6. Compute table-ready metrics
# ---------------------------------------
def pct_std(arr):
    """Format mean and standard deviation of `arr` as percentages.

    The values are cast to float (so booleans count as 0/1), and both the mean
    and NumPy's default `std` are scaled by 100.

    Returns:
        A string of the form "<mean> ± <std>", each with one decimal place.
    """
    values = np.array(arr).astype(float)
    mean_pct = values.mean() * 100
    std_pct = values.std() * 100
    return "{:.1f} ± {:.1f}".format(mean_pct, std_pct)

def score_avg(arr):
    """Format mean and standard deviation of `arr` on its original scale.

    The values are cast to float and summarised with the mean and NumPy's
    default `std`.

    Returns:
        A string of the form "<mean> ± <std>", each with one decimal place.
    """
    values = np.array(arr).astype(float)
    mean_val = values.mean()
    std_val = values.std()
    return "{:.1f} ± {:.1f}".format(mean_val, std_val)

# (label, formatter, column) for every line of the summary table, in print order
metric_rows = (
    ("Main Ideas:", pct_std, 'main_idea'),
    ("Coherence:", pct_std, 'coherence'),
    ("Factuality:", pct_std, 'factuality'),
    ("Avg. Score:", score_avg, 'score'),
)

print("\n📊 Table-Ready Metrics (Gemma 3 as Judge):")
for metric_label, formatter, column in metric_rows:
    print(metric_label, formatter(df[column]))

# ---------------------------------------
# 7. Save final results (optional)
# ---------------------------------------
# Write the evaluated DataFrame to CSV in the working directory;
# index=False keeps the row index out of the file.
df.to_csv("sampled_50_evaluated_gemma3.csv", index=False)


 70%|███████   | 35/50 [15:46<05:39, 22.62s/it]