In [None]:
import pandas as pd

# Load the summaries written by the generation step.
generated_summaries = pd.read_csv("Soap_ContextualOutput.csv")

# Fail fast if a column read by the judging loop ("input", "output",
# "generated_summary") is absent, rather than hitting a KeyError only after
# the model has been downloaded and loaded.
_required_columns = {"input", "output", "generated_summary"}
_missing_columns = _required_columns - set(generated_summaries.columns)
if _missing_columns:
    raise KeyError(
        f"Soap_ContextualOutput.csv is missing required columns: {sorted(_missing_columns)}"
    )

# Verify the data
print(generated_summaries.head(2))

In [None]:
# Take a reproducible random sample of up to 20 rows. The size is capped at
# the number of available rows because DataFrame.sample raises ValueError
# when n exceeds the population and sampling is without replacement.
sample_size = min(20, len(generated_summaries))
sampled_20 = generated_summaries.sample(n=sample_size, random_state=42).reset_index(drop=True)

In [None]:
# Install Hugging Face Transformers
!pip install -q huggingface_hub
!pip install -q transformers
!pip install -q sacremoses
!pip install -q bitsandbytes accelerate

In [None]:
import torch
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# ---------------------------------------
# 1. Load your sampled dataset
# ---------------------------------------
# Work on a copy so the sampled frame itself is left untouched.
df = sampled_20.copy()

# ---------------------------------------
# 2. Load Gemma 3 1B Instruction-Tuned
# ---------------------------------------
# 8-bit (LLM.int8) weight quantization through bitsandbytes.
# BitsAndBytesConfig has no `bnb_8bit_compute_dtype` parameter (a compute
# dtype is configurable only for 4-bit loading), so none is passed here.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    # Lets modules that device_map places on the CPU be kept in fp32.
    llm_int8_enable_fp32_cpu_offload=True,
)

model_id = "google/gemma-3-1b-it"

# trust_remote_code is intentionally left at its default (False): this
# checkpoint is implemented inside transformers itself, so there is no need
# to allow code from the Hub repository to execute locally.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
    attn_implementation="eager",
).eval()  # inference only


In [None]:
def build_prompt(reference, generated):
    """Return the judge prompt comparing a generated summary to its reference.

    Args:
        reference: Reference (gold) summary text.
        generated: Model-generated summary text to be rated.

    Returns:
        The instruction text asking for 1-5 ratings on main idea retention,
        coherence and factual consistency, followed by the required answer
        format. The text starts and ends with a newline.
    """
    template = """
You are a clinical expert. Evaluate the quality of the generated summary by comparing it to the reference summary. Rate each criterion from 1 (poor) to 5 (excellent):

1. Main Idea Retention: Does the generated summary capture the main ideas of the reference?
2. Coherence: Is the generated summary logically and grammatically coherent?
3. Factual Consistency: Is the generated summary factually correct compared to the reference?

---

Reference Summary:
{reference}

Generated Summary:
{generated}

Respond in this format:
Main Idea Retention: #
Coherence: #
Factual Consistency: #
Overall Comments: ...
"""
    # Values are substituted verbatim; braces inside them are not re-interpreted.
    return template.format(reference=reference, generated=generated)

import re

# Score columns produced for every row, in output order.
SCORE_COLUMNS = ["main_idea_retention", "coherence", "factual_consistency"]

# Matches "<criterion>: <rating>", tolerating markdown emphasis around the
# label or the value (e.g. "**Coherence:** 4"). Only single-digit ratings on
# the 1-5 scale requested in the prompt are accepted ("10" is rejected).
_SCORE_PATTERN = re.compile(
    r"(Main Idea Retention|Coherence|Factual Consistency)\**\s*:\s*\**\s*([1-5])(?!\d)",
    re.IGNORECASE,
)


def _parse_scores(text):
    """Extract the judge's ratings from its reply.

    Args:
        text: The judge model's reply (without the prompt).

    Returns:
        A dict with one entry per name in SCORE_COLUMNS. A criterion the
        reply does not rate is NaN, so every result row carries the same
        columns. If a criterion is rated more than once, the first rating
        is kept, so anything the model appends after its answer cannot
        overwrite it.
    """
    scores = dict.fromkeys(SCORE_COLUMNS, float("nan"))
    seen = set()
    for label, value in _SCORE_PATTERN.findall(text):
        key = label.strip().lower().replace(" ", "_")
        if key not in seen:
            seen.add(key)
            scores[key] = int(value)
    return scores


# Store results
results = []

# Inference loop: one judge call per sampled row.
# NOTE(review): the prompt is tokenized as plain text. Gemma instruction-tuned
# checkpoints are documented for use with tokenizer.apply_chat_template;
# consider switching and re-validating the scores.
for _, row in tqdm(df.iterrows(), total=len(df)):
    prompt = build_prompt(row["output"], row["generated_summary"])

    # Follow the model's device instead of assuming CUDA, since
    # device_map="auto" decides where the model is placed.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=256)

    # For decoder-only models generate() returns prompt + continuation.
    # Decode only the new tokens so the score regex cannot match text from
    # the echoed prompt or from the summaries themselves.
    prompt_length = inputs["input_ids"].shape[1]
    decoded_output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Parse numeric scores (NaN for any criterion the judge left out)
    scores = _parse_scores(decoded_output)

    # Append the original fields and scores
    scores.update({
        "input": row["input"],
        "target": row["output"],
        "generated_summary": row["generated_summary"]
    })
    results.append(scores)

# Convert to DataFrame; explicit columns keep the schema stable even when
# there are no rows.
judge_df = pd.DataFrame(results, columns=SCORE_COLUMNS + ["input", "target", "generated_summary"])

# Add average score column (row-wise mean; NaN scores are skipped)
judge_df["avg_score"] = judge_df[SCORE_COLUMNS].mean(axis=1)

# Save results if needed
judge_df.to_csv("llm_judge_scores_gemma.csv", index=False)


In [None]:
def fmt(x):
    """Format a column as LaTeX ``mean $\\pm$ std`` with two decimals each.

    Args:
        x: Object exposing ``mean()`` and ``std()`` (e.g. a pandas Series).

    Returns:
        A string such as ``"2.00 $\\pm$ 1.00"``.
    """
    center = x.mean()
    spread = x.std()
    return f"{center:.2f} $\\pm$ {spread:.2f}"

def print_latex_values_only(df):
    """Print one LaTeX table row of ``mean $\\pm$ std`` cells for the judge scores.

    Only the formatted values are printed (no model or dataset name), in the
    order: main idea retention, coherence, factual consistency, average score.
    Cells are separated by `` & `` and the row ends with a LaTeX line break.

    Args:
        df: DataFrame holding the four score columns listed below.
    """
    score_columns = (
        "main_idea_retention",
        "coherence",
        "factual_consistency",
        "avg_score",
    )
    cells = [fmt(df[column]) for column in score_columns]
    print(" & ".join(cells) + " \\\\")

# Print the LaTeX row (mean $\pm$ std per criterion and for the average) for the judge scores in judge_df.
print_latex_values_only(judge_df)