In [1]:
import pandas as pd

# Load the saved CSV file
generated_summaries = pd.read_csv("generated_summaries_100.csv")

# Verify the data
print(generated_summaries.head(2))

          note_id                                              input  \
0  16002318-DS-17  <SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...   
1   15638884-DS-4  <SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...   

                                              target  input_tokens  \
0  This is a ___ yo F admitted to the hospital af...          1195   
1  Mr. ___ is a ___ yo man with CAD with prior MI...          3496   

   target_tokens                                  generated_summary  
0             75  A 60-year-old male with a history of hypertens...  
1           1143  A 45-year-old female with metastatic breast ca...  


In [3]:
# Take a random sample of 25 rows
sampled_20 = generated_summaries.sample(n=20, random_state=42).reset_index(drop=True)

In [5]:
!pip install -q huggingface_hub

Collecting huggingface_hub
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
Installing collected packages: huggingface_hub
Successfully installed huggingface_hub-0.30.2


In [6]:
from huggingface_hub import login

# Use your Hugging Face token
login("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")

In [7]:
# Install Hugging Face Transformers
!pip install -q huggingface_hub
!pip install -q transformers
!pip install -q sacremoses
!pip install -q bitsandbytes accelerate

In [8]:
import torch
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# ---------------------------------------
# 1. Load your sampled dataset
# ---------------------------------------
df = sampled_20.copy()

# ---------------------------------------
# 2. Load Gemma 3 1B Instruction-Tuned
# ---------------------------------------
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.float16,
    llm_int8_enable_fp32_cpu_offload=True
)

model_id = "google/gemma-3-1b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
).eval()


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/899 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

In [9]:
def build_prompt(reference, generated):
    return f"""
You are a clinical expert. Evaluate the quality of the generated summary by comparing it to the reference summary. Rate each criterion from 1 (poor) to 5 (excellent):

1. Main Idea Retention: Does the generated summary capture the main ideas of the reference?
2. Coherence: Is the generated summary logically and grammatically coherent?
3. Factual Consistency: Is the generated summary factually correct compared to the reference?

---

Reference Summary:
{reference}

Generated Summary:
{generated}

Respond in this format:
Main Idea Retention: #
Coherence: #
Factual Consistency: #
Overall Comments: ...
"""

import re

# Store results
results = []

# Inference loop
for _, row in tqdm(df.iterrows(), total=len(df)):
    prompt = build_prompt(row["target"], row["generated_summary"])
    
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=256)
    
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Parse numeric scores using regex
    matches = re.findall(r'(?i)(Main Idea Retention|Coherence|Factual Consistency):\s*(\d)', decoded_output)
    scores = {k.strip().lower().replace(" ", "_"): int(v) for k, v in matches}
    
    # Append the original fields and scores
    scores.update({
        "note_id": row.get("note_id", ""),
        "input": row["input"],
        "target": row["target"],
        "generated_summary": row["generated_summary"]
    })
    results.append(scores)

# Convert to DataFrame
judge_df = pd.DataFrame(results)

# Add average score column
judge_df["avg_score"] = judge_df[
    ["main_idea_retention", "coherence", "factual_consistency"]
].mean(axis=1)

# Save results if needed
judge_df.to_csv("llm_judge_scores_gemma.csv", index=False)


100%|██████████| 20/20 [13:08<00:00, 39.43s/it]


In [12]:
judge_df.head(20)

Unnamed: 0,note_id,input,target,generated_summary,main_idea_retention,coherence,factual_consistency,avg_score
0,16002586-DS-12,<SEX> M <SERVICE> MEDICINE <ALLERGIES> sulfa d...,"___ M h/o chronic systolic CHF (EF 40-45%), CO...",A 45-year-old woman with metastatic breast can...,,,,
1,11882814-DS-3,<SEX> M <SERVICE> NEUROSURGERY <ALLERGIES> No ...,Pt electively presented and underwent a subocc...,A 60-year-old male underwent suboccipital cran...,5.0,4.0,5.0,4.666667
2,19299068-DS-16,<SEX> F <SERVICE> MEDICINE <ALLERGIES> Tegreto...,"___ w/dCHF, severe pulmonary HTN, interstital ...",A 60-year-old man with a history of hypertensi...,,,,
3,18782032-DS-30,<SEX> F <SERVICE> MEDICINE <ALLERGIES> Penicil...,"___ yo F with Asthma, HTN, OSA and GERD who pr...",A 60-year-old man with a history of hypertensi...,,,,
4,18435530-DS-8,<SEX> F <SERVICE> MEDICINE <ALLERGIES> Patient...,___ year old female with a history of heavy to...,A 60-year-old man with a history of hypertensi...,,,,
5,10641243-DS-9,<SEX> M <SERVICE> SURGERY <ALLERGIES> ___ <ATT...,"Mr. ___ is a ___ year old male, with a PMH sig...",A 60-year-old male presents with progressive s...,,,,
6,10913302-DS-60,<SEX> M <SERVICE> MEDICINE <ALLERGIES> Sulfa (...,"___ with AML s/p unrelated alloSCT c/b GVHD, h...",A 45-year-old woman with metastatic breast can...,5.0,4.0,5.0,4.666667
7,10144359-DS-15,<SEX> M <SERVICE> MEDICINE <ALLERGIES> Bactrim...,TRANSITIONAL ISSUES: -Patient will need contin...,A 60-year-old male with a history of hypertens...,,,,
8,17384661-DS-20,<SEX> M <SERVICE> UROLOGY <ALLERGIES> No Known...,Mr. ___ was admitted to Dr. ___ Urology servic...,The patient underwent cystoscopy and transuret...,1.0,2.0,2.0,1.666667
9,16002318-DS-17,<SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...,This is a ___ yo F admitted to the hospital af...,A 60-year-old male with a history of hypertens...,5.0,4.0,5.0,4.666667


In [14]:
# Add dataset and model labels appropriately
judge_df["dataset"] = "MIMIC-BHC"
judge_df["model"] = "LLaMA 3.2"

# Formatting function
def fmt(x):
    return f"{x.mean():.2f} $\\pm$ {x.std():.2f}"

# LaTeX row printer
def print_latex_row(dataset, model, df):
    mi = fmt(df["main_idea_retention"])
    co = fmt(df["coherence"])
    fa = fmt(df["factual_consistency"])
    avg = fmt(df["avg_score"])
    model_fmt = f"\\textbf{{{model}}}" if "ConTextual" in model else model
    print(f"{dataset} & {model_fmt} & {mi} & {co} & {fa} & {avg} \\\\")

# 🟢 Print LaTeX row for MIMIC-BHC, LLaMA 3.2
print_latex_row("MIMIC-BHC", "LLaMA 3.2", judge_df)



MIMIC-BHC & LLaMA 3.2 & 4.56 $\pm$ 1.33 & 3.78 $\pm$ 0.67 & 3.89 $\pm$ 1.17 & 4.07 $\pm$ 0.95 \\


In [12]:
import torch
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# ---------------------------------------
# 1. Load your sampled dataset
# ---------------------------------------
df = sampled_20.copy()

# ---------------------------------------
# 2. Load Gemma 3 1B Instruction-Tuned
# ---------------------------------------
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.float16,
    llm_int8_enable_fp32_cpu_offload=True
)

model_id = "google/gemma-3-1b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
).eval()

# ---------------------------------------
# 3. Define prompt with clearer instructions
# ---------------------------------------
def create_prompt(input_text, reference_summary, generated_summary):
    # Gemma chat format according to official docs
    return f"""<start_of_turn>user
You are a clinical NLP evaluation assistant. Please evaluate this medical summary.

Reference Summary: {reference_summary}

Generated Summary: {generated_summary}

Evaluate based on:
1. Main ideas captured (Yes/No)
2. Coherence (Yes/No) 
3. Issues/inaccuracies
4. Overall score (1-5)

Format your response EXACTLY like:
- Captures main ideas: Yes
- Coherence: Yes  
- Issues: None
- Score: 4
<end_of_turn>

<start_of_turn>model
"""

# ---------------------------------------
# 4. Evaluate single example to debug
# ---------------------------------------
print("===== DEBUGGING WITH SINGLE EXAMPLE =====")

# First get a sample with a non-empty generated summary
test_row = None
for i, row in df.iterrows():
    if not pd.isna(row['generated_summary']) and row['generated_summary'].strip() != "":
        test_row = row
        break

if test_row is not None:
    print("\nTEST EXAMPLE:")
    print(f"Reference: {test_row['target'][:100]}...")
    print(f"Generated: {test_row['generated_summary'][:100]}...")
    
    # Create prompt
    prompt = create_prompt(test_row['input'], test_row['target'], test_row['generated_summary'])
    
    # Print the prompt for debugging
    print("\nPROMPT SENT TO MODEL:")
    print(prompt[:500] + "...")
    
    # Generate response
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)
    outputs = model.generate(
        **inputs, 
        max_new_tokens=150,
        temperature=0.1,
        repetition_penalty=1.2
    )
    
    # Get full response including prompt
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("\nFULL RESPONSE FROM MODEL:")
    print(full_response)
    
    # Try to extract the model's response portion
    cleaned_response = full_response
    
    # Try various cleaning methods
    print("\nTRYING DIFFERENT EXTRACTION METHODS:")
    
    # Method 1: Split by model tag
    if "<start_of_turn>model" in full_response:
        method1 = full_response.split("<start_of_turn>model", 1)[1].strip()
        print("\nMethod 1 (Split by model tag):")
        print(method1)
    else:
        print("\nMethod 1 failed: No model tag in response")
    
    # Method 2: Look for first pattern match
    method2_match = re.search(r'[Cc]aptures main ideas:\s*(\w+)', full_response)
    if method2_match:
        start_idx = method2_match.start()
        method2 = full_response[start_idx:]
        print("\nMethod 2 (First pattern match):")
        print(method2)
    else:
        print("\nMethod 2 failed: No 'Captures main ideas:' in response")
    
    # Method 3: Extract after user turn
    if "<end_of_turn>" in full_response:
        parts = full_response.split("<end_of_turn>")
        if len(parts) > 1:
            method3 = parts[1].strip()
            print("\nMethod 3 (After user turn):")
            print(method3)
        else:
            print("\nMethod 3 failed: No content after end_of_turn")
    else:
        print("\nMethod 3 failed: No end_of_turn in response")
    
    # Apply all regex patterns to entire response
    print("\nREGEX PATTERN MATCHING ON FULL RESPONSE:")
    
    main_idea_match = re.search(r'[Cc]aptures main ideas:\s*(\w+)', full_response)
    coherence_match = re.search(r'[Cc]oherence:\s*(\w+)', full_response)
    issues_match = re.search(r'[Ii]ssues:\s*(.+?)(?:\n|$)', full_response)
    score_match = re.search(r'[Ss]core:\s*(\d+)', full_response)
    
    if main_idea_match:
        print(f"Found main ideas: '{main_idea_match.group(1)}'")
        print(f"  Is 'yes'? {main_idea_match.group(1).lower() == 'yes'}")
    else:
        print("No main ideas match found")
        
    if coherence_match:
        print(f"Found coherence: '{coherence_match.group(1)}'")
        print(f"  Is 'yes'? {coherence_match.group(1).lower() == 'yes'}")
    else:
        print("No coherence match found")
        
    if issues_match:
        print(f"Found issues: '{issues_match.group(1)}'")
        print(f"  Contains 'none'? {'none' in issues_match.group(1).lower()}")
    else:
        print("No issues match found")
        
    if score_match:
        print(f"Found score: '{score_match.group(1)}'")
        try:
            score = int(score_match.group(1))
            print(f"  Parsed score: {score}")
        except:
            print("  Could not parse score as integer")
    else:
        print("No score match found")

# ---------------------------------------
# 5. Run the full evaluation with more debugging
# ---------------------------------------
print("\n===== RUNNING FULL EVALUATION WITH MORE DEBUGGING =====")
results = []
parsed_results = {
    'main_idea': [],
    'coherence': [],
    'factuality': [],
    'score': []
}

for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Evaluating summaries"):
    try:
        # Skip rows with missing summaries
        if pd.isna(row['generated_summary']) or row['generated_summary'].strip() == "":
            print(f"Skipping row {index} - empty generated summary")
            results.append("EMPTY SUMMARY")
            parsed_results['main_idea'].append(False)
            parsed_results['coherence'].append(False)
            parsed_results['factuality'].append(False)
            parsed_results['score'].append(0)
            continue
            
        # Create the prompt
        prompt = create_prompt(row['input'], row['target'], row['generated_summary'])
        
        # Generate response
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)
        outputs = model.generate(
            **inputs, 
            max_new_tokens=150,
            temperature=0.1,
            repetition_penalty=1.2
        )
        
        # Get the full response
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        results.append(full_response)  # Store full response for debugging
        
        # Apply all regex patterns to entire response
        main_idea_match = re.search(r'[Cc]aptures main ideas:\s*(\w+)', full_response)
        coherence_match = re.search(r'[Cc]oherence:\s*(\w+)', full_response)
        issues_match = re.search(r'[Ii]ssues:\s*(.+?)(?:\n|$)', full_response)
        score_match = re.search(r'[Ss]core:\s*(\d+)', full_response)
        
        # Print pattern matches for first few examples
        if index < 3:
            print(f"\nResponse patterns for example {index}:")
            print(f"Main ideas: {main_idea_match.group(0) if main_idea_match else 'Not found'}")
            print(f"Coherence: {coherence_match.group(0) if coherence_match else 'Not found'}")
            print(f"Issues: {issues_match.group(0) if issues_match else 'Not found'}")
            print(f"Score: {score_match.group(0) if score_match else 'Not found'}")
        
        # Extract values with better fallbacks
        main_idea = False
        if main_idea_match:
            main_idea = 'yes' in main_idea_match.group(1).lower()
            
        coherence_flag = False
        if coherence_match:
            coherence_flag = 'yes' in coherence_match.group(1).lower()
            
        factual = False
        if issues_match:
            issues_text = issues_match.group(1).lower().strip()
            factual = 'none' in issues_text or 'no issues' in issues_text
            
        score_val = 0
        if score_match:
            try:
                score_val = int(score_match.group(1))
                # Ensure score is in range 1-5
                score_val = max(1, min(5, score_val))
            except:
                score_val = 0
                
        # Store parsed results
        parsed_results['main_idea'].append(main_idea)
        parsed_results['coherence'].append(coherence_flag)
        parsed_results['factuality'].append(factual)
        parsed_results['score'].append(score_val)
        
    except Exception as e:
        print(f"Error processing row {index}: {e}")
        results.append(f"ERROR: {str(e)}")
        parsed_results['main_idea'].append(False)
        parsed_results['coherence'].append(False)
        parsed_results['factuality'].append(False)
        parsed_results['score'].append(0)
    
    # Free up GPU memory
    if index % 5 == 0:
        torch.cuda.empty_cache()

# Add results to the dataframe
df['evaluation_gemma'] = results
df['main_idea'] = parsed_results['main_idea']
df['coherence'] = parsed_results['coherence']
df['factuality'] = parsed_results['factuality']
df['score'] = parsed_results['score']

# ---------------------------------------
# 6. Compute and print metrics
# ---------------------------------------
def pct_std(arr):
    a = np.array(arr).astype(float)
    return f"{100*a.mean():.1f} ± {100*a.std():.1f}"

def score_avg(arr):
    a = np.array(arr).astype(float)
    return f"{a.mean():.2f} ± {a.std():.2f}"

print("\n📊 Table-Ready Metrics (Gemma 3 as Judge):")
print("Main Ideas:", pct_std(df['main_idea']))
print("Coherence:", pct_std(df['coherence']))
print("Factuality:", pct_std(df['factuality']))
print("Avg. Score:", score_avg(df['score']))

# Print count of successful extractions
print(f"\nSuccessful extractions:")
print(f"Main ideas: {sum(df['main_idea'] == True)}/{len(df)}")
print(f"Coherence: {sum(df['coherence'] == True)}/{len(df)}")
print(f"Factuality: {sum(df['factuality'] == True)}/{len(df)}")
print(f"Non-zero scores: {sum(df['score'] > 0)}/{len(df)}")

# Save your results
df.to_csv("sampled_20_evaluated_gemma3_debug.csv", index=False)
print("\nResults saved to 'sampled_20_evaluated_gemma3_debug.csv'")

===== DEBUGGING WITH SINGLE EXAMPLE =====

TEST EXAMPLE:
Reference: ___ M h/o chronic systolic CHF (EF 40-45%), COPD, basal carcinoma nevus syndrome, recent hospitaliza...
Generated: A 45-year-old woman with metastatic breast cancer presents with worsening back pain. Imaging shows c...

PROMPT SENT TO MODEL:
<start_of_turn>user
You are a clinical NLP evaluation assistant. Please evaluate this medical summary.

Reference Summary: ___ M h/o chronic systolic CHF (EF 40-45%), COPD, basal carcinoma nevus syndrome, recent hospitalization for malignant otitis externa presenting with respiratory failure requiring intubation in the ED. Treated with aggressive diuresis and improved. Extubated successfully and being discharged back to ___ for continued management of multiple medical problems. #Respiratory fail...


RuntimeError: All input tensors need to be on the same GPU, but found some tensors to not be on a GPU:
 [(torch.Size([1024, 1152]), device(type='cpu'))]