In [1]:
import os
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from getpass import getpass

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ---------------------------------------
# Prompt for Hugging Face token (safe for notebook use)
# ---------------------------------------
HF_TOKEN = getpass("Enter your Hugging Face token: ")

# ---------------------------------------
# Model choice (can be changed later)
# ---------------------------------------
model_name = "meta-llama/Llama-3.2-3B-Instruct"

Enter your Hugging Face token:  ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


In [3]:
# ---------------------------------------
# 4-bit quantization configuration
# ---------------------------------------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)


In [4]:
# ---------------------------------------
# Load tokenizer
# ---------------------------------------
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=HF_TOKEN,
)

# Important for decoder-only models
tokenizer.pad_token = tokenizer.eos_token

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import getpass

# ---------------------------------------
# Secure HF token input (no echo)
# ---------------------------------------
# HF_TOKEN = getpass.getpass("Enter HF token: ").strip()

model_name = "meta-llama/Llama-3.2-3B-Instruct"

# ---------------------------------------
# Load tokenizer
# ---------------------------------------
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=HF_TOKEN,
)
tokenizer.pad_token = tokenizer.eos_token

# ---------------------------------------
# Load model (NO quantization)
# ---------------------------------------
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True,
    token=HF_TOKEN,
)

# üî¥ REQUIRED for FocusAgent + HallucinationDetectorAgent
model.set_attn_implementation("eager")

model.eval()


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:03<00:00,  1.62s/it]


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,), eps=1e-05)
    (

In [6]:
print(next(model.parameters()).device)

cuda:0


In [7]:
inputs = tokenizer("Patient was admitted with headache.", return_tensors="pt").to(model.device)

out = model.generate(
    **inputs,
    max_new_tokens=20,
    do_sample=False
)

print(tokenizer.decode(out[0], skip_special_tokens=True))

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Patient was admitted with headache. The patient was given a medication that was not approved for use in the patient's age group. The


In [8]:
import os
print(os.getcwd())

/lustre/hl/users/4283/agentic_paper/CHIL/Agents_CHIL


In [9]:
import os
import pandas as pd
data_path = "../Dataset/sample_data_100.csv"
df = pd.read_csv(data_path)

In [10]:
df.head(1)

Unnamed: 0,note_id,input,target,input_tokens,target_tokens
0,16002318-DS-17,<SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...,This is a ___ yo F admitted to the hospital af...,1195,75


In [11]:
row = df.iloc[0]

document = row["input"]
reference = row["target"]   # NOT used for generation
note_id = row["note_id"]

print(f"Running pipeline for note_id = {note_id}")
print(f"Document length (chars): {len(document)}")

Running pipeline for note_id = 16002318-DS-17
Document length (chars): 4583


In [12]:
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /home/4283/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/4283/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [13]:
import sys
sys.path.append("/lustre/hl/users/4283/agentic_paper/CHIL/Agents_CHIL")

from focus_agent import FocusAgent
from draft_agent import DraftAgent
from HallucinationDetectorAgent import HallucinationDetectorAgent
from FixAgent import FixAgent
from ClinicalSupervisorAgent import ClinicalSupervisorAgent
from semantic_entailment_judge import SemanticEntailmentJudge

In [14]:
# ----------------------------
# Instantiate FocusAgent
# ----------------------------
focus_agent = FocusAgent(
    model=model,           # ‚úÖ NEW: pass the loaded model
    tokenizer=tokenizer,   # ‚úÖ NEW: pass the loaded tokenizer
    retention_ratio=0.7,
    batch_size=8,          # ‚úÖ NEW: optional, defaults to 8
)
# ----------------------------
# Instantiate semantic_judge
# ----------------------------

semantic_judge = SemanticEntailmentJudge(
    model=model,              # shared model OR a separate NLI model
    tokenizer=tokenizer,      # same tokenizer or NLI tokenizer
)

# ----------------------------
# Instantiate DraftAgent
# ----------------------------
draft_agent = DraftAgent(
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=400,
)

# ----------------------------
# Instantiate HallucinationDetectorAgent
# ----------------------------
hallucination_detector_agent = HallucinationDetectorAgent(
    model=model,
    tokenizer=tokenizer,
    semantic_judge=semantic_judge,
)

# ----------------------------
# Instantiate FixAgent
# ----------------------------
fix_agent = FixAgent(
    model=model,
    tokenizer=tokenizer,
)

[Init] SemanticEntailmentJudge initialized
[Init] DraftAgent initialized (deterministic mode)
[Init] HallucinationDetectorAgent initialized
[Init] AURA threshold = 0.4


In [15]:
supervisor = ClinicalSupervisorAgent(
    focus_agent=focus_agent,
    draft_agent=draft_agent,
    hallucination_detector_agent=hallucination_detector_agent,
    fix_agent=fix_agent,
    max_iterations=3,
)

###Real_data

In [16]:
row = df.iloc[80]

note_id = row["note_id"]
document = row["input"]        # THIS replaces the toy example
reference = row["target"]      # keep but don‚Äôt use yet

print("Running pipeline for note_id:", note_id)
print("Input token count:", row["input_tokens"])
print("First 500 chars of document:\n")
print(document[:5000])


Running pipeline for note_id: 10144359-DS-15
Input token count: 3803
First 500 chars of document:

<SEX> M <SERVICE> MEDICINE <ALLERGIES> Bactrim / Sulfa (Sulfonamide Antibiotics) <ATTENDING> ___. <CHIEF COMPLAINT> Fever, Cocaine intoxication <MAJOR SURGICAL OR INVASIVE PROCEDURE> None <HISTORY OF PRESENT ILLNESS> History of Present Illness ___ year old male with AIDS (CD4 93, VL unknown) who presented with one day of subjective fever, chills and cough. The patient also was apparently acutely intoxicated with cocaine on presentation to the ED. The patient had temperatures to 100.0 and given the concern of the patient having AIDS, was started on empiric vancomycin/zosyn for an unclear source. A chest xray was performed and was negative for infiltrate, blood and urine cultures were obtained. It is unclear why such broad spectrum antibiotics were started based on ED documentation as no source was noted. The patient does have a history of ADIs inclucing PCP, ___. and is coinfected with HCV

In [17]:
import gc

row = df.iloc[26]
print("\nRunning note:", row["note_id"])

# Run pipeline
output = supervisor.run(row["input"])

# ============================================================
# DETAILED OUTPUT
# ============================================================
print("\n" + "=" * 80)
print("DETAILED PIPELINE RESULTS")
print("=" * 80)

print("\nüìÑ ORIGINAL NOTE (FULL):")
print("-" * 80)
print(row["input"])

print("\nüìù DRAFT SUMMARY:")
print("-" * 80)
print(output["draft_summary"])

print("\n‚úÖ FIXED SUMMARY:")
print("-" * 80)
print(output["fixed_summary"])

print("\nüìä PIPELINE STATISTICS:")
print("-" * 80)
print(f"  ‚Ä¢ Total iterations: {output['num_iterations']}")
print(f"  ‚Ä¢ Termination reason: {output['termination_reason']}")
print(f"  ‚Ä¢ Total spans analyzed: {len(output['spans'])}")
print(f"  ‚Ä¢ Hallucinated spans detected: {len(output['hallucinated_spans'])}")

if output['hallucinated_spans']:
    print("\n‚ö†Ô∏è  DETECTED HALLUCINATIONS:")
    print("-" * 80)
    for i, span in enumerate(output['hallucinated_spans'], 1):
        print(f"{i}. {span}")
else:
    print("\n‚úÖ NO HALLUCINATIONS DETECTED")

print("\nüìã ALL SPANS (with hallucination status):")
print("-" * 80)
for i, span in enumerate(output['spans']):
    is_hallucinated = output['hallucination_mask'].get(i, 0)
    status = "‚ùå HALLUCINATED" if is_hallucinated == 1 else "‚úÖ OK"
    print(f"{i+1}. [{status}] {span}")

print("\n" + "=" * 80)

# Cleanup
gc.collect()
torch.cuda.empty_cache()


Running note: 11388716-DS-3
[Detector] State reset
[SemanticJudge] Cache reset
[Supervisor] Starting new document
[FocusAgent] Processing 68 sentences...


                                                                                                                                                                                                                                                                                

[FocusAgent] Retained 47/68 sentences
[Supervisor] FocusAgent retained 47 sentences
[Supervisor] DraftAgent generated initial summary
[Detector] Cached 20 spans
[Detector] AURA computed and cached
[Detector] Spans=20 | Hallucinated=3
[Supervisor] Iter 0 | hallucinated spans: 3 | decision: hallucinations_detected
[Detector] Using cached AURA from iteration 0
[Detector] Spans=20 | Hallucinated=3
[Supervisor] Iter 1 | hallucinated spans: 3 | decision: hallucination_pattern_stabilized
[Supervisor] Pipeline finished

DETAILED PIPELINE RESULTS

üìÑ ORIGINAL NOTE (FULL):
--------------------------------------------------------------------------------
<SEX> M <SERVICE> MEDICINE <ALLERGIES> Patient recorded as having No Known Allergies to Drugs <ATTENDING> ___. <CHIEF COMPLAINT> Left leg swelling and pain <MAJOR SURGICAL OR INVASIVE PROCEDURE> None <HISTORY OF PRESENT ILLNESS> Pt is ___ yo M with hx of of prior silent MI, HTN, HLD, transitional cell carcinoma (s/p radiation, no surgery or chem

In [None]:
import pandas as pd
from tqdm import tqdm
import gc
import torch
import sys

results = []
BATCH_SIZE = 5

# Suppress all the pipeline's internal print statements
class SuppressPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open('/dev/null', 'w')
        return self
    
    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout

# Main progress bar for all 100 notes
pbar = tqdm(total=len(df), desc="Processing notes", position=0)

for batch_start in range(0, len(df), BATCH_SIZE):
    batch_end = min(batch_start + BATCH_SIZE, len(df))
    batch_df = df.iloc[batch_start:batch_end]
    
    for idx, row in batch_df.iterrows():
        try:
            # Suppress all internal prints
            with SuppressPrints():
                output = supervisor.run(row["input"])
            
            results.append({
                'note_id': row['note_id'],
                'Input': row['input'],
                'Target': row['target'],
                'DraftAgent_Summary': output.get('draft_summary', ''),
                'FixAgent_Summary': output.get('fixed_summary', ''),
                'Hallucinations_Detected': output.get('total_hallucinations', 0),
                'Iterations': output.get('total_iterations', 0)
            })
            
        except Exception as e:
            tqdm.write(f"‚ùå ERROR on note {idx+1}: {row['note_id']}")
            results.append({
                'note_id': row['note_id'],
                'Input': row['input'],
                'Target': row['target'],
                'DraftAgent_Summary': 'ERROR',
                'FixAgent_Summary': 'ERROR',
                'Hallucinations_Detected': -1,
                'Iterations': -1
            })
        
        # Update progress bar
        pbar.update(1)
    
    # Cleanup after each batch (silent)
    torch.cuda.empty_cache()
    gc.collect()
    
    # Save checkpoint (silent)
    checkpoint_df = pd.DataFrame(results)
    checkpoint_df.to_csv(f'checkpoint_batch_{batch_start//BATCH_SIZE + 1}.csv', index=False)

pbar.close()

# Final save
results_df = pd.DataFrame(results)
results_df.to_csv('pipeline_results_final.csv', index=False)
print(f"\n‚úÖ Processing complete!")
print(f"Successfully processed: {len([r for r in results if r['FixAgent_Summary'] != 'ERROR'])}/{len(df)} notes")

Google Form output

In [22]:
import torch
import gc

vignette = """
Adult male patient in good health until 10 days ago when he developed gradually worsening right-sided headaches, initially controlled with over-the-counter analgesics but now refractory. No visual changes, nausea, or drowsiness. Head CT at outside hospital revealed intracranial mass lesion. MRI showed large heterogeneous mass (5.0 x 4.3 cm) in right temporoparietal lobes with hemorrhage, cystic change, and necrosis. Patient has history of renal cell carcinoma with left nephrectomy. Underwent image-guided craniotomy for tumor resection.
"""



In [23]:
print("\n" + "="*80)
print("üìù DRAFT AGENT OUTPUT")
print("="*80)

compressed_sentences = [vignette]  # minimal, valid input
draft_summary = draft_agent.generate(compressed_sentences)

print(draft_summary)



üìù DRAFT AGENT OUTPUT
Patient admitted with worsening right-sided headaches refractory to analgesics. Head CT and MRI revealed large heterogeneous mass in right temporoparietal lobes with hemorrhage, cystic change, and necrosis. Patient underwent craniotomy for tumor resection. Post-operative period, patient developed fever, elevated white blood cell count, and prolonged hospital stay. Patient was treated with antibiotics, antifungals, and antivirals. Patient was also treated with pain management and anti-seizure medications. Patient was discharged home with follow-up appointment scheduled.


In [21]:
print("\n" + "=" * 80)
print("üìÑ FULL PIPELINE (LABELED OUTPUT)")
print("=" * 80)

output = supervisor.run(vignette)

print("\n--- üìù DRAFT SUMMARY (DraftAgent) ---\n")
print(output["draft_summary"])

print("\n--- üõ†Ô∏è FIXED SUMMARY (After Detection + FixAgent) ---\n")
print(output["fixed_summary"])

print("\n--- üìä PIPELINE METADATA ---\n")
print(f"Iterations           : {output['num_iterations']}")
print(f"Termination reason   : {output['termination_reason']}")
print(f"Total spans analyzed : {len(output['spans'])}")
print(f"Hallucinated spans   : {len(output['hallucinated_spans'])}")



üìÑ FULL PIPELINE (LABELED OUTPUT)
[Detector] State reset
[SemanticJudge] Cache reset
[Supervisor] Starting new document
[FocusAgent] Processing 8 sentences...


                                                                                                                                                                                                                                                                                

[FocusAgent] Retained 5/8 sentences
[Supervisor] FocusAgent retained 5 sentences
[Supervisor] DraftAgent generated initial summary
[Detector] Cached 21 spans
[Detector] AURA computed and cached
[Detector] Spans=21 | Hallucinated=7
[Supervisor] Iter 0 | hallucinated spans: 7 | decision: hallucinations_detected
[Detector] Using cached AURA from iteration 0
[Detector] Spans=21 | Hallucinated=7
[Supervisor] Iter 1 | hallucinated spans: 7 | decision: hallucination_pattern_stabilized
[Supervisor] Pipeline finished

--- üìù DRAFT SUMMARY (DraftAgent) ---

Patient admitted to ICU with sudden onset dyspnea, elevated BNP, lactate, and anion gap, and pulmonary edema. BiPAP support initiated. Patient's history of CAD and heart failure likely contributing factors. Further evaluation and management to be determined. 

Note: The summary should be concise and focused on the key information from the patient's record. It should not include any additional information or interpretations. 

Please provide

scale to full dataframe (with progress bar)

In [20]:
from tqdm.auto import tqdm

results = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="Running AgenticSum"):
    output = supervisor.run(row["input"])

    results.append({
        "note_id": row["note_id"],
        "draft_summary": output["draft_summary"],
        "fixed_summary": output["fixed_summary"],
        "num_hallucinated_spans": len(output["hallucinated_spans"]),
        "termination_reason": output["termination_reason"],
    })


Running AgenticSum:   0%|                                                                                                                                                                                                                               | 0/100 [00:00<?, ?it/s]

[Supervisor] Starting new document
[Supervisor] FocusAgent retained 35 sentences


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[Supervisor] DraftAgent generated initial summary
[Detector] Spans=11 | Hallucinated=3
[Supervisor] Iter 0 | hallucinated spans: 3 | decision: hallucinations_detected


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Running AgenticSum:   0%|                                                                                                                                                                                                                               | 0/100 [00:19<?, ?it/s]

[Detector] Spans=11 | Hallucinated=3
[Supervisor] Iter 1 | hallucinated spans: 3 | decision: hallucination_pattern_stabilized
[Supervisor] Pipeline finished





TypeError: string indices must be integers, not 'str'