In [1]:
import pandas as pd
df_sample = pd.read_csv("sample_summary.csv")
# Display the first few rows
print(df_sample.head())

# Check DataFrame info
print(df_sample.info())

                                               input  \
0  Good afternoon, champ, how you holding up? Goo...   
1  What brings you in here today? Hi, I'm um, I'm...   
2  Do you have any known allergies to medications...   
3  How may I help you today? Yeah I've had, a fev...   
4  It sounds like that you're experiencing some c...   

                                              output  
0  Subjective:\n- Symptoms: Lower back pain, radi...  
1  Subjective:\n- Presenting with dry cough for 1...  
2  Subjective:\n- No known allergies to medicatio...  
3  Subjective:\n- Fever and dry cough started 4 d...  
4  Subjective:\n- Presenting with chest pain for ...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   100 non-null    object
 1   output  100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB
None


In [2]:
from huggingface_hub import login

# Use your Hugging Face token
login("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")

In [3]:
!pip install -q transformers huggingface_hub langchain_community
!pip install -q --upgrade accelerate
!pip install -q -U bitsandbytes
!pip install -q neo4j
!pip install -q --upgrade accelerate
!pip install -q -U bitsandbytes

In [4]:
import torch
from tqdm import tqdm
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import pandas as pd
import os

In [5]:
import time
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# -------------------------------
# 1. Load model and tokenizer
# -------------------------------
model_name = "google-t5/t5-large"  # You can replace this with your own fine-tuned model if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# -------------------------------
# 2. Load DataFrame (first 2 rows)
# -------------------------------
df_sample = df_sample.iloc[:2].copy()
inputs_list = df_sample["input"].tolist()

# -------------------------------
# 3. Few-shot example + prompt constructor
# -------------------------------
few_shot_examples = [
    {
        "input": "<SEX> F <SERVICE> ONCOLOGY <CHIEF COMPLAINT> worsening back pain <HISTORY OF PRESENT ILLNESS> The patient is a 45-year-old female with a history of metastatic breast cancer presenting with worsening back pain...",
        "target": "A 45-year-old female with metastatic breast cancer presented with worsening back pain. Imaging showed thoracic spine fractures..."
    }
]

def construct_prompt(input_text):
    prompt = "You are a medical expert. Please summarize the following input concisely:\n\n"
    for ex in few_shot_examples:
        prompt += f"Input: {ex['input']}\nTarget: {ex['target']}\n\n"
    prompt += f"Input: {input_text}\nSummary:"
    return prompt

# -------------------------------
# 4. Generation config
# -------------------------------
generation_params = {
    "do_sample": True,
    "top_p": 0.8,
    "temperature": 0.1,
    "top_k": 50,
    "max_new_tokens": 200,
    "repetition_penalty": 1.1,
    "eos_token_id": tokenizer.eos_token_id,
}

# -------------------------------
# 5. Summarization + Metrics
# -------------------------------
generated_summaries = []
latencies, throughputs = [], []

print(f"Processing {len(inputs_list)} inputs...\n")

for idx, text in enumerate(tqdm(inputs_list, desc="Generating Summaries", unit="row")):
    prompt = construct_prompt(text)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    start_time = time.time()
    outputs = model.generate(**inputs, **generation_params)
    end_time = time.time()

    # Extract generated tokens only
    generated_tokens = outputs[0, prompt_length:]
    summary = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    # Save results
    generated_summaries.append(summary)

    # Compute metrics
    latency = end_time - start_time
    total_tokens = prompt_length + generated_tokens.shape[0]
    latencies.append(latency)
    throughputs.append(total_tokens / latency)

# -------------------------------
# 6. Store & Evaluate
# -------------------------------
df_sample["generated_summary"] = generated_summaries
df_sample.to_csv("Biobart_soap_generated_summaries.csv", index=False)
print("✔️ Summaries saved to 'Biobart_soap_generated_summaries.csv'")

# -------------------------------
# 7. Print Efficiency Metrics
# -------------------------------
print("\n🔹 Efficiency Metrics")
print(f"📌 Average Latency: {np.mean(latencies):.4f} sec (±{np.std(latencies):.4f})")
print(f"📌 Average Throughput: {np.mean(throughputs):.2f} tokens/sec (±{np.std(throughputs):.2f})")


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Processing 2 inputs...



Generating Summaries: 100%|██████████| 2/2 [00:11<00:00,  5.60s/row]

✔️ Summaries saved to 'Biobart_soap_generated_summaries.csv'

🔹 Efficiency Metrics
📌 Average Latency: 5.5957 sec (±3.2815)
📌 Average Throughput: 253.13 tokens/sec (±189.37)





In [7]:
df_sample.head(3)

Unnamed: 0,input,output,generated_summary
0,"Good afternoon, champ, how you holding up? Goo...","Subjective:\n- Symptoms: Lower back pain, radi...",
1,"What brings you in here today? Hi, I'm um, I'm...",Subjective:\n- Presenting with dry cough for 1...,


In [13]:
# ✅ Define the local file path in SageMaker's instance storage
output_csv_path = "summarization_output.csv"  # Saves in the current working directory

# ✅ Save the DataFrame to a CSV file
mimic_iv_bhc_100.to_csv(output_csv_path, index=False)

# ✅ Confirm file save location
print(f"\nSummaries saved to '{output_csv_path}'")



Summaries saved to 'summarization_output.csv'
