In [1]:
!pip install -q nltk bert-score
!pip install -q rouge-metric

In [2]:
import pandas as pd

# S3 location of the sample CSV
bucket_name = "mimicivliza"  # Replace with your bucket name
mimic_iv_bhc = "s3://" + bucket_name + "/sample_data_100.csv"

# Read the CSV straight from S3 into a DataFrame
mimic_iv_bhc_100 = pd.read_csv(filepath_or_buffer=mimic_iv_bhc)

# Preview the first 20 rows (rendered as the cell's output)
mimic_iv_bhc_100.head(n=20)

Unnamed: 0,note_id,input,target,input_tokens,target_tokens
0,16002318-DS-17,<SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...,This is a ___ yo F admitted to the hospital af...,1195,75
1,15638884-DS-4,<SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...,Mr. ___ is a ___ yo man with CAD with prior MI...,3496,1143
2,12435705-DS-14,<SEX> M <SERVICE> MEDICINE <ALLERGIES> ibuprof...,Mr. ___ is a ___ w/ Ph+ve ALL on dasatanib and...,5591,1098
3,12413577-DS-4,<SEX> F <SERVICE> OBSTETRICS/GYNECOLOGY <ALLER...,"On ___, Ms. ___ was admitted to the gynecology...",1119,221
4,17967161-DS-29,<SEX> M <SERVICE> SURGERY <ALLERGIES> lisinopr...,Mr. ___ underwent an angiogram on ___ which sh...,3307,439
5,16956007-DS-20,<SEX> M <SERVICE> SURGERY <ALLERGIES> Codeine ...,Mr. ___ is a ___ who underwent an exploratory ...,4168,1209
6,16919911-DS-15,<SEX> F <SERVICE> MEDICINE <ALLERGIES> Penicil...,This is a ___ year old female with a recent di...,2059,208
7,15682570-DS-25,<SEX> M <SERVICE> MEDICINE <ALLERGIES> No Know...,"___ w/ h/o CAD ___ CABG LIMA to LAD, SVG to D1...",2215,451
8,12135369-DS-24,<SEX> F <SERVICE> MEDICINE <ALLERGIES> Compazi...,Ms ___ is a ___ year old woman with a history ...,2132,416
9,11906321-DS-20,<SEX> M <SERVICE> NEUROSURGERY <ALLERGIES> Pat...,The patient was admitted to the neurosurgery s...,2347,316


In [3]:
from huggingface_hub import login
import os

# Authenticate with the Hugging Face Hub. The token is read from the HF_TOKEN
# environment variable so that it never appears in the notebook itself.
login(token=os.environ.get("HF_TOKEN"))

In [4]:
!pip install -q transformers huggingface_hub langchain_community
!pip install -q --upgrade accelerate
!pip install -q -U bitsandbytes
!pip install -q neo4j
!pip install -q --upgrade accelerate
!pip install -q -U bitsandbytes

In [5]:
import torch
from tqdm import tqdm
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import pandas as pd
import os

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
import pandas as pd

# Step 1: Read the sample dataset from S3
bucket_name = "mimicivliza"  # Replace with your actual bucket name
file_path = "s3://{}/sample_data_100.csv".format(bucket_name)

# Parse the CSV into a Pandas DataFrame
mimic_iv_bhc_100 = pd.read_csv(filepath_or_buffer=file_path)

# Step 2: Fetch the Flan-T5-Large tokenizer and seq2seq model by checkpoint name
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Few-shot examples for prompt construction.
# Each entry is a dict with two keys, both read by construct_few_shot_prompt:
#   "input"  - a note in the same tag-delimited layout as the dataset's `input`
#              column (<SEX>, <SERVICE>, <CHIEF COMPLAINT>, ...)
#   "target" - the short reference summary for that note
# NOTE(review): these look like hand-written illustrative cases rather than
# rows from the dataset - confirm they contain no real patient data.
few_shot_examples = [
    {
        "input": "<SEX> F <SERVICE> ONCOLOGY <CHIEF COMPLAINT> worsening back pain <HISTORY OF PRESENT ILLNESS> The patient is a 45-year-old female with a history of metastatic breast cancer presenting with worsening back pain over the last two weeks. Imaging revealed compression fractures in the thoracic spine. She reported increasing discomfort despite over-the-counter pain relievers. Neurological exam was unremarkable, and there were no signs of cord compression. Pain management and radiation oncology were consulted, and palliative radiation therapy was planned. The patient also discussed advanced care planning during her stay.",
        "target": "A 45-year-old female with metastatic breast cancer presented with worsening back pain. Imaging showed thoracic spine fractures, and she received palliative radiation therapy."
    },
    {
        "input": "<SEX> M <SERVICE> CARDIOLOGY <CHIEF COMPLAINT> chest pain <HISTORY OF PRESENT ILLNESS> A 55-year-old male presented with chest pain radiating to his left arm and jaw. He reported associated shortness of breath and nausea. Initial ECG showed ST-segment elevation in the inferior leads, and troponin levels were elevated, consistent with an acute myocardial infarction. The patient was emergently taken to the catheterization lab, where coronary angiography revealed 100% occlusion of the right coronary artery. A drug-eluting stent was successfully placed, and dual antiplatelet therapy was initiated. He was monitored in the cardiac care unit for 48 hours with no complications.",
        "target": "A 55-year-old male with a myocardial infarction underwent PCI with stenting of the right coronary artery. He was monitored and started on dual antiplatelet therapy."
    }
]

# Function to construct few-shot prompts
def construct_few_shot_prompt(input_text, examples=None):
    """Build a few-shot summarization prompt for ``input_text``.

    The prompt is an instruction line, then one ``Input: ... / Summary: ...``
    demonstration per example, then the new input followed by an open
    ``Summary:`` cue for the model to complete.

    The demonstrations use the same ``Summary:`` label as the final cue.
    (They were previously labelled ``Target:``, so the pattern shown to the
    model never matched the cue it was asked to continue.)

    Args:
        input_text: The text to summarize; it is interpolated into the prompt
            as-is.
        examples: Optional iterable of dicts with ``"input"`` and ``"target"``
            keys. Defaults to the module-level ``few_shot_examples``.

    Returns:
        The complete prompt as a single string, ending in ``"Summary:"``.
    """
    if examples is None:
        examples = few_shot_examples

    parts = ["You are a medical expert. Please summarize the following input concisely:\n\n"]
    for example in examples:
        parts.append(f"Input: {example['input']}\nSummary: {example['target']}\n\n")
    parts.append(f"Input: {input_text}\nSummary:")
    return "".join(parts)

# Step 3: Generate summaries using Flan-T5
#
# The encoder input is capped at MAX_PROMPT_TOKENS. The notes in this dataset
# are far longer than that (see the `input_tokens` column), so simply
# truncating the finished prompt cuts off its END - including the trailing
# "Summary:" cue the model is supposed to continue. Instead, the note itself
# is trimmed to whatever token budget is left after the instruction, the
# few-shot examples and the cue, so the prompt always ends with the cue.
MAX_PROMPT_TOKENS = 512
# Headroom for tokens that may split differently once the trimmed note is
# embedded in the full prompt.
TOKEN_SLACK = 4

# Tokens used by everything except the note (measured with an empty note).
prompt_overhead = len(tokenizer(construct_few_shot_prompt(""))["input_ids"])
note_token_budget = max(MAX_PROMPT_TOKENS - prompt_overhead - TOKEN_SLACK, 0)

generated_summaries = []

for input_text in tqdm(mimic_iv_bhc_100['input'], desc="Generating Summaries"):
    # Trim the note at a token boundary, slicing the ORIGINAL string via
    # character offsets so no text is altered by a decode round-trip.
    # NOTE: offset mappings require a "fast" tokenizer (AutoTokenizer's default).
    note_text = str(input_text)
    note_offsets = tokenizer(
        note_text,
        add_special_tokens=False,
        return_offsets_mapping=True,
    )["offset_mapping"]
    if len(note_offsets) > note_token_budget:
        cut = note_offsets[note_token_budget - 1][1] if note_token_budget > 0 else 0
        note_text = note_text[:cut]

    # Construct the few-shot prompt around the trimmed note
    prompt = construct_few_shot_prompt(note_text)

    # Tokenize the prompt; truncation stays on as a safety net in case the
    # assembled prompt still lands slightly over the limit.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=MAX_PROMPT_TOKENS,
        truncation=True,
        padding="max_length"
    )

    # Generate the summary
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=200,  # Adjust based on desired summary length
        num_beams=5,  # Beam search for better results
        repetition_penalty=1.2,  # Prevent excessive repetition
        early_stopping=True
    )

    # Decode and store the summary
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    generated_summaries.append(generated_summary)

# Step 4: Add generated summaries to the DataFrame (one summary per row, in row order)
mimic_iv_bhc_100['generated_summary_flan_t5'] = generated_summaries

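# Optional: save the DataFrame with generated summaries to S3 for later evaluation.
# Left disabled: the recorded run failed with a PermissionError because the
# notebook's role is not allowed s3:PutObject on this bucket.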
#output_file = f"s3://{bucket_name}/flan_t5_generated_summaries.csv"
#mimic_iv_bhc_100.to_csv(output_file, index=False)

#print(f"\nSummaries generated and saved to '{output_file}'")


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Generating Summaries: 100%|██████████| 100/100 [12:57<00:00,  7.77s/it]


PermissionError: User: arn:aws:sts::867344433658:assumed-role/AmazonSageMakerServiceCatalogProductsUseRole/SageMaker is not authorized to perform: s3:PutObject on resource: "arn:aws:s3:::mimicivliza/flan_t5_generated_summaries.csv" because no identity-based policy allows the s3:PutObject action

In [None]:
# Preview the first few rows instead of dumping the whole DataFrame, which
# keeps the saved notebook small and the output readable.
mimic_iv_bhc_100.head()

In [7]:
# Output file name; a relative path, so it is written to the current working directory
output_csv_path = "summarization_output.csv"

# Write the DataFrame to CSV without the index column
mimic_iv_bhc_100.to_csv(path_or_buf=output_csv_path, index=False)

# Report where the file was written
print(f"\nSummaries saved to '{output_csv_path}'")



Summaries saved to 'summarization_output.csv'
