In [None]:
import pandas as pd

# Where the sample extract lives in S3
bucket_name = 'mimicivliza'  # Replace with your bucket name
mimic_iv_bhc = "s3://" + bucket_name + "/sample_data_100.csv"

# Read the CSV at that S3 URI into a DataFrame
mimic_iv_bhc_100 = pd.read_csv(filepath_or_buffer=mimic_iv_bhc)

# Preview the first 20 rows (last expression, so the notebook renders it)
mimic_iv_bhc_100.head(n=20)

In [None]:
# Install Hugging Face Transformers
!pip install -q transformers
!pip install -q sacremoses
!pip install -q bitsandbytes accelerate

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Single source of truth for the checkpoint so tokenizer and model always match.
MODEL_NAME = "microsoft/BioGPT-Large"

# 8-bit quantization settings.
# The previous `bnb_8bit_compute_dtype="float16"` argument was dropped:
# BitsAndBytesConfig has no such option (a compute dtype can only be chosen
# for 4-bit loading, via `bnb_4bit_compute_dtype`), so it was either ignored
# with an "unused kwargs" warning or rejected, depending on the installed
# transformers version.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,           # or load_in_4bit=True for more compression
    llm_int8_threshold=6.0,      # default threshold for 8-bit
    llm_int8_skip_modules=None,  # or specify modules to skip
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",  # let accelerate place the weights on the available device(s)
)



In [None]:
from tqdm import tqdm
import torch

# Adjusted prompt with clearer instructions
def construct_few_shot_prompt(input_text):
    """Build the one-example summarization prompt for a single patient case.

    The prompt is made of three parts separated by blank lines: a fixed
    instruction, one worked Input/Summary example, and the case to summarize
    followed by an open ``Summary:`` cue for the model to continue.

    Note: a later cell in this notebook defines another function with the
    same name; whichever cell ran last is the one in effect.

    Args:
        input_text: The case text to embed after ``Input:``.

    Returns:
        The assembled prompt string, ending in ``Summary:``.
    """
    instruction = (
        "You are an expert clinician. Summarize the patient case concisely. "
        "Ensure the summary does not exceed 200 words and avoids directly copying any text."
    )
    worked_example = (
        "Example - Input: A 50-year-old male reports severe chest pain and has a history of heart disease.\n"
        "Example - Summary: A male patient with heart disease experienced severe chest pain and received emergency angioplasty."
    )
    query = f"Input: {input_text}\nSummary:"
    return "\n\n".join([instruction, worked_example, query])

# --- Generation budget --------------------------------------------------------
MAX_NEW_TOKENS = 200     # length of the generated continuation
PROMPT_TOKEN_CAP = 1000  # upper bound on prompt tokens fed to the model
ROUNDTRIP_SLACK = 8      # decode -> re-encode may not reproduce the exact token count

# Prompt and continuation share the model's position window, so never let the
# prompt grow past (window - new tokens). Falls back to the plain cap when the
# config does not expose the window size.
context_window = getattr(model.config, "max_position_embeddings", None)
max_prompt_tokens = PROMPT_TOKEN_CAP
if context_window is not None:
    max_prompt_tokens = min(PROMPT_TOKEN_CAP, context_window - MAX_NEW_TOKENS)

# Tokens consumed by the fixed instruction/example scaffold (prompt built
# around an empty note); whatever remains is available for the note itself.
scaffold_tokens = len(tokenizer(construct_few_shot_prompt(""))["input_ids"])
max_note_tokens = max(max_prompt_tokens - scaffold_tokens - ROUNDTRIP_SLACK, 1)

# Greedy decoding. temperature / top_p / top_k were removed: they only apply
# when do_sample=True and otherwise just trigger "flag is ignored" warnings.
# Built once because it does not change between rows.
generation_params = {
    "do_sample": False,
    "max_new_tokens": MAX_NEW_TOKENS,
    "repetition_penalty": 1.2,
    "eos_token_id": tokenizer.eos_token_id,
}

# Work on a copy of (at most) the first 101 rows so the source frame is untouched
subset_df = mimic_iv_bhc_100.iloc[:101].copy()
generated_summaries = []

for input_text in tqdm(subset_df['input'], desc="Generating Summaries with BioGPT-Large"):
    # Clip the *note*, not the assembled prompt: truncating the whole prompt
    # from the right would cut off the trailing "Summary:" cue for long notes,
    # leaving the model to continue the note instead of summarizing it.
    note_text = str(input_text)
    note_ids = tokenizer(note_text, add_special_tokens=False)["input_ids"]
    if len(note_ids) > max_note_tokens:
        note_text = tokenizer.decode(note_ids[:max_note_tokens], skip_special_tokens=True)

    prompt = construct_few_shot_prompt(note_text)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,               # safety net only; the note was already clipped
        max_length=max_prompt_tokens,
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            **generation_params
        )

    # Decode only the newly generated tokens (everything after the prompt)
    generated_tokens = summary_ids[0, prompt_length:]
    generated_summary = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    generated_summaries.append(generated_summary)
    torch.cuda.empty_cache()  # Optional: Clear CUDA cache after each generation to free memory

# Add generated summaries to DataFrame and save
subset_df['generated_summary_biogpt_large'] = generated_summaries
subset_df.to_csv("biogpt_large_generated_summaries.csv", index=False)
print("✅ Subset summaries saved to 'biogpt_large_generated_summaries.csv'")


In [None]:
# Preview the first rows of subset_df (default head size)
subset_df.head()

In [None]:
# Print the full 'input' text of the row at position 1 (the second row)
print(subset_df['input'].iloc[1])

In [None]:
# Print the generated summary stored for the row at position 1 (the second row)
print(subset_df['generated_summary_biogpt_large'].iloc[1])

In [None]:
from tqdm import tqdm

# Few-shot examples for prompt construction.
# Each entry is a dict with two keys:
#   "input"   - a tagged clinical note (<SEX>, <SERVICE>, <CHIEF COMPLAINT>,
#               <HISTORY OF PRESENT ILLNESS>); presumably the same layout as the
#               dataset's 'input' column - TODO confirm
#   "Summary" - the short reference summary for that note (note the capital "S")
few_shot_examples = [
    {
        "input": "<SEX> M <SERVICE> CARDIOLOGY <CHIEF COMPLAINT> chest pain <HISTORY OF PRESENT ILLNESS> A 55-year-old male presented with chest pain radiating to his left arm and jaw. He reported associated shortness of breath and nausea. Initial ECG showed ST-segment elevation in the inferior leads, and troponin levels were elevated, consistent with an acute myocardial infarction. The patient was emergently taken to the catheterization lab, where coronary angiography revealed 100% occlusion of the right coronary artery. A drug-eluting stent was successfully placed, and dual antiplatelet therapy was initiated. He was monitored in the cardiac care unit for 48 hours with no complications.",
        "Summary": "A 55-year-old male with a myocardial infarction underwent PCI with stenting of the right coronary artery. He was monitored and started on dual antiplatelet therapy."
    },
    {
        "input": "<SEX> M <SERVICE> NEUROSURGERY <CHIEF COMPLAINT> headache past ten days <HISTORY OF PRESENT ILLNESS> The patient is a 60-year-old male with a history of renal cell carcinoma, presenting with progressively worsening right-sided headaches over 10 days. Imaging revealed a large intracranial mass in the right temporoparietal lobe with associated hemorrhage, necrosis, and vasogenic edema. He was started on Decadron to reduce cerebral edema and transferred for neurosurgical evaluation. On hospital day 5, the patient underwent an image-guided right craniotomy for tumor resection. Postoperative MRI showed no significant residual tumor. He recovered without complications and was discharged home on a tapering dose of Decadron, with follow-up in the brain tumor clinic.",
        "Summary": "A 60-year-old male with an intracranial mass underwent a craniotomy for tumor resection. He recovered well and was discharged with follow-up."
    },
    {
        "input": "<SEX> F <SERVICE> GENERAL MEDICINE <CHIEF COMPLAINT> shortness of breath <HISTORY OF PRESENT ILLNESS> A 72-year-old female with a history of COPD presented with progressive shortness of breath over two weeks. Physical examination revealed decreased breath sounds and crackles in the left lower lobe. Imaging confirmed left lower lobe consolidation consistent with pneumonia. The patient was started on broad-spectrum antibiotics and oxygen therapy. Arterial blood gas analysis showed mild hypoxemia, which improved with oxygen supplementation. She remained afebrile throughout her stay and reported gradual improvement in symptoms. Discharge planning included antibiotics, a pulmonary follow-up, and instructions for home oxygen therapy.",
        "Summary": "A 72-year-old female with COPD was treated for pneumonia with antibiotics and oxygen therapy. She showed improvement and was discharged with follow-up."
    }
]

# Few-shot prompt builder (replaces the one-example version defined in an earlier cell)
def construct_few_shot_prompt(input_text, examples=None):
    """Build a few-shot summarization prompt for one clinical note.

    Layout: a one-paragraph instruction, then one ``Input: ... / Summary: ...``
    pair per demonstration, then the note to summarize followed by an open
    ``Summary:`` cue. Demonstrations use the same ``Summary:`` label as the
    final cue so the model sees one consistent pattern to continue.

    Args:
        input_text: The note to summarize; embedded after the last ``Input:``.
        examples: Optional iterable of dicts with ``"input"`` and ``"Summary"``
            keys. Defaults to the module-level ``few_shot_examples``.

    Returns:
        The assembled prompt string, ending in ``Summary:``.

    Raises:
        KeyError: If an example lacks the ``"input"`` or ``"Summary"`` key.
    """
    if examples is None:
        # Resolved at call time so later edits to few_shot_examples are picked up
        examples = few_shot_examples

    parts = [
        "You are a medical expert. Please summarize the following input concisely and cohesively. "
        "Do not include anything from the examples.\n\n"
    ]
    for example in examples:
        # The example dicts store the reference text under "Summary" (not "target")
        parts.append(f"Input: {example['input']}\nSummary: {example['Summary']}\n\n")
    parts.append(f"Input: {input_text}\nSummary:")
    return "".join(parts)

# Step 2: Generate Summaries with BioGPT-Large

# --- Generation budget --------------------------------------------------------
MAX_NEW_TOKENS = 150     # length of the generated continuation
PROMPT_TOKEN_CAP = 1024  # upper bound on prompt tokens fed to the model
ROUNDTRIP_SLACK = 8      # decode -> re-encode may not reproduce the exact token count

# Prompt and continuation share the model's position window, so never let the
# prompt grow past (window - new tokens). Falls back to the plain cap when the
# config does not expose the window size.
context_window = getattr(model.config, "max_position_embeddings", None)
max_prompt_tokens = PROMPT_TOKEN_CAP
if context_window is not None:
    max_prompt_tokens = min(PROMPT_TOKEN_CAP, context_window - MAX_NEW_TOKENS)

# Tokens consumed by the instruction plus the few-shot demonstrations (prompt
# built around an empty note); whatever remains is available for the note.
scaffold_tokens = len(tokenizer(construct_few_shot_prompt(""))["input_ids"])
max_note_tokens = max(max_prompt_tokens - scaffold_tokens - ROUNDTRIP_SLACK, 1)

# Sampling settings, built once because they do not change between rows.
# NOTE: sampling is unseeded, so summaries differ from run to run.
generation_params = {
    "do_sample": True,
    "top_p": 0.8,
    "temperature": 0.7,
    "top_k": 40,
    "max_new_tokens": MAX_NEW_TOKENS,
    "repetition_penalty": 1.1
}

# Device the tokenized tensors must live on (e.g., CUDA)
device = model.device

generated_summaries = []

for input_text in tqdm(mimic_iv_bhc_100['input'], desc="Generating Summaries with BioGPT-Large"):
    # Clip the *note*, not the assembled prompt: truncating the whole prompt
    # from the right would cut off the trailing "Summary:" cue for long notes.
    note_text = str(input_text)
    note_ids = tokenizer(note_text, add_special_tokens=False)["input_ids"]
    if len(note_ids) > max_note_tokens:
        note_text = tokenizer.decode(note_ids[:max_note_tokens], skip_special_tokens=True)

    # Construct the prompt
    prompt = construct_few_shot_prompt(note_text)

    # Tokenize prompt for BioGPT (causal language model)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=max_prompt_tokens,
        truncation=True                # safety net only; the note was already clipped
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    # Generate the summary using causal generation
    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        **generation_params
    )

    # Decode only the tokens generated after the prompt. Splitting the full
    # decoded text on "Summary:" is unreliable: the demonstrations contain that
    # marker too, and it is missing entirely if the prompt was truncated.
    generated_summary = tokenizer.decode(
        summary_ids[0, prompt_length:], skip_special_tokens=True
    ).strip()
    generated_summaries.append(generated_summary)

# Step 3: Add Generated Summaries to the DataFrame
mimic_iv_bhc_100['generated_summary_biogpt_large'] = generated_summaries

# Optional: Save the results
# NOTE: this is the same filename the earlier subset run writes to, so running
# both cells leaves only the most recent results on disk.
mimic_iv_bhc_100.to_csv("biogpt_large_generated_summaries.csv", index=False)
print("Summaries generated and saved to 'biogpt_large_generated_summaries.csv'")

