Import Necessary Libraries

In [None]:
!pip install -q transformers huggingface_hub
!pip install -q --upgrade accelerate
!pip install -q -U bitsandbytes

In [None]:
import pandas as pd

# Define the bucket and file names
# The CSV location is assembled as an s3:// URL from the bucket name.
# NOTE(review): reading an s3:// URL through pandas presumably requires s3fs and
# AWS credentials in the environment — confirm for the target runtime.
bucket_name = 'mimicivliza'  # Replace with your bucket name
mimic_iv_bhc = f's3://{bucket_name}/sample_data_100.csv'

# Load the files
# The resulting DataFrame is the shared input for every summarization cell below.
mimic_iv_bhc_100 = pd.read_csv(mimic_iv_bhc)

# Display the data
# Preview the first 20 rows (rendered as the cell output in a notebook).
mimic_iv_bhc_100.head(20)

In [None]:
import torch

# Report whether a CUDA device is visible; the device name is only queried
# when CUDA is actually available.
gpu_status = (
    f"GPU is available: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "No GPU available"
)
print(gpu_status)


In [None]:
from huggingface_hub import login
import os

# Use token from environment variable (safer)
# os.getenv returns None when HF_TOKEN is not set.
# NOTE(review): with a None token, login() presumably falls back to an
# interactive prompt — confirm this is acceptable for unattended runs.
login(os.getenv("HF_TOKEN"))



In [None]:
# Count the distinct values in each column of the loaded sample.
mimic_iv_bhc_100.nunique()

In [None]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch
import pandas as pd
import time
import numpy as np
from tqdm import tqdm  # Import tqdm for progress tracking

# Load model and tokenizer
# The checkpoint is loaded in half precision (float16); device_map="auto"
# leaves device placement to the library.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
# Left padding is requested, and the EOS token id is reused as the pad token id.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')
tokenizer.pad_token_id = tokenizer.eos_token_id

# Initialize summarization pipeline
# The task is "text-generation": summaries are produced as prompt continuations.
summarizer = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generation parameters
# These are forwarded as keyword arguments to the summarizer pipeline call.
generation_params = {
    "do_sample": True,  # Sample instead of greedy decoding, so outputs vary between runs
    "top_p": 0.7,  # Slightly more restrictive sampling
    "temperature": 0.7,  # Less randomness for focused outputs
    "top_k": 40,  # Fewer options for next token selection
    "max_new_tokens": 200,  # Limit maximum length of generated text
    "repetition_penalty": 1.1  # Discourage repeating content
}

# Few-shot examples for summarization
# Each entry pairs a clinical note ("input") with its reference "summary";
# the prompt builder reads both keys.
few_shot_examples = [
    {
        "input": "<SEX> F <SERVICE> ONCOLOGY <CHIEF COMPLAINT> worsening back pain <HISTORY OF PRESENT ILLNESS> The patient is a 45-year-old female with a history of metastatic breast cancer presenting with worsening back pain over the last two weeks. Imaging revealed compression fractures in the thoracic spine. She reported increasing discomfort despite over-the-counter pain relievers. Neurological exam was unremarkable, and there were no signs of cord compression. Pain management and radiation oncology were consulted, and palliative radiation therapy was planned. The patient also discussed advanced care planning during her stay.",
        "summary": "A 45-year-old female with metastatic breast cancer presented with worsening back pain. Imaging showed thoracic spine fractures, and she received palliative radiation therapy."
    },
    {
        "input": "<SEX> M <SERVICE> CARDIOLOGY <CHIEF COMPLAINT> shortness of breath <HISTORY OF PRESENT ILLNESS> A 60-year-old male with a history of hypertension and diabetes presented with progressive shortness of breath over the past month. Echocardiogram showed reduced ejection fraction. The patient was started on diuretics and beta-blockers. Cardiology team planned for close outpatient follow-up.",
        # NOTE(review): this example has no reference summary, so it cannot
        # demonstrate the expected output — consider filling it in or removing it.
        "summary": ""  # Leave empty for model to infer
    }
]

# Construct few-shot prompt
def construct_prompt(input_text):
    """Build the few-shot summarization prompt for one clinical note.

    The prompt is an instruction line, followed by one "Input:/Summary:" pair
    per usable entry of the module-level ``few_shot_examples``, followed by
    the note to summarize and an open "Summary:" marker for the model to
    continue.

    Examples whose summary is missing or blank are skipped: rendering them
    would show the model an "Input:" answered by an empty "Summary:", i.e. a
    demonstration of producing nothing.

    Args:
        input_text: The clinical note to summarize.

    Returns:
        The complete prompt string, ending in "Summary:".
    """
    parts = ["Summarize the following clinical note. Do not repeat input or prompt.\n\n"]
    for example in few_shot_examples:
        summary = example.get("summary")
        if not summary or not summary.strip():
            continue  # no target text, so nothing to demonstrate
        parts.append(f"Input: {example['input']}\nSummary: {summary}\n\n")
    parts.append(f"Input: {input_text}\nSummary:")
    return "".join(parts)

# Lists to store latency and throughput measurements
# summarize_text appends one entry to each list per call:
# latency in milliseconds and throughput in tokens per second.
latencies = []
throughputs = []

# Summarization function with timing
def summarize_text(text):
    """Summarize one clinical note and record latency/throughput for the call.

    The pipeline is asked for the continuation only (``return_full_text=False``),
    so the summary no longer has to be recovered by splitting the full output
    on "Summary:". That split broke whenever the marker also appeared in the
    generated text.

    Side effects:
        Appends the call latency (milliseconds) to ``latencies`` and the
        generation throughput (generated tokens per second) to ``throughputs``.

    Args:
        text: The clinical note to summarize.

    Returns:
        The generated summary with surrounding whitespace removed.
    """
    prompt = construct_prompt(text)

    # Copy the shared parameters so the module-level dict is not modified.
    call_params = {**generation_params, "return_full_text": False}

    # perf_counter is a monotonic clock intended for measuring intervals.
    start_time = time.perf_counter()
    result = summarizer(prompt, **call_params)
    elapsed = time.perf_counter() - start_time

    generated_text = result[0]["generated_text"].strip()

    # Calculate latency in milliseconds
    latency = elapsed * 1000

    # Count only the generated tokens; special tokens added by the tokenizer
    # (e.g. a leading BOS) were not produced by the model.
    output_tokens = len(tokenizer.encode(generated_text, add_special_tokens=False))

    # Calculate throughput (tokens per second)
    throughput = output_tokens / elapsed if elapsed > 0 else 0.0

    # Store measurements
    latencies.append(latency)
    throughputs.append(throughput)

    return generated_text

# Apply summarization with progress bar
# This is the only generation pass: the summaries stored here are the ones the
# metrics below were measured on, and the ones written to disk.
tqdm.pandas(desc="Summarizing clinical notes")  # Enable tqdm for pandas
mimic_iv_bhc_100["generated_summary"] = mimic_iv_bhc_100["input"].progress_apply(summarize_text)

# Calculate and print statistics
mean_latency = np.mean(latencies)
std_latency = np.std(latencies)
mean_throughput = np.mean(throughputs)
std_throughput = np.std(throughputs)

print("\n--- Performance Metrics ---")
print(f"Throughput: {mean_throughput:.2f} ± {std_throughput:.2f} tokens/sec")
print(f"Latency: {mean_latency:.2f} ± {std_latency:.2f} ms")
print("\nFor Table:")
print(f"${mean_throughput:.2f} \\pm {std_throughput:.2f}$ & ${mean_latency:.2f} \\pm {std_latency:.2f}$ \\\\")

# Save results
# The summarization is deliberately NOT re-run here. A second pass would double
# the runtime, replace the measured summaries with fresh samples, and append to
# the metric lists after the statistics were already computed.
mimic_iv_bhc_100.to_csv("generated_summaries_100.csv", index=False)
print("\nSummarization complete. Results saved to 'generated_summaries_100.csv'.")

# Optional: Save metrics to a separate file
# An explicit UTF-8 encoding is used because the report contains "±", which is
# not representable in every platform default encoding.
with open("performance_metrics.txt", "w", encoding="utf-8") as f:
    f.write(f"Model: {model_name}\n")
    # Read the values from generation_params so the report cannot drift from
    # the settings that were actually used.
    f.write(
        f"Parameters: Token={generation_params['max_new_tokens']}, "
        f"Temp={generation_params['temperature']}\n"
    )
    f.write(f"Throughput: {mean_throughput:.2f} ± {std_throughput:.2f} tokens/sec\n")
    f.write(f"Latency: {mean_latency:.2f} ± {std_latency:.2f} ms\n")
    f.write(f"Table format: ${mean_throughput:.2f} \\pm {std_throughput:.2f}$ & ${mean_latency:.2f} \\pm {std_latency:.2f}$ \\\\\n")

In [None]:
# Show up to the first 100 rows, including the generated_summary column.
mimic_iv_bhc_100.head(100)

In [None]:
# Print the source note at position 10 (zero-based) for manual inspection.
print(mimic_iv_bhc_100['input'].iloc[10])

In [None]:
# Print the generated summary at position 10 (zero-based) to compare with its input.
print(mimic_iv_bhc_100['generated_summary'].iloc[10])

In [None]:
from tqdm import tqdm
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
import torch
import os

# Set environment variable for better memory management
# NOTE(review): torch has already been imported and used by earlier cells; this
# setting presumably only takes effect when it is set before CUDA is
# initialised — confirm, or move it to the top of the notebook.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Load the model and tokenizer
# This rebinds model, tokenizer and summarizer, replacing the objects created
# by the earlier cell with freshly loaded ones for the same checkpoint.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16  # Use FP16 for reduced memory
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fix the padding warning
# Pad on the left and reuse the EOS token id as the pad token id.
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eos_token_id

# Initialize the pipeline
# The task is "text-generation": summaries are produced as prompt continuations.
summarizer = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

# Few-shot examples
# Each entry pairs a clinical note ("input") with its reference summary
# ("target"); the prompt builder reads both keys. This rebinds the
# few_shot_examples list defined by the earlier cell, which used a "summary" key.
few_shot_examples = [
    {
        "input": "<SEX> F <SERVICE> ONCOLOGY <CHIEF COMPLAINT> worsening back pain <HISTORY OF PRESENT ILLNESS> The patient is a 45-year-old female with a history of metastatic breast cancer presenting with worsening back pain over the last two weeks. Imaging revealed compression fractures in the thoracic spine. She reported increasing discomfort despite over-the-counter pain relievers. Neurological exam was unremarkable, and there were no signs of cord compression. Pain management and radiation oncology were consulted, and palliative radiation therapy was planned. The patient also discussed advanced care planning during her stay.",
        "target": "A 45-year-old female with metastatic breast cancer presented with worsening back pain. Imaging showed thoracic spine fractures, and she received palliative radiation therapy."
    }

]


# Combine few-shot examples into a prompt template
def construct_few_shot_prompt(input_text):
    """Build the few-shot prompt for one clinical note.

    Layout: an instruction line, a blank line, one "Input:/Summary:" pair per
    entry of the module-level ``few_shot_examples``, then the note to
    summarize followed by an open "Summary:" marker.

    Two prompt defects are corrected relative to the earlier version:
    the instruction line is now separated from the first example (it used to
    run straight into "Input:"), and the examples use the same "Summary:"
    label as the final query instead of "Target:", so the demonstrated pattern
    matches the one the model is asked to complete.

    Args:
        input_text: The clinical note to summarize.

    Returns:
        The complete prompt string, ending in "Summary:".
    """
    parts = [
        "You are a medical expert. Summarize. "
        "Do not include prompt/Input in your generated summary\n\n"
    ]
    parts.extend(
        f"Input: {example['input']}\nSummary: {example['target']}\n\n"
        for example in few_shot_examples
    )
    parts.append(f"Input: {input_text}\nSummary:")
    return "".join(parts)

# Parameters
# Number of notes handed to process_batch per loop iteration.
batch_size = 8

# Sampling settings forwarded as keyword arguments to the summarizer call.
generation_params = {
    "do_sample": True,  # Sample instead of greedy decoding, so outputs vary between runs
    "top_p": 0.8,  # Slightly more restrictive sampling
    "temperature": 0.7,  # Less randomness for focused outputs
    "top_k": 40,  # Fewer options for next token selection
    "max_new_tokens": 40,  # Limit maximum length of generated text
    "repetition_penalty": 1.1  # Discourage repeating content
}

# Prepare inputs
# Notes are taken from the "input" column; results are collected in order in
# generated_summaries and later assigned back as a column.
inputs = mimic_iv_bhc_100["input"].tolist()
generated_summaries = []

def clean_generated_text(generated_text):
    """Return only the generated summary from a pipeline output string.

    The prompt ends with a "Summary:" marker and the text passed in here
    contains the prompt followed by the model's continuation. Everything up to
    and including the last "Summary:" is dropped, so the prompt and few-shot
    examples are not saved as part of the summary. Text without the marker is
    returned unchanged apart from whitespace trimming.

    Args:
        generated_text: Raw text returned by the generation pipeline.

    Returns:
        The text after the last "Summary:" marker, stripped of surrounding
        whitespace.
    """
    # rpartition yields ('', '', text) when the marker is absent, so the
    # marker-free case falls through to a plain strip.
    _, _, tail = generated_text.rpartition("Summary:")
    return tail.strip()

# Process a batch of inputs
def process_batch(batch):
    """Generate summaries for a batch of notes, splitting the batch on CUDA OOM.

    Prompts are built for every note and passed to the pipeline in one call.
    If that call fails with an out-of-memory RuntimeError and the batch holds
    more than one note, the batch is halved and each half is processed
    recursively. Any other RuntimeError, or an OOM on a single note, is
    re-raised unchanged.

    Args:
        batch: List of clinical-note strings.

    Returns:
        A list of pipeline outputs in input order (a non-list result is
        wrapped in a one-element list).

    Raises:
        RuntimeError: If generation fails for a reason other than OOM, or if
            a single-note batch runs out of memory.
    """
    # Construct few-shot prompts for the batch
    prompts = [construct_few_shot_prompt(input_text) for input_text in batch]

    # NOTE(review): no batch_size is passed to the pipeline, so its default
    # applies — confirm whether true batched generation is intended here.
    needs_split = False
    try:
        # Generate outputs
        outputs = summarizer(
            prompts,
            **generation_params,
        )
    except RuntimeError as e:
        if "out of memory" not in str(e) or len(batch) <= 1:
            # Not an OOM, or nothing left to split: surface the original error.
            raise
        needs_split = True

    if needs_split:
        # Retry outside the except block: while the handler is active the
        # exception keeps the failed call's frames (and their tensors) alive,
        # which would prevent the memory from being released.
        torch.cuda.empty_cache()
        half = len(batch) // 2
        print(f"OOM error, retrying with batch size {half}")
        return process_batch(batch[:half]) + process_batch(batch[half:])

    return outputs if isinstance(outputs, list) else [outputs]

# Run the batched summarization, advancing the progress bar one batch at a time
with tqdm(total=len(inputs), desc="Processing Rows", unit="row") as pbar:
    for i in range(0, len(inputs), batch_size):
        batch = inputs[i:i + batch_size]

        # Generate outputs for this slice of the inputs
        summaries = process_batch(batch)

        # Decide which result layout came back by inspecting the first element,
        # then pull the raw text out of every element accordingly
        first = summaries[0]
        if isinstance(first, dict) and 'generated_text' in first:
            raw_texts = [entry['generated_text'] for entry in summaries]
        elif isinstance(first, list) and isinstance(first[0], dict) and 'generated_text' in first[0]:
            raw_texts = [entry[0]['generated_text'] for entry in summaries]
        elif isinstance(first, str):
            raw_texts = summaries
        else:
            raise ValueError("Unexpected format for summaries.")

        # Clean every raw text and collect the results in input order
        generated_texts = list(map(clean_generated_text, raw_texts))
        generated_summaries += generated_texts

        # Release cached GPU memory between batches
        torch.cuda.empty_cache()

        # Advance the bar by the number of rows just handled
        pbar.update(len(batch))

# Attach the summaries as a new column and write the table to disk
mimic_iv_bhc_100["generated_summary"] = generated_summaries
mimic_iv_bhc_100.to_csv("generated_summaries_100.csv", index=False)
print("Summaries saved to 'generated_summaries_100.csv'")

### Code with computational efficiency metrics

In [None]:
from tqdm import tqdm
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch
import time

# Load model and tokenizer
# This rebinds model, tokenizer and summarizer with freshly loaded objects for
# the same checkpoint, in half precision (float16).
model_name = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pad on the left and reuse the EOS token id as the pad token id.
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eos_token_id

# Initialize summarization pipeline
# The task is "text-generation": summaries are produced as prompt continuations.
summarizer = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generation parameters
# Forwarded as keyword arguments to the summarizer call.
generation_params = {
    "do_sample": True,  # Sample instead of greedy decoding, so outputs vary between runs
    "top_p": 0.8,  # Slightly more restrictive sampling
    "temperature": 0.9,  # Sampling temperature (higher than the 0.7 used in the earlier cells)
    "top_k": 40,  # Fewer options for next token selection
    "max_new_tokens": 300,  # Limit maximum length of generated text
    "repetition_penalty": 1.1  # Discourage repeating content
}

# Data inputs
# Notes come from the "input" column; summaries are collected in input order.
inputs = mimic_iv_bhc_100["input"].tolist()
generated_summaries = []

# Metrics tracking
# Running totals updated by process_batch via its `global` declaration.
total_tokens_generated = 0
total_input_tokens = 0
total_time_spent = 0
# NOTE(review): time_to_first_token is not read anywhere in the visible code;
# per-call timings are stored in ttft_list instead.
time_to_first_token = []
throughput_list = []

# Few-shot prompt constructor
def construct_few_shot_prompt(input_text):
    """Build the few-shot prompt for one clinical note.

    Layout: an instruction line, one "Input:/Summary:" pair per entry of the
    module-level ``few_shot_examples``, then the note to summarize followed by
    an open "Summary:" marker.

    The examples are labelled "Summary:" (previously "Target:") so that the
    demonstrated pattern uses the same label the model is asked to complete
    for the final input.

    Args:
        input_text: The clinical note to summarize.

    Returns:
        The complete prompt string, ending in "Summary:".
    """
    parts = ["You are a medical expert. Please summarize the following input concisely:\n\n"]
    parts.extend(
        f"Input: {example['input']}\nSummary: {example['target']}\n\n"
        for example in few_shot_examples
    )
    parts.append(f"Input: {input_text}\nSummary:")
    return "".join(parts)

# Process a batch of inputs
def process_batch(batch):
    """Summarize a batch of notes one by one and update the global metrics.

    Each note is turned into a few-shot prompt and generated individually so
    that a per-call duration can be recorded. The pipeline is asked for the
    continuation only (``return_full_text=False``). Previously the returned
    text contained the whole prompt, so the prompt was saved as part of every
    summary and its tokens were counted again as "output" tokens, inflating
    the throughput and the output/input token ratio.

    Side effects:
        Updates ``total_tokens_generated``, ``total_input_tokens``,
        ``total_time_spent`` and ``total_summaries``; appends to ``ttft_list``
        (one entry per note), ``time_per_input`` and ``throughput_list`` (one
        entry per batch).

    Args:
        batch: List of clinical-note strings.

    Returns:
        List of generated summaries (continuation only, whitespace-trimmed),
        in input order. An empty batch returns an empty list and leaves the
        metrics untouched.

    Raises:
        ValueError: If the pipeline output is not a dict (or list of dicts)
            carrying a 'generated_text' entry.
    """
    global total_tokens_generated, total_input_tokens, total_time_spent, total_summaries

    if not batch:
        # Nothing to generate; also avoids dividing by len(batch) below.
        return []

    prompts = [construct_few_shot_prompt(input_text) for input_text in batch]
    batch_input_tokens = sum(len(tokenizer.encode(prompt)) for prompt in prompts)

    # Copy the shared parameters so the module-level dict is not modified.
    call_params = {**generation_params, "return_full_text": False}

    outputs = []
    batch_start_time = time.perf_counter()  # Start timing for the batch
    for prompt in prompts:
        single_start_time = time.perf_counter()
        output = summarizer(prompt, **call_params)
        # NOTE: this is the wall-clock time of the complete generation call,
        # not a true time-to-first-token; the list name is kept because the
        # reporting code reads ttft_list.
        ttft_list.append(time.perf_counter() - single_start_time)
        outputs.append(output)
    batch_time = time.perf_counter() - batch_start_time

    # Flatten nested outputs if necessary
    if isinstance(outputs[0], list):
        outputs = [item for sublist in outputs for item in sublist]

    if not (isinstance(outputs[0], dict) and 'generated_text' in outputs[0]):
        raise ValueError(f"Unexpected format for outputs: {outputs}")

    # Extract summaries and count output tokens. Special tokens are excluded
    # because the tokenizer adds them; the model did not generate them.
    summaries = [output['generated_text'].strip() for output in outputs]
    batch_output_tokens = sum(
        len(tokenizer.encode(summary, add_special_tokens=False)) for summary in summaries
    )

    # Update metrics
    total_tokens_generated += batch_output_tokens
    total_input_tokens += batch_input_tokens
    total_time_spent += batch_time
    total_summaries += len(batch)  # Count the number of summaries processed
    time_per_input.append(batch_time / len(batch))  # Latency per input (full summary time)
    throughput_list.append((batch_input_tokens + batch_output_tokens) / batch_time)  # Tokens/second

    return summaries

# Initialize metrics
# These are created after process_batch is defined but before it is first
# called, so the names exist by the time the function reads them.
total_summaries = 0
time_per_input = []  # Latency per summary
ttft_list = []  # Time to first token
throughput_list = []  # Re-initialised here; also created in the metrics-tracking section above

# Process inputs in batches
batch_size = 8
with tqdm(total=len(inputs), desc="Processing Rows", unit="row") as pbar:
    for i in range(0, len(inputs), batch_size):
        batch = inputs[i:i + batch_size]
        generated_summaries.extend(process_batch(batch))
        pbar.update(len(batch))

# Metrics Calculation
# Latency is total time divided by the number of summaries. Averaging the
# per-batch means instead would give a short final batch the same weight as a
# full one. Every ratio falls back to 0.0 when nothing was processed, so an
# empty input table does not raise ZeroDivisionError.
average_latency = total_time_spent / total_summaries if total_summaries else 0.0  # Average time per summary
average_ttft = sum(ttft_list) / len(ttft_list) if ttft_list else 0.0  # Average TTFT
average_throughput = sum(throughput_list) / len(throughput_list) if throughput_list else 0.0  # Average tokens per second
token_efficiency = total_tokens_generated / total_input_tokens if total_input_tokens else 0.0  # Output/Input token ratio

# Print metrics
print("\nComputational Efficiency Metrics:")
print(f"Total Input Tokens: {total_input_tokens}")
print(f"Total Output Tokens: {total_tokens_generated}")
print(f"Total Time Spent: {total_time_spent:.2f} seconds")
print(f"Average Latency (Time per Summary): {average_latency:.4f} seconds")
print(f"Average TTFT (Time to First Token): {average_ttft:.4f} seconds")
print(f"Average Throughput: {average_throughput:.2f} tokens/second")
print(f"Token Efficiency (TE): {token_efficiency:.4f}")

# Save results
mimic_iv_bhc_100["generated_summary"] = generated_summaries
mimic_iv_bhc_100.to_csv("generated_summaries_with_metrics.csv", index=False)
print("\nSummaries saved to 'generated_summaries_with_metrics.csv'")


In [None]:
# Widen the pandas display settings for this session.
pd.set_option('display.max_colwidth', None)  # Prevent truncation of long text
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.expand_frame_repr', False)  # Prevent wrapping of content

# Print one generated summary in full; .iloc is positional and zero-based.
print(mimic_iv_bhc_100['generated_summary'].iloc[9])  # Replace 9 with the desired row index
  # Remember: Index starts from 0