In [20]:
import pandas as pd
reduced = pd.read_csv("soap_processed_reduced_texts.csv")
# Display the first few rows
print(reduced.head())

# Check DataFrame info
print(reduced.info())

                                               input  \
0  Good afternoon, champ, how you holding up? Goo...   
1  What brings you in here today? Hi, I'm um, I'm...   
2  Do you have any known allergies to medications...   
3  How may I help you today? Yeah I've had, a fev...   
4  It sounds like that you're experiencing some c...   

                                              output  \
0  Subjective:\n- Symptoms: Lower back pain, radi...   
1  Subjective:\n- Presenting with dry cough for 1...   
2  Subjective:\n- No known allergies to medicatio...   
3  Subjective:\n- Fever and dry cough started 4 d...   
4  Subjective:\n- Presenting with chest pain for ...   

                                        cptf_summary  \
0  <|begin_of_text|>Good afternoon, champ, how yo...   
1  <|begin_of_text|>What brings you in here today...   
2  <|begin_of_text|>Do you have any known allergi...   
3  <|begin_of_text|>How may I help you today? Yea...   
4  <|begin_of_text|>It sounds like that you're

In [21]:
# Select only the 'input' and 'reduced_text' columns
selected_columns = reduced[['input', 'reduced_text', 'output']]

# Optional: Filter out rows where 'reduced_text' is empty or contains only whitespace
filtered_columns = selected_columns[selected_columns['reduced_text'].str.strip() != ""]

# Display the filtered DataFrame
print(filtered_columns.head())


                                               input  \
0  Good afternoon, champ, how you holding up? Goo...   
1  What brings you in here today? Hi, I'm um, I'm...   
2  Do you have any known allergies to medications...   
3  How may I help you today? Yeah I've had, a fev...   
4  It sounds like that you're experiencing some c...   

                                        reduced_text  \
0  <|begin_of_text|>Good afternoon, champ, how yo...   
1  <|begin_of_text|>What brings you in here today...   
2  <|begin_of_text|>Do you have any known allergi...   
3  <|begin_of_text|>How may I help you today? Yea...   
4  <|begin_of_text|>It sounds like that you're ex...   

                                              output  
0  Subjective:\n- Symptoms: Lower back pain, radi...  
1  Subjective:\n- Presenting with dry cough for 1...  
2  Subjective:\n- No known allergies to medicatio...  
3  Subjective:\n- Fever and dry cough started 4 d...  
4  Subjective:\n- Presenting with chest pain for ..

In [22]:
!pip install -q transformers huggingface_hub
!pip install -q --upgrade accelerate
!pip install -q -U bitsandbytes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [23]:
from huggingface_hub import login

# Use your Hugging Face token
login("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")

In [5]:
import os
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd

# -----------------------------------------------------
# 1. Environment setup (optional but often helpful)
# -----------------------------------------------------
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# -----------------------------------------------------
# 2. Load your model and tokenizer
# -----------------------------------------------------
model_name = "meta-llama/Llama-3.2-1B-Instruct"  # Example model name
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",           # Automatic GPU/CPU placement
    torch_dtype=torch.float16     # Use FP16 for reduced memory usage
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eos_token_id

# -----------------------------------------------------
# 3. Define a prompt construction function
# -----------------------------------------------------
def construct_prompt(input_text):
    """
    Constructs an instruction-based prompt for summarization.
    """
    prompt = (
        "You are a medical expert. Summarize the following case in an excellent manner. "
        "Do not include any extra or verbatim text from the input. "
        f"Case:\n{input_text}\n\nSummary:"
    )
    return prompt

# -----------------------------------------------------
# 4. Set your generation parameters
# -----------------------------------------------------
generation_params = {
    "do_sample": True,
    "top_p": 0.8,
    "temperature": 0.2,
    "top_k": 20,
    "max_new_tokens": 300,
    "repetition_penalty": 1.1,
    "eos_token_id": tokenizer.eos_token_id
}

# -----------------------------------------------------
# 5. Load your sample DataFrame (df_sample) with columns "input" and "output"
# -----------------------------------------------------
df_sample = filtered_columns  # This is now your DataFrame with filtered columns

# -----------------------------------------------------
# 6. Summarize your df_sample DataFrame using partial decoding
# -----------------------------------------------------
batch_size = 8  # Adjust as needed
inputs_list = df_sample["reduced_text"].tolist()  # Using the reduced_text column
generated_summaries = []

def process_batch(batch_inputs):
    batch_generated = []
    for text in batch_inputs:
        prompt = construct_prompt(text)
        # Tokenize prompt
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1000)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[1]

        # Generate output tokens
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            **generation_params
        )
        # Slice out only the tokens that were generated after the prompt
        generated_tokens = summary_ids[0, prompt_length:]
        generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
        batch_generated.append(generated_text)
    return batch_generated

# -----------------------------------------------------
# 7. Process the DataFrame in batches with a progress bar
# -----------------------------------------------------
with tqdm(total=len(inputs_list), desc="Generating Summaries", unit="row") as pbar:
    for i in range(0, len(inputs_list), batch_size):
        batch = inputs_list[i:i + batch_size]
        try:
            batch_generated = process_batch(batch)
        except RuntimeError as e:
            if "out of memory" in str(e):
                torch.cuda.empty_cache()
                print("Out of memory error; try reducing batch size.")
            raise e
        generated_summaries.extend(batch_generated)
        torch.cuda.empty_cache()
        pbar.update(len(batch))

# -----------------------------------------------------
# 8. Store and Save
# -----------------------------------------------------
# Add generated summaries as a new column in df_sample
df_sample["generated_summary"] = generated_summaries

# Save the DataFrame with input, reduced_text, and generated_summary
df_sample.to_csv("soap_cptf_generated_summaries.csv", index=False)
print("Summaries saved to 'soap_cptf_generated_summaries.csv'")

Generating Summaries:   0%|          | 0/100 [00:00<?, ?row/s]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Generating Summaries:   8%|▊         | 8/100 [00:43<08:19,  5.43s/row]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end gene

Summaries saved to 'soap_cptf_generated_summaries.csv'





In [None]:
import os
import time
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# -----------------------------------------------------
# 1. Environment setup (optional but often helpful)
# -----------------------------------------------------
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# -----------------------------------------------------
# 2. Load your model and tokenizer
# -----------------------------------------------------
model_name = "meta-llama/Llama-3.2-1B-Instruct"  # Example model name
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",            # Automatic GPU/CPU placement
    torch_dtype=torch.float16     # Use FP16 for reduced memory usage
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eos_token_id

print(f"Model and tokenizer loaded successfully: {model_name}")

# -----------------------------------------------------
# 3. Define a prompt construction function
# -----------------------------------------------------
def construct_prompt(input_text):
    """
    Constructs an instruction-based prompt for summarization.
    """
    prompt = (
        "You are a world class medical expert. Summarize the following case in an excellent manner. "
        "Do not include any extra or verbatim text from the input. "
        f"Case:\n{input_text}\n\nSummary:"
    )
    return prompt

# -----------------------------------------------------
# 4. Set your generation parameters
# -----------------------------------------------------
generation_params = {
    "do_sample": True,
    "top_p": 0.8,
    "temperature": 0.2,
    "top_k": 20,
    "max_new_tokens": 300,
    "repetition_penalty": 1.1,
    "eos_token_id": tokenizer.eos_token_id
}

# -----------------------------------------------------
# 5. Load your sample DataFrame (df_sample) with columns "input" and "reduced_text"
# -----------------------------------------------------
df_sample = filtered_columns  # Assumes 'filtered_columns' is already defined
batch_size = 8
inputs_list = df_sample["reduced_text"].tolist()
generated_summaries = []

# -----------------------------------------------------
# 6. Define batch processing function
# -----------------------------------------------------
def process_batch(batch_inputs):
    batch_generated = []
    for text in batch_inputs:
        prompt = construct_prompt(text)
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1000)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[1]
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            **generation_params
        )
        generated_tokens = summary_ids[0, prompt_length:]
        generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
        batch_generated.append(generated_text)
    return batch_generated

# -----------------------------------------------------
# 7. Generate summaries with performance measurement
# -----------------------------------------------------
start_time = time.time()

with tqdm(total=len(inputs_list), desc="Generating Summaries", unit="row") as pbar:
    for i in range(0, len(inputs_list), batch_size):
        batch = inputs_list[i:i + batch_size]
        try:
            batch_generated = process_batch(batch)
        except RuntimeError as e:
            if "out of memory" in str(e):
                torch.cuda.empty_cache()
                print("Out of memory error; try reducing batch size.")
            raise e
        generated_summaries.extend(batch_generated)
        torch.cuda.empty_cache()
        pbar.update(len(batch))

end_time = time.time()
total_time = end_time - start_time
total_samples = len(inputs_list)
throughput = total_samples / total_time
latency = total_time / total_samples

print(f"\n--- Inference Performance Metrics ---")
print(f"Total examples processed: {total_samples}")
print(f"Total time taken: {total_time:.2f} seconds")
print(f"Average latency per example: {latency:.4f} seconds")
print(f"Throughput: {throughput:.2f} examples/second")

# -----------------------------------------------------
# 8. Store and Save
# -----------------------------------------------------
df_sample["generated_summary"] = generated_summaries
df_sample.to_csv("soap_cptf_generated_summaries.csv", index=False)
print("Summaries saved to 'soap_cptf_generated_summaries.csv'")


Model and tokenizer loaded successfully: meta-llama/Llama-3.2-1B-Instruct


Generating Summaries:   0%|          | 0/100 [00:00<?, ?row/s]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Generating Summaries:   8%|▊         | 8/100 [2:46:49<31:58:32, 1251.22s/row]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-e

### Efficiency Measurement

In [24]:
import os
import torch
import time
import numpy as np
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd

# -----------------------------------------------------
# 1. Environment setup (optional but often helpful)
# -----------------------------------------------------
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


In [25]:
# -----------------------------------------------------
# 2. Load your model and tokenizer
# -----------------------------------------------------
model_name = "meta-llama/Llama-3.2-1B-Instruct"  # Example model name
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",           # Automatic GPU/CPU placement
    torch_dtype=torch.float16     # Use FP16 for reduced memory usage
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eos_token_id

# Print confirmation that model and tokenizer are loaded
print(f"Model and tokenizer loaded successfully: {model_name}")




Model and tokenizer loaded successfully: meta-llama/Llama-3.2-1B-Instruct


In [27]:
# -----------------------------------------------------
# 3. Define a prompt construction function
# -----------------------------------------------------
def construct_prompt(input_text):
    """
    Constructs an instruction-based prompt for summarization.
    """
    prompt = (
        "You are a world class medical expert. Summarize the following case in an excellent manner. "
        "Do not include any extra or verbatim text from the input. "
        f"Case:\n{input_text}\n\nSummary:"
    )
    return prompt

In [28]:
# -----------------------------------------------------
# 4. Set your generation parameters
# -----------------------------------------------------
generation_params = {
    "do_sample": True,
    "top_p": 0.8,
    "temperature": 0.2,
    "top_k": 20,
    "max_new_tokens": 300,
    "repetition_penalty": 1.1,
    "eos_token_id": tokenizer.eos_token_id
}


In [29]:
# -----------------------------------------------------
# 5. Load your sample DataFrame (df_sample) with columns "input" and "output"
# -----------------------------------------------------
df_sample = filtered_columns  # This is now your DataFrame with filtered columns

In [30]:
# -----------------------------------------------------
# 6. Initialize latency and throughput tracking
# -----------------------------------------------------
latency_list = []
throughput_list = []

# -----------------------------------------------------
# 7. Summarize your df_sample DataFrame using partial decoding
# -----------------------------------------------------
batch_size = 4  # Reduced from 8 to avoid memory issues
inputs_list = df_sample["reduced_text"].tolist()
generated_summaries = []

In [31]:
def process_batch(batch_inputs):
    batch_generated = []
    
    for text in batch_inputs:
        prompt = construct_prompt(text)
        
        # Tokenize prompt
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1000)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[1]
        input_token_count = prompt_length
        
        # Start timing
        start_time = time.time()
        
        # Generate output tokens
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            **generation_params
        )
        
        # End timing
        end_time = time.time()
        
        # Calculate metrics
        generated_tokens = summary_ids[0, prompt_length:]
        output_token_count = len(generated_tokens)
        total_token_count = input_token_count + output_token_count
        
        # Latency in seconds
        latency = end_time - start_time
        latency_list.append(latency)
        
        # Throughput in tokens per second
        throughput = total_token_count / latency if latency > 0 else 0
        throughput_list.append(throughput)
        
        # Decode the generated text
        generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
        batch_generated.append(generated_text)
        
        # Print progress for each item to ensure script is running
        print(f"Processed item: Latency={latency:.2f}s, Throughput={throughput:.2f} tokens/s")
        
    return batch_generated

In [32]:
# -----------------------------------------------------
# 8. Process the DataFrame in batches with a progress bar
# -----------------------------------------------------
print(f"Starting to process {len(inputs_list)} items with batch size {batch_size}")

with tqdm(total=len(inputs_list), desc="Generating Summaries", unit="row") as pbar:
    for i in range(0, len(inputs_list), batch_size):
        batch = inputs_list[i:i + batch_size]
        batch_size_actual = len(batch)
        print(f"Processing batch {i//batch_size + 1}/{(len(inputs_list) + batch_size - 1)//batch_size}, size={batch_size_actual}")
        
        try:
            batch_generated = process_batch(batch)
            generated_summaries.extend(batch_generated)
            print(f"Batch {i//batch_size + 1} completed successfully")
        except RuntimeError as e:
            if "out of memory" in str(e):
                torch.cuda.empty_cache()
                print(f"Out of memory error in batch {i//batch_size + 1}; try reducing batch size.")
                # Try again with smaller batch
                for single_text in batch:
                    try:
                        result = process_batch([single_text])
                        generated_summaries.extend(result)
                        print("Processed single item successfully after batch failure")
                    except Exception as inner_e:
                        print(f"Error processing single item: {inner_e}")
                        generated_summaries.append("Error generating summary")
            else:
                print(f"Error in batch {i//batch_size + 1}: {e}")
                # Add placeholders for failed batch
                generated_summaries.extend(["Error generating summary"] * len(batch))
        
        torch.cuda.empty_cache()
        pbar.update(batch_size_actual)
        print(f"Progress: {min(i + batch_size, len(inputs_list))}/{len(inputs_list)} items processed")

# -----------------------------------------------------
# 9. Calculate and print metrics
# -----------------------------------------------------
if latency_list:
    mean_latency = np.mean(latency_list)
    std_latency = np.std(latency_list)
    mean_throughput = np.mean(throughput_list)
    std_throughput = np.std(throughput_list)

    # Print metrics
    print("\nComputational Efficiency Metrics:")
    print(f"Average Latency (Time per Summary): {mean_latency:.2f} ± {std_latency:.2f} seconds")
    print(f"Average Throughput: {mean_throughput:.2f} ± {std_throughput:.2f} tokens/second")

    # Print formatted for LaTeX table
    print("\nFor LaTeX Table:")
    print(f"${mean_throughput:.2f} \\pm {std_throughput:.2f}$ & ${mean_latency:.2f} \\pm {std_latency:.2f}$ \\\\")

    # Save metrics to a separate file
    with open("cptf_soap_efficiency_metrics.txt", "w") as f:
        f.write(f"Model: {model_name} with CPTF for SOAP\n")
        f.write(f"Parameters: Token=300, Temp=0.2\n")
        f.write(f"Throughput: {mean_throughput:.2f} ± {std_throughput:.2f} tokens/second\n")
        f.write(f"Latency: {mean_latency:.2f} ± {std_latency:.2f} seconds\n")
        f.write(f"\nFor LaTeX Table:\n")
        f.write(f"${mean_throughput:.2f} \\pm {std_throughput:.2f}$ & ${mean_latency:.2f} \\pm {std_latency:.2f}$ \\\\\n")
else:
    print("No metrics were collected.")

# -----------------------------------------------------
# 10. Store and Save
# -----------------------------------------------------
# Make sure the lengths match
if len(generated_summaries) < len(df_sample):
    print(f"Warning: Generated {len(generated_summaries)} summaries but dataframe has {len(df_sample)} rows")
    # Pad with error messages if needed
    generated_summaries.extend(["Error generating summary"] * (len(df_sample) - len(generated_summaries)))
elif len(generated_summaries) > len(df_sample):
    print(f"Warning: Generated {len(generated_summaries)} summaries but dataframe has {len(df_sample)} rows")
    generated_summaries = generated_summaries[:len(df_sample)]

# Add generated summaries as a new column in df_sample
df_sample["generated_summary"] = generated_summaries

# Save the DataFrame with input, reduced_text, and generated_summary
df_sample.to_csv("soap_cptf_generated_summaries.csv", index=False)
print("Summaries saved to 'soap_cptf_generated_summaries.csv'")

Starting to process 100 items with batch size 4


Generating Summaries:   0%|          | 0/100 [00:00<?, ?row/s]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Processing batch 1/25, size=4


Generating Summaries:   0%|          | 0/100 [11:27<?, ?row/s]


KeyboardInterrupt: 