In [4]:
!pip install -q transformers huggingface_hub
!pip install -q --upgrade accelerate
!pip install -q -U bitsandbytes

In [5]:
import pandas as pd

# Load the file
processed_df = pd.read_csv("processed_reduced_texts.csv")

# Display the first 5 rows
print(processed_df.head(5))


          note_id                                              input  \
0  16002318-DS-17  <SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...   
1   15638884-DS-4  <SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...   
2  12435705-DS-14  <SEX> M <SERVICE> MEDICINE <ALLERGIES> ibuprof...   
3   12413577-DS-4  <SEX> F <SERVICE> OBSTETRICS/GYNECOLOGY <ALLER...   
4  17967161-DS-29  <SEX> M <SERVICE> SURGERY <ALLERGIES> lisinopr...   

                                              target  input_tokens  \
0  This is a ___ yo F admitted to the hospital af...          1195   
1  Mr. ___ is a ___ yo man with CAD with prior MI...          3496   
2  Mr. ___ is a ___ w/ Ph+ve ALL on dasatanib and...          5591   
3  On ___, Ms. ___ was admitted to the gynecology...          1119   
4  Mr. ___ underwent an angiogram on ___ which sh...          3307   

   target_tokens                                       reduced_text  \
0             75  <|begin_of_text|><SEX> F <SERVICE> SURGERY <AL...   
1   

In [6]:
import torch

if torch.cuda.is_available():
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU available")


GPU is available: Tesla V100-SXM2-16GB


In [7]:
from huggingface_hub import login

# Use your Hugging Face token
login("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")

In [8]:
import torch
from tqdm import tqdm
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import pandas as pd
import os

In [9]:
# Set environment variable for better memory management
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Configure quantization (8-bit)
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)
print("Environment setup and quantization configuration done.")


Environment setup and quantization configuration done.


In [10]:
from tqdm import tqdm
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
import torch
import os

# Set environment variable for better memory management
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

print("Loading model and tokenizer...")
with tqdm(total=2, desc="Initializing Model and Tokenizer", unit="step") as pbar:
    model_name = "meta-llama/Llama-3.2-1B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=torch.float16       
    )
    pbar.update(1)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = 'left'
    tokenizer.pad_token_id = tokenizer.eos_token_id
    pbar.update(1)
print("Model and tokenizer loaded successfully.")

Loading model and tokenizer...


Initializing Model and Tokenizer:   0%|          | 0/2 [00:00<?, ?step/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Initializing Model and Tokenizer:  50%|█████     | 1/2 [01:12<01:12, 72.00s/step]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Initializing Model and Tokenizer: 100%|██████████| 2/2 [01:13<00:00, 36.72s/step]

Model and tokenizer loaded successfully.





In [11]:
from transformers import pipeline

# Initialize the summarization pipeline
summarizer = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)
print("Summarization pipeline initialized.")

Device set to use cuda:0


Summarization pipeline initialized.


In [21]:
from tqdm import tqdm
import time

# ✅ Your Few-Shot Prompt (Updated)
FEW_SHOT_EXAMPLES = """
You are an EXPERT AT WRITING CLINICAL SUMMARY. Summarize the input text in a very cohesive manner. Maintain storytelling style. 
Example 1:
Input: 45-year-old male with diabetes presented with chest pain and shortness of breath. ECG showed myocardial infarction. Patient treated with aspirin and admitted to cardiac unit.
Summary: 45-year-old male with diabetes, chest pain, MI confirmed by ECG, treated with aspirin, admitted.

Example 2:
Input: 72-year-old female with history of hypertension and stroke admitted with slurred speech, left-sided weakness. MRI confirmed acute ischemic stroke. Started on anticoagulation and monitored in ICU.
Summary: 72-year-old female with hypertension, stroke, slurred speech, left-sided weakness, ischemic stroke confirmed by MRI, started on anticoagulation.

Example 3:
Input: 60-year-old female with chronic kidney disease, admitted for worsening kidney function. Lab tests showed elevated creatinine. Dialysis started.
Summary:
 
Now summarize the following clinical note using only provided input. Do not add anything from the prompt. If unsure, say 'UNKNOWN':
"""


# ✅ Generation parameters (Fixed)
generation_params = {
    "do_sample": True,
    "top_p": 0.8,
    "temperature": 0.9,
    "top_k": 80,
    "max_new_tokens": 300,
    "repetition_penalty": 1.1,
    "use_cache": True,
    "pad_token_id": tokenizer.eos_token_id,  # Ensures valid padding
}

batch_size = 8  # Adjust based on memory

# ✅ Metrics tracking
total_input_tokens = 0
total_output_tokens = 0
total_time_spent = 0
ttft_list = []  # Time to first token
latency_list = []  # Time per summary
throughput_list = []  # Tokens processed per second

# ✅ Function to Measure TTFT (NO Streaming Output)
def measure_ttft(prompt, model, tokenizer):
    start_time = time.time() * 1000  # Convert to milliseconds
    first_token_time = None

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)

    # ✅ Generate output (NO STREAMER → No real-time output)
    full_output = model.generate(
        **inputs,
        **generation_params  # ✅ No streamer, generates silently
    )

    # Capture TTFT
    first_token_time = time.time() * 1000
    ttft = first_token_time - start_time

    # Decode full response
    generated_text = tokenizer.decode(full_output[0], skip_special_tokens=True).strip()
    return ttft, {"generated_text": generated_text}

# ✅ Batch Processing (WITH `tqdm` Only)
print("Generating summaries in batches...")
generated_summaries = []

for i in tqdm(range(0, len(processed_df), batch_size), desc="Processing Batches"):
    batch = processed_df["reduced_text"][i:i + batch_size].tolist()

    try:
        # ✅ Keep Few-Shot Prompt
        prompts = [f"{FEW_SHOT_EXAMPLES}\n{text}" for text in batch]
        batch_input_tokens = sum(len(tokenizer.encode(prompt)) for prompt in prompts)

        batch_start_time = time.time()
        summaries = []

        for prompt in prompts:
            single_start_time = time.time()

            # ✅ Get TTFT and Full Response (NO Printing Each Output)
            ttft, output = measure_ttft(prompt, model, tokenizer)
            ttft_list.append(ttft)

            # ✅ Store Summary
            summaries.append(output["generated_text"].strip())

        batch_end_time = time.time()

        # ✅ Compute Metrics
        batch_output_tokens = sum(len(tokenizer.encode(summary)) for summary in summaries)
        batch_latency = batch_end_time - batch_start_time  # Total batch time
        latency_list.append(batch_latency / len(batch))  # Average latency per summary
        throughput_list.append((batch_input_tokens + batch_output_tokens) / batch_latency)

        # ✅ Update Global Metrics
        total_input_tokens += batch_input_tokens
        total_output_tokens += batch_output_tokens
        total_time_spent += batch_latency

        # ✅ Store Summaries
        generated_summaries.extend(summaries)

    except Exception as e:
        print(f"Error generating summaries for batch starting at index {i}: {e}")
        generated_summaries.extend([""] * len(batch))  # Fill with empty summaries if failed

# ✅ Add Summaries to DataFrame
processed_df["generated_summary"] = generated_summaries

# ✅ Compute Final Metrics
average_latency = sum(latency_list) / len(latency_list) if latency_list else 0
average_ttft = sum(ttft_list) / len(ttft_list) if ttft_list else 0  # Now in milliseconds
average_throughput = sum(throughput_list) / len(throughput_list) if throughput_list else 0
token_efficiency = total_output_tokens / total_input_tokens if total_input_tokens else 0

# ✅ Print Final Metrics (Only Once)
print("\nComputational Efficiency Metrics:")
print(f"Total Input Tokens: {total_input_tokens}")
print(f"Total Output Tokens: {total_output_tokens}")
print(f"Total Time Spent: {total_time_spent:.2f} seconds")
print(f"Average Latency (Time per Summary): {average_latency:.4f} seconds")
print(f"Average TTFT (Time to First Token): {average_ttft:.2f} ms")  # Display in milliseconds
print(f"Average Throughput: {average_throughput:.2f} tokens/second")
print(f"Token Efficiency (TE): {token_efficiency:.4f}")

# ✅ Save Results
processed_df.to_csv("FLP_fixed_ttft_fewshot_summaries.csv", index=False)
print("\nSummaries saved to 'FLP_fixed_ttft_fewshot_summaries.csv'")


Generating summaries in batches...


Processing Batches: 100%|██████████| 13/13 [22:17<00:00, 102.87s/it]


Computational Efficiency Metrics:
Total Input Tokens: 153085
Total Output Tokens: 173773
Total Time Spent: 1336.44 seconds
Average Latency (Time per Summary): 13.3032 seconds
Average TTFT (Time to First Token): 13361.37 ms
Average Throughput: 258.51 tokens/second
Token Efficiency (TE): 1.1351

Summaries saved to 'FLP_fixed_ttft_fewshot_summaries.csv'





In [None]:
from tqdm import tqdm
import time

# ✅ YOUR ORIGINAL FEW-SHOT PROMPT (RESTORED)
FEW_SHOT_EXAMPLES = """
You are an EXPERT AT WRITING CLINICAL SUMMARY. Summarize the input text in a very cohesive manner. Maintain storytelling style. 
Example 1:
Input: 45-year-old male with diabetes presented with chest pain and shortness of breath. ECG showed myocardial infarction. Patient treated with aspirin and admitted to cardiac unit.
Summary: 45-year-old male with diabetes, chest pain, MI confirmed by ECG, treated with aspirin, admitted.

Example 2:
Input: 72-year-old female with history of hypertension and stroke admitted with slurred speech, left-sided weakness. MRI confirmed acute ischemic stroke. Started on anticoagulation and monitored in ICU.
Summary: 72-year-old female with hypertension, stroke, slurred speech, left-sided weakness, ischemic stroke confirmed by MRI, started on anticoagulation.

Example 3 (Correcting Hallucination):
Input: 60-year-old female with chronic kidney disease, admitted for worsening kidney function. Lab tests showed elevated creatinine. Dialysis started.
Summary: 60-year-old female with kidney disease, worsening function, elevated creatinine, started on dialysis.

Now summarize the following clinical note using only its information. Do not add or infer anything. If unsure, say 'UNKNOWN':
"""

# ✅ GENERATION PARAMETERS
generation_params = {
    "do_sample": True,
    "top_p": 0.8,
    "temperature": 0.6,
    "top_k": 40,
    "max_new_tokens": 150,
    "repetition_penalty": 1.2,
    "stream": True  # Enables real-time token streaming for TTFT
}

batch_size = 4  # Balanced batch size for 8-bit models

# ✅ METRICS TRACKING
total_input_tokens = 0
total_output_tokens = 0
total_time_spent = 0
ttft_list = []  # Time to first token
latency_list = []  # Time per summary
throughput_list = []  # Tokens processed per second

# ✅ FUNCTION TO MEASURE TTFT USING STREAMING INFERENCE
def measure_ttft(prompt):
    start_time = time.time()
    
    # Stream the response and capture first token
    for response in summarizer(prompt, **generation_params):
        first_token_time = time.time()
        return first_token_time - start_time, response  # Return TTFT & first token

# ✅ BATCH PROCESSING
print("Generating summaries in batches...")
generated_summaries = []

for i in tqdm(range(0, len(processed_df), batch_size), desc="Processing Batches"):
    batch = processed_df["reduced_text"][i:i + batch_size].tolist()

    try:
        # ✅ KEEP YOUR FEW-SHOT PROMPT UNCHANGED
        prompts = [f"{FEW_SHOT_EXAMPLES}\n{text}" for text in batch]
        batch_input_tokens = sum(len(tokenizer.encode(prompt)) for prompt in prompts)

        # Measure batch start time
        batch_start_time = time.time()
        summaries = []

        for prompt in prompts:
            single_start_time = time.time()

            # ✅ GET TTFT AND FIRST TOKEN VIA STREAMING
            ttft, output = measure_ttft(prompt)
            ttft_list.append(ttft)

            # ✅ CONTINUE GENERATING REMAINING TOKENS
            generated_text = output[0]["generated_text"].strip()
            summaries.append(generated_text)

        batch_end_time = time.time()

        # ✅ COMPUTE METRICS
        batch_output_tokens = sum(len(tokenizer.encode(summary)) for summary in summaries)
        batch_latency = batch_end_time - batch_start_time  # Total batch time
        latency_list.append(batch_latency / len(batch))  # Average latency per summary
        throughput_list.append((batch_input_tokens + batch_output_tokens) / batch_latency)

        # ✅ UPDATE GLOBAL METRICS
        total_input_tokens += batch_input_tokens
        total_output_tokens += batch_output_tokens
        total_time_spent += batch_latency

        # ✅ STORE SUMMARIES
        generated_summaries.extend(summaries)

    except Exception as e:
        print(f"Error generating summaries for batch starting at index {i}: {e}")
        generated_summaries.extend([""] * len(batch))  # Fill with empty summaries in case of failure

# ✅ ADD SUMMARIES TO DATAFRAME
processed_df["generated_summary"] = generated_summaries

# ✅ METRICS CALCULATION
average_latency = sum(latency_list) / len(latency_list) if latency_list else 0
average_ttft = sum(ttft_list) / len(ttft_list) if ttft_list else 0
average_throughput = sum(throughput_list) / len(throughput_list) if throughput_list else 0
token_efficiency = total_output_tokens / total_input_tokens if total_input_tokens else 0

# ✅ PRINT METRICS
print("\nComputational Efficiency Metrics:")
print(f"Total Input Tokens: {total_input_tokens}")
print(f"Total Output Tokens: {total_output_tokens}")
print(f"Total Time Spent: {total_time_spent:.2f} seconds")
print(f"Average Latency (Time per Summary): {average_latency:.4f} seconds")
print(f"Average TTFT (Time to First Token): {average_ttft:.4f} seconds")
print(f"Average Throughput: {average_throughput:.2f} tokens/second")
print(f"Token Efficiency (TE): {token_efficiency:.4f}")

# ✅ SAVE RESULTS
processed_df.to_csv("FLP_fixed_ttft_fewshot_summaries.csv", index=False)
print("\nSummaries saved to 'FLP_fixed_ttft_fewshot_summaries.csv'")


In [10]:
from tqdm import tqdm
import time

# Stronger Prompt to Prevent Hallucination
FEW_SHOT_EXAMPLES = """
You are an expert at writing clinical summary. Summarize the input text in a very cohesive manner. Maintain storytelling manner. 
Example 1:
Input: 45-year-old male with diabetes presented with chest pain and shortness of breath. ECG showed myocardial infarction. Patient treated with aspirin and admitted to cardiac unit.
Summary: 45-year-old male with diabetes, chest pain, MI confirmed by ECG, treated with aspirin, admitted.

Example 2:
Input: 72-year-old female with history of hypertension and stroke admitted with slurred speech, left-sided weakness. MRI confirmed acute ischemic stroke. Started on anticoagulation and monitored in ICU.
Summary: 72-year-old female with hypertension, stroke, slurred speech, left-sided weakness, ischemic stroke confirmed by MRI, started on anticoagulation.

Example 3 (Correcting Hallucination):
Input: 60-year-old female with chronic kidney disease, admitted for worsening kidney function. Lab tests showed elevated creatinine. Dialysis started.
INCORRECT Summary: 60-year-old female with kidney disease, received kidney transplant. 
CORRECT Summary: 60-year-old female with kidney disease, worsening function, elevated creatinine, started on dialysis.

Do not add or infer anything from the few shot example. If unsure, say 'UNKNOWN':
"""

# Generation parameters
generation_params = {
    "do_sample": True,
    "top_p": 0.8,
    "temperature": 0.6,  # Lowered temperature to reduce hallucination
    "top_k": 40,
    "max_new_tokens": 150,
    "repetition_penalty": 1.2,  # Increased to avoid prompt repetition
    "return_full_text": False
}

batch_size = 4  # Balanced batch size for 8-bit models

# Metrics tracking
total_input_tokens = 0
total_output_tokens = 0
total_time_spent = 0
ttft_list = []  # Time to first token
latency_list = []  # Time per summary
throughput_list = []  # Tokens processed per second

# Batch processing
print("Generating summaries in batches...")
generated_summaries = []

for i in tqdm(range(0, len(processed_df), batch_size), desc="Processing Batches"):
    batch = processed_df["reduced_text"][i:i + batch_size].tolist()

    try:
        # Construct Few-Shot Prompt with input text
        prompts = [f"{FEW_SHOT_EXAMPLES}\n{text}" for text in batch]
        batch_input_tokens = sum(len(tokenizer.encode(prompt)) for prompt in prompts)

        # Measure batch start time
        batch_start_time = time.time()
        summaries = []

        for prompt in prompts:
            single_start_time = time.time()

            # Generate text while measuring TTFT correctly
            output = summarizer(prompt, **generation_params)
            first_token_time = time.time()  # Capture time when first token is generated

            # Store TTFT (time to first token)
            ttft_list.append(first_token_time - single_start_time)

            # Extract generated text correctly
            generated_text = output[0]["generated_text"].strip()
            summaries.append(generated_text)

        batch_end_time = time.time()

        # Compute batch-level metrics
        batch_output_tokens = sum(len(tokenizer.encode(summary)) for summary in summaries)
        batch_latency = batch_end_time - batch_start_time  # Total batch time
        latency_list.append(batch_latency / len(batch))  # Average latency per summary
        throughput_list.append((batch_input_tokens + batch_output_tokens) / batch_latency)

        # Update global metrics
        total_input_tokens += batch_input_tokens
        total_output_tokens += batch_output_tokens
        total_time_spent += batch_latency

        # Store summaries
        generated_summaries.extend(summaries)

    except Exception as e:
        print(f"Error generating summaries for batch starting at index {i}: {e}")
        generated_summaries.extend([""] * len(batch))  # Fill with empty summaries in case of failure

# Add the summaries to the DataFrame
processed_df["generated_summary"] = generated_summaries

# Metrics Calculation
average_latency = sum(latency_list) / len(latency_list) if latency_list else 0
average_ttft = sum(ttft_list) / len(ttft_list) if ttft_list else 0
average_throughput = sum(throughput_list) / len(throughput_list) if throughput_list else 0
token_efficiency = total_output_tokens / total_input_tokens if total_input_tokens else 0

# Print metrics
print("\nComputational Efficiency Metrics:")
print(f"Total Input Tokens: {total_input_tokens}")
print(f"Total Output Tokens: {total_output_tokens}")
print(f"Total Time Spent: {total_time_spent:.2f} seconds")
print(f"Average Latency (Time per Summary): {average_latency:.4f} seconds")
print(f"Average TTFT (Time to First Token): {average_ttft:.4f} seconds")
print(f"Average Throughput: {average_throughput:.2f} tokens/second")
print(f"Token Efficiency (TE): {token_efficiency:.4f}")

# Save results
processed_df.to_csv("FLP_strict_generated_summaries.csv", index=False)
print("\nSummaries saved to 'FLP_strict_generated_summaries.csv'")


Generating summaries in batches...


Processing Batches:   8%|▊         | 2/25 [01:11<13:14, 34.53s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing Batches: 100%|██████████| 25/25 [14:55<00:00, 35.83s/it]


Computational Efficiency Metrics:
Total Input Tokens: 156585
Total Output Tokens: 12955
Total Time Spent: 895.29 seconds
Average Latency (Time per Summary): 8.9529 seconds
Average TTFT (Time to First Token): 8.9529 seconds
Average Throughput: 193.03 tokens/second
Token Efficiency (TE): 0.0827

Summaries saved to 'FLP_strict_generated_summaries.csv'





In [24]:
processed_df.to_csv("FLP_generated_summaries_with_metrics.csv", index=False)
print("\nSummaries saved to 'FLP_generated_summaries_with_metrics.csv'")



Summaries saved to 'FLP_generated_summaries_with_metrics.csv'


In [19]:
from tqdm import tqdm
import time

# Generation parameters
generation_params = {
    "do_sample": True,
    "top_p": 0.8,
    "temperature": 0.7,
    "top_k": 40,
    "max_new_tokens": 100,
    "repetition_penalty": 1.1
}

batch_size = 8  # Adjust based on available memory

# Metrics tracking
total_input_tokens = 0
total_output_tokens = 0
total_time_spent = 0
ttft_list = []  # Time to first token
latency_list = []  # Time per summary
throughput_list = []  # Tokens processed per second

# Batch processing
print("Generating summaries in batches...")
generated_summaries = []

for i in tqdm(range(0, len(processed_df), batch_size), desc="Processing Batches"):
    batch = processed_df["reduced_text"][i:i + batch_size].tolist()

    try:
        # Construct prompts for the batch
        prompts = [f"You are a medical expert. {text}" for text in batch]
        batch_input_tokens = sum(len(tokenizer.encode(prompt)) for prompt in prompts)

        # Generate summaries and measure TTFT and latency
        batch_start_time = time.time()
        summaries = []
        for prompt in prompts:
            single_start_time = time.time()
            output = summarizer(prompt, **generation_params)
            single_end_time = time.time()
            ttft_list.append(single_end_time - single_start_time)  # Time to first token
            summaries.append(output[0]["generated_text"])  # Extract generated text
        batch_end_time = time.time()

        # Calculate batch metrics
        batch_output_tokens = sum(len(tokenizer.encode(summary)) for summary in summaries)
        batch_latency = batch_end_time - batch_start_time  # Total time for the batch
        latency_list.append(batch_latency / len(batch))  # Average latency per summary
        throughput_list.append((batch_input_tokens + batch_output_tokens) / batch_latency)

        # Update global metrics
        total_input_tokens += batch_input_tokens
        total_output_tokens += batch_output_tokens
        total_time_spent += batch_latency

        # Store summaries
        generated_summaries.extend(summaries)

    except Exception as e:
        print(f"Error generating summaries for batch starting at index {i}: {e}")
        generated_summaries.extend([""] * len(batch))  # Fill with empty summaries in case of failure

# Add the summaries to the DataFrame
processed_df["generated_summary"] = generated_summaries

# Metrics Calculation
average_latency = sum(latency_list) / len(latency_list) if latency_list else 0
average_ttft = sum(ttft_list) / len(ttft_list) if ttft_list else 0
average_throughput = sum(throughput_list) / len(throughput_list) if throughput_list else 0
token_efficiency = total_output_tokens / total_input_tokens if total_input_tokens else 0

# Print metrics
print("\nComputational Efficiency Metrics:")
print(f"Total Input Tokens: {total_input_tokens}")
print(f"Total Output Tokens: {total_output_tokens}")
print(f"Total Time Spent: {total_time_spent:.2f} seconds")
print(f"Average Latency (Time per Summary): {average_latency:.4f} seconds")
print(f"Average TTFT (Time to First Token): {average_ttft:.4f} seconds")
print(f"Average Throughput: {average_throughput:.2f} tokens/second")
print(f"Token Efficiency (TE): {token_efficiency:.4f}")

# Save results
processed_df.to_csv("generated_summaries_with_metrics.csv", index=False)
print("\nSummaries saved to 'generated_summaries_with_metrics.csv'")


Generating summaries in batches...


Processing Batches: 100%|██████████| 13/13 [10:30<00:00, 48.50s/it]


Computational Efficiency Metrics:
Total Input Tokens: 129585
Total Output Tokens: 139251
Total Time Spent: 629.88 seconds
Average Latency (Time per Summary): 6.3091 seconds
Average TTFT (Time to First Token): 6.2988 seconds
Average Throughput: 428.38 tokens/second
Token Efficiency (TE): 1.0746

Summaries saved to 'generated_summaries_with_metrics.csv'



