In [1]:
!pip install -q transformers huggingface_hub
!pip install -q --upgrade accelerate
!pip install -q -U bitsandbytes
!pip install -q torch

In [2]:
import pandas as pd

# Define the bucket and file names
bucket_name = 'agentsum'  # Replace with your bucket name
soap = f's3://{bucket_name}/sample_summary.csv'

# Load the files
soap = pd.read_csv(soap)

# Display the data
soap.head(99)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Unnamed: 0,input,output
0,"Good afternoon, champ, how you holding up? Goo...","Subjective:\n- Symptoms: Lower back pain, radi..."
1,"What brings you in here today? Hi, I'm um, I'm...",Subjective:\n- Presenting with dry cough for 1...
2,Do you have any known allergies to medications...,Subjective:\n- No known allergies to medicatio...
3,"How may I help you today? Yeah I've had, a fev...",Subjective:\n- Fever and dry cough started 4 d...
4,It sounds like that you're experiencing some c...,Subjective:\n- Presenting with chest pain for ...
...,...,...
94,"What brings you in? Hi. Uh, I've just had this...",Subjective:\n- Cough for the past week\n- Pers...
95,"Good morning, ma'am. Oh, good morning, doctor....",Subjective:\n- Symptoms: Difficulty using stai...
96,Are you aware of any medical problems in your ...,Subjective:\n- No medical problems reported in...
97,What's going on with you? What brings you here...,Subjective:\n- Symptoms: Loose watery stools\n...


In [3]:
from huggingface_hub import login
from huggingface_hub import whoami
import getpass

# Prompt the user for the Hugging Face token at runtime
hf_token = getpass.getpass("Enter your Hugging Face token: ")

# Log in using the provided token
login(token=hf_token)


print(whoami(token=hf_token))

Enter your Hugging Face token:  ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


{'type': 'user', 'id': '6644f24fe7ae8316ebf3fee4', 'name': 'LizaPiya', 'fullname': 'Fahmida Liza Piya', 'email': 'lizapiya@udel.edu', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': '/avatars/d43d60b3eba464c3f9b44c34e43b64d6.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'Clinical Note LLama', 'role': 'write', 'createdAt': '2024-06-03T19:29:07.142Z'}}}


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Load model and tokenizer
model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Medical dialogue summarization function
def generate_summary(dialogue):
    prompt = f"""You are a medical expert. Analyze the following patient-doctor dialogue and provide a concise medical summary in SOAP format (Subjective, Objective, Assessment, Plan):\n\n{dialogue}\n\nSummary:"""
    
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True).replace(prompt, "").strip()

# Apply to SOAP dataset
def process_soap_data(df):
    results = []
    
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Generating summaries"):
        try:
            summary = generate_summary(row['input'])
            summary_tokens = len(tokenizer.encode(summary))
            input_tokens = len(tokenizer.encode(row['input']))
            target_tokens = len(tokenizer.encode(row['output']))
            
            results.append({
                'sample_id': idx,
                'original_input': row['input'],
                'generated_summary': summary,
                'target_summary': row['output'],
                'summary_token_count': summary_tokens,
                'input_tokens': input_tokens,
                'target_tokens': target_tokens
            })
        except Exception as e:
            print(f"Error processing sample {idx}: {str(e)}")
            results.append({
                'sample_id': idx,
                'original_input': row['input'],
                'generated_summary': "ERROR: Could not generate summary",
                'target_summary': row['output'],
                'summary_token_count': 0,
                'input_tokens': len(tokenizer.encode(row['input'])) if pd.notna(row['input']) else 0,
                'target_tokens': len(tokenizer.encode(row['output'])) if pd.notna(row['output']) else 0
            })
    
    return pd.DataFrame(results)

# Load your SOAP dataset
# Replace 'soap' with your actual dataframe variable name
print("Loading and processing SOAP dataset...")
print(f"Total samples to process: {len(soap)}")

# Process the dataset
full_results = process_soap_data(soap)

# Save results
output_filename = 'llama_3.2_3b_summaries_soap_dataset.csv'
full_results.to_csv(output_filename, index=False)
print(f"\nAll results saved to '{output_filename}'")

# Display some basic statistics
print(f"\nDataset Statistics:")
print(f"Total samples processed: {len(full_results)}")
print(f"Successful generations: {len(full_results[full_results['generated_summary'] != 'ERROR: Could not generate summary'])}")
print(f"Failed generations: {len(full_results[full_results['generated_summary'] == 'ERROR: Could not generate summary'])}")

# Display average token counts
successful_results = full_results[full_results['generated_summary'] != 'ERROR: Could not generate summary']
if len(successful_results) > 0:
    print(f"\nAverage token counts:")
    print(f"Input tokens: {successful_results['input_tokens'].mean():.1f}")
    print(f"Target tokens: {successful_results['target_tokens'].mean():.1f}")
    print(f"Generated summary tokens: {successful_results['summary_token_count'].mean():.1f}")

# Show a sample result
if len(successful_results) > 0:
    print(f"\nSample result:")
    sample = successful_results.iloc[0]
    print(f"Input (first 200 chars): {sample['original_input'][:200]}...")
    print(f"Generated summary: {sample['generated_summary']}")
    print(f"Target summary: {sample['target_summary']}")

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Loading and processing SOAP dataset...
Total samples to process: 100


Generating summaries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [12:21<00:00,  7.42s/it]


All results saved to 'llama_3.2_3b_summaries_soap_dataset.csv'

Dataset Statistics:
Total samples processed: 100
Successful generations: 100
Failed generations: 0

Average token counts:
Input tokens: 454.6
Target tokens: 157.3
Generated summary tokens: 321.6

Sample result:
Input (first 200 chars): Good afternoon, champ, how you holding up? Good afternoon, Doctor, I have a lot of lower back pain. Oh no, before we begin, how old are you, sir and which hand do you write with? I'm seventy five now....
Generated summary: The patient, a 75-year-old male, presents with worsening lower back pain that began on December 3, 1995. He reports radiating pain down his legs, which started 3 days after the initial pain began, and weakness in his legs. He has been treated with antiinflammatories, which provided temporary relief but did not address the underlying cause of his symptoms. The patient has seen multiple doctors, including a primary care physician and an orthopedist, who performed imaging st




### Traditional Metrics

In [6]:
!pip install -q nltk bert-score
!pip install -q rouge-metric

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
import sys
sys.path.append("..")
# Run the entire script
from Soap_llama import run_llama_evaluation

# Run evaluation
df_results = run_llama_evaluation("llama_3.2_3b_summaries_soap_dataset.csv")


üöÄ Starting Llama-3.2-3B SOAP Evaluation
üìÇ Loaded 100 samples from llama_3.2_3b_summaries_soap_dataset.csv
üìã Data columns: ['sample_id', 'original_input', 'generated_summary', 'target_summary', 'summary_token_count', 'input_tokens', 'target_tokens']
üìè Data shape: (100, 7)
üìä Evaluating 100 Llama-generated summaries...
üìù Summary column: generated_summary
üéØ Reference column: target_summary

üî¢ Computing BLEU and ROUGE-L scores...


Processing Rows: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:01<00:00, 67.41it/s]



üß† Computing BERTScore...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho


üìä LLAMA-3.2-3B SOAP EVALUATION RESULTS
Metric       Mean ¬± Std           Min      Max      Median  
----------------------------------------------------------------------
BLEU1        15.116 ¬± 9.353    1.55     43.71    12.71   
BLEU2        8.095 ¬± 6.168    0.43     28.85    6.62    
ROUGE_L      12.529 ¬± 5.311    2.48     25.32    12.15   
BERT_P       83.447 ¬± 2.922    77.19    88.60    83.96   
BERT_R       84.581 ¬± 2.540    78.07    89.14    84.84   
BERT_F1      83.986 ¬± 2.338    77.83    88.53    84.20   

üìà BASELINE METRICS SUMMARY (for table)
BLEU-1: 15.12 ¬± 9.35
BLEU-2: 8.10 ¬± 6.17
ROUGE-L: 12.53 ¬± 5.31
BERTScore-F1: 83.99 ¬± 2.34

üìè TOKEN LENGTH ANALYSIS
Generated Summary Tokens:
  Mean ¬± Std: 321.6 ¬± 489.3
  Target: 200 tokens
  Range: 125 - 2248
  Within 180-220: 91/100 (91.0%)

Target Summary Tokens:
  Mean ¬± Std: 157.3 ¬± 118.3
  Range: 37 - 473

üíæ Results saved to: llama_3.2_3b_summaries_soap_dataset_evaluation_results.csv

üìã SAMPLE RESULTS


### Llm_as_a_judge

In [11]:
import sys
sys.path.append("..")
# Run the entire script
%run llm_as_a_judge_llama.py

Testing single Llama-3.2-3B SOAP sample...
üß™ Testing single Llama-3.2-3B SOAP sample evaluation...
Testing sample 0 from 100 total samples
Sample ID: 0
Summary token count: 201


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



üìÑ Source length: 1701 characters
üìù Summary length: 949 characters
üìÑ Source preview: Good afternoon, champ, how you holding up? Good afternoon, Doctor, I have a lot of lower back pain. Oh no, before we begin, how old are you, sir and which hand do you write with? I'm seventy five now....
üìù Summary preview: The patient, a 75-year-old male, presents with worsening lower back pain that began on December 3, 1995. He reports radiating pain down his legs, which started 3 days after the initial pain began, and weakness in his legs. He has been treated with antiinflammatories, which provided temporary relief but did not address the underlying cause of his symptoms. The patient has seen multiple doctors, including a primary care physician and an orthopedist, who performed imaging studies (CT and x-rays) but were unable to identify a clear cause for his symptoms. The patient reports numbness in his legs since December 11.

Subjective: The patient reports worsening lower back pain tha


Does the test look good? Run full evaluation? (y/n):  y



Running full Llama-3.2-3B SOAP evaluation...
üîÑ Loading Llama 3 8B model as judge...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

üìÑ Loaded 100 Llama-3.2-3B generated SOAP summaries
üìã Columns: ['sample_id', 'original_input', 'generated_summary', 'target_summary', 'summary_token_count', 'input_tokens', 'target_tokens']
üîç Evaluating Llama-3.2-3B SOAP summaries for hallucinations...


Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   1%|          | 1/100 [00:06<11:01,  6.68s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   2%|‚ñè         | 2/100 [00:12<10:11,  6.23s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   3%|‚ñé         | 3/100 [00:18<09:42,  6.00s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   4%|‚ñç         | 4/100 [00:20<07:02,  4.40s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   5%|‚ñå         | 5/100 [00:22<05:33,  3.51s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   6%|‚ñå         | 6/100 [00:25<05:11,  3.32s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   7%|‚ñã         | 7/100 [00:28<05:07,  3.30s/it]Setting `pad_


üìä LLAMA-3.2-3B SOAP HALLUCINATION EVALUATION RESULTS:
Metric                    Mean ¬± Std      Min    Max    Perfect Scores
----------------------------------------------------------------------
Hallucination (1-5)       2.20 ¬± 0.51   1.0    3.0    5/100
Factual Consistency (1-5) 3.84 ¬± 0.37   3.0    4.0    0/100
Completeness (1-5)        3.14 ¬± 0.47   2.0    5.0    2/100
Coherence (1-5)           4.38 ¬± 0.80   3.0    5.0    58/100

üìã BASELINE QUALITY INSIGHTS:
‚Ä¢ High hallucination (‚â•4): 0/100 (0.0%)
‚Ä¢ Low factual consistency (‚â§2): 0/100 (0.0%)
‚Ä¢ Good completeness (‚â•4): 15/100 (15.0%)
‚Ä¢ Good coherence (‚â•4): 80/100 (80.0%)

üìä FOR BASELINE TABLE:
Hallucination: 2.20 ¬± 0.51
Factual Consistency: 3.84 ¬± 0.37
Completeness: 3.14 ¬± 0.47
Coherence: 4.38 ¬± 0.80

üíæ Results saved to: llama_3.2_3b_soap_judge_results.csv



