In [1]:
!pip install -q transformers huggingface_hub
!pip install -q --upgrade accelerate
!pip install -q -U bitsandbytes
!pip install -q torch

In [2]:
import pandas as pd

# Define the bucket and file names
bucket_name = 'agentsum'  # Replace with your bucket name
soap = f's3://{bucket_name}/sample_summary.csv'

# Load the files
soap = pd.read_csv(soap)

# Display the data
soap.head(99)

Unnamed: 0,input,output
0,"Good afternoon, champ, how you holding up? Goo...","Subjective:\n- Symptoms: Lower back pain, radi..."
1,"What brings you in here today? Hi, I'm um, I'm...",Subjective:\n- Presenting with dry cough for 1...
2,Do you have any known allergies to medications...,Subjective:\n- No known allergies to medicatio...
3,"How may I help you today? Yeah I've had, a fev...",Subjective:\n- Fever and dry cough started 4 d...
4,It sounds like that you're experiencing some c...,Subjective:\n- Presenting with chest pain for ...
...,...,...
94,"What brings you in? Hi. Uh, I've just had this...",Subjective:\n- Cough for the past week\n- Pers...
95,"Good morning, ma'am. Oh, good morning, doctor....",Subjective:\n- Symptoms: Difficulty using stai...
96,Are you aware of any medical problems in your ...,Subjective:\n- No medical problems reported in...
97,What's going on with you? What brings you here...,Subjective:\n- Symptoms: Loose watery stools\n...


In [3]:
from huggingface_hub import login
from huggingface_hub import whoami
import getpass

# Prompt the user for the Hugging Face token at runtime
hf_token = getpass.getpass("Enter your Hugging Face token: ")

# Log in using the provided token
login(token=hf_token)


print(whoami(token=hf_token))

Enter your Hugging Face token:  ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


{'type': 'user', 'id': '6644f24fe7ae8316ebf3fee4', 'name': 'LizaPiya', 'fullname': 'Fahmida Liza Piya', 'email': 'lizapiya@udel.edu', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': '/avatars/d43d60b3eba464c3f9b44c34e43b64d6.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'Clinical Note LLama', 'role': 'write', 'createdAt': '2024-06-03T19:29:07.142Z'}}}


In [6]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.3/1.3 MB[0m [31m94.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Load model with safetensors
model_name = "medalpaca/medalpaca-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    use_safetensors=True
)

# Set padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Medical dialogue summarization function - CHANGE 1: Updated for SOAP dialogue
def generate_summary(dialogue):
    # CHANGE 2: Updated prompt for medical dialogue and SOAP format
    prompt = (
        "### Instruction:\n"
        "You are a medical summarization expert. Analyze the following patient-doctor dialogue and provide a concise medical summary in SOAP format (Subjective, Objective, Assessment, Plan) in no more than 150 words:\n\n"
        f"{dialogue}\n\n"
        "### Response:"
    )
    
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,  # Keep same token count
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True).replace(prompt, "").strip()

# Apply to SOAP dataset - CHANGE 3: Updated function name and structure
def process_soap_data(df):
    results = []
    
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Generating summaries"):
        try:
            summary = generate_summary(row['input'])
            summary_tokens = len(tokenizer.encode(summary))
            input_tokens = len(tokenizer.encode(row['input']))
            target_tokens = len(tokenizer.encode(row['output']))  # CHANGE 4: 'output' instead of 'target'
            
            results.append({
                'sample_id': idx,  # CHANGE 5: sample_id instead of note_id
                'original_input': row['input'],
                'generated_summary': summary,
                'target_summary': row['output'],  # CHANGE 6: 'output' column from SOAP
                'summary_token_count': summary_tokens,
                'input_tokens': input_tokens,
                'target_tokens': target_tokens
            })
        except Exception as e:
            print(f"Error processing sample {idx}: {str(e)}")
            results.append({
                'sample_id': idx,
                'original_input': row['input'],
                'generated_summary': f"ERROR: {str(e)}",
                'target_summary': row['output'],
                'summary_token_count': 0,
                'input_tokens': len(tokenizer.encode(row['input'])) if pd.notna(row['input']) else 0,
                'target_tokens': len(tokenizer.encode(row['output'])) if pd.notna(row['output']) else 0
            })
    
    return pd.DataFrame(results)

# Run the pipeline - CHANGE 7: Updated for SOAP dataset
print("Loading and processing SOAP dataset...")
print(f"Total samples to process: {len(soap)}")  # CHANGE 8: 'soap' instead of 'mimic_iv_bhc_100'

full_results = process_soap_data(soap)  # CHANGE 9: Updated function call

# Save results - CHANGE 10: Update output filename for SOAP
output_path = 'medalpaca_summaries_soap_dataset.csv'
full_results.to_csv(output_path, index=False)
print(f"\nAll results saved to '{output_path}'")

# Display some basic statistics
print(f"\nDataset Statistics:")
print(f"Total samples processed: {len(full_results)}")
print(f"Successful generations: {len(full_results[~full_results['generated_summary'].str.startswith('ERROR:')])}")
print(f"Failed generations: {len(full_results[full_results['generated_summary'].str.startswith('ERROR:')])}")

# Display average token counts
successful_results = full_results[~full_results['generated_summary'].str.startswith('ERROR:')]
if len(successful_results) > 0:
    print(f"\nAverage token counts:")
    print(f"Input tokens: {successful_results['input_tokens'].mean():.1f}")
    print(f"Target tokens: {successful_results['target_tokens'].mean():.1f}")
    print(f"Generated summary tokens: {successful_results['summary_token_count'].mean():.1f}")

# Show a sample result
if len(successful_results) > 0:
    print(f"\nSample result:")
    sample = successful_results.iloc[0]
    print(f"Input (first 200 chars): {sample['original_input'][:200]}...")
    print(f"Generated summary: {sample['generated_summary']}")
    print(f"Target summary: {sample['target_summary']}")

The following generation flags are not valid and may be ignored: ['pad_token_id']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['pad_token_id']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading and processing SOAP dataset...
Total samples to process: 100


Generating summaries:   1%|          | 1/100 [00:19<32:55, 19.95s/it]This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Token indices sequence length is longer than the specified maximum sequence length for this model (2248 > 512). Running this sequence through the model will result in indexing errors
Generating summaries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [05:58<00:00,  3.58s/it]


All results saved to 'medalpaca_summaries_soap_dataset.csv'

Dataset Statistics:
Total samples processed: 100
Successful generations: 100
Failed generations: 0

Average token counts:
Input tokens: 503.4
Target tokens: 201.1
Generated summary tokens: 295.5

Sample result:
Input (first 200 chars): Good afternoon, champ, how you holding up? Good afternoon, Doctor, I have a lot of lower back pain. Oh no, before we begin, how old are you, sir and which hand do you write with? I'm seventy five now....
Generated summary: This patient has had chronic lower back pain for 10 days, which is worsening, with radiating pain to both legs. His symptoms began after he was diagnosed with aortic stenosis, which may be contributing to his pain.
Target summary: Subjective:
- Symptoms: Lower back pain, radiating pain down the right leg, then the left leg.
- Severity: Severe pain, described as "I could barely walk" and "the pain was so severe."
- Duration: Lower back pain for about ten days; radiating pain 




### Traditional Metrics

In [5]:
!pip install -q nltk bert-score
!pip install -q rouge-metric

In [6]:
%run medalpaca.py

Matplotlib is building the font cache; this may take a moment.


üöÄ Starting MedAlpaca-7B SOAP Evaluation
üìÇ Loaded 100 samples from medalpaca_summaries_soap_dataset.csv
üìã Data columns: ['sample_id', 'original_input', 'generated_summary', 'target_summary', 'summary_token_count', 'input_tokens', 'target_tokens']
üìè Data shape: (100, 7)
üìä Evaluating 100 MedAlpaca-generated summaries...
üìù Summary column: generated_summary
üéØ Reference column: target_summary

üî¢ Computing BLEU and ROUGE-L scores...


Processing Rows: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:01<00:00, 60.30it/s]



üß† Computing BERTScore...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho


üìä MEDALPACA-7B SOAP EVALUATION RESULTS
Metric       Mean ¬± Std           Min      Max      Median  
----------------------------------------------------------------------
BLEU1        14.413 ¬± 9.870    0.00     39.98    13.98   
BLEU2        7.160 ¬± 6.235    0.00     26.93    5.21    
ROUGE_L      15.754 ¬± 9.195    0.00     42.42    13.88   
BERT_P       86.726 ¬± 4.026    77.33    94.56    87.10   
BERT_R       83.405 ¬± 3.336    72.82    91.43    83.97   
BERT_F1      85.008 ¬± 3.391    77.22    92.86    85.56   

üìà BASELINE METRICS SUMMARY (for table)
BLEU-1: 14.41 ¬± 9.87
BLEU-2: 7.16 ¬± 6.23
ROUGE-L: 15.75 ¬± 9.20
BERTScore-F1: 85.01 ¬± 3.39

üìè TOKEN LENGTH ANALYSIS
Generated Summary Tokens:
  Mean ¬± Std: 295.5 ¬± 641.3
  Target: 150 tokens
  Range: 4 - 2248
  Within 130-170: 6/100 (6.0%)

Target Summary Tokens:
  Mean ¬± Std: 201.1 ¬± 145.7
  Range: 51 - 589

üíæ Results saved to: medalpaca_summaries_soap_dataset_evaluation_results.csv

üìã SAMPLE RESULTS
Sample 

In [7]:
%run llm_as_a_judge_medalpaca.py

Testing single MedAlpaca SOAP sample...
üß™ Testing single MedAlpaca SOAP sample evaluation...
Testing sample 0 from 100 total samples
Sample ID: 0
Summary token count: 56


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



üìÑ Source length: 1701 characters
üìù Summary length: 214 characters
üìÑ Source preview: Good afternoon, champ, how you holding up? Good afternoon, Doctor, I have a lot of lower back pain. Oh no, before we begin, how old are you, sir and which hand do you write with? I'm seventy five now....
üìù Summary preview: This patient has had chronic lower back pain for 10 days, which is worsening, with radiating pain to both legs. His symptoms began after he was diagnosed with aortic stenosis, which may be contributing to his pain.

üìù Prompt length: 2347 characters

ü§ñ MODEL RESPONSE:


Please rate the generated summary against the source patient-doctor conversation. 

Hallucination: 2 (The summary mentions aortic stenosis, which is not mentioned in the conversation)
Factual: 3 (The summary is mostly accurate, but it does not mention the patient's age, the initial date of the back pain, or the patient's ability to walk)
Complete: 2 (The summary is missing key information, such as the


Does the test look good? Run full evaluation? (y/n):  y



Running full MedAlpaca SOAP evaluation...
üîÑ Loading Llama 3 8B model as judge...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

üìÑ Loaded 100 MedAlpaca-7B generated SOAP summaries
üìã Columns: ['sample_id', 'original_input', 'generated_summary', 'target_summary', 'summary_token_count', 'input_tokens', 'target_tokens']
üîç Evaluating MedAlpaca SOAP summaries for hallucinations...


Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   1%|          | 1/100 [00:04<07:10,  4.35s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   2%|‚ñè         | 2/100 [00:07<06:23,  3.91s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   3%|‚ñé         | 3/100 [00:11<05:46,  3.57s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   4%|‚ñç         | 4/100 [00:17<07:15,  4.54s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   5%|‚ñå         | 5/100 [00:23<08:01,  5.07s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   6%|‚ñå         | 6/100 [00:29<08:26,  5.39s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   7%|‚ñã         | 7/100 [00:34<08:10,  5.27s/it]Setting `pad_


üìä MEDALPACA-7B SOAP HALLUCINATION EVALUATION RESULTS:
Metric                    Mean ¬± Std      Min    Max    Perfect Scores
----------------------------------------------------------------------
Hallucination (1-5)       2.13 ¬± 0.44   1.0    4.0    3/100
Factual Consistency (1-5) 3.86 ¬± 0.38   2.0    4.0    0/100
Completeness (1-5)        3.07 ¬± 0.38   2.0    5.0    2/100
Coherence (1-5)           4.68 ¬± 0.66   3.0    5.0    79/100

üìã BASELINE QUALITY INSIGHTS:
‚Ä¢ High hallucination (‚â•4): 1/100 (1.0%)
‚Ä¢ Low factual consistency (‚â§2): 1/100 (1.0%)
‚Ä¢ Good completeness (‚â•4): 7/100 (7.0%)
‚Ä¢ Good coherence (‚â•4): 89/100 (89.0%)

üìä FOR BASELINE TABLE:
Hallucination: 2.13 ¬± 0.44
Factual Consistency: 3.86 ¬± 0.38
Completeness: 3.07 ¬± 0.38
Coherence: 4.68 ¬± 0.66

üíæ Results saved to: medalpaca_soap_judge_results.csv



