<a href="https://colab.research.google.com/github/Michael-David-Lam/Medical-Dialogue-Summary/blob/Bart-base-model/Medical_Dialogue_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [1]:
!pip install kaggle
!pip install -U transformers
!pip install -U datasets
!pip install -U accelerate
!pip install -U evaluate
!pip install -U rouge_score
!pip install -U peft
!pip install sentencepiece
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

# Import Dataset GitHub Repo

In [2]:
import kagglehub
import pandas as pd
import re
import numpy as np
!git clone https://github.com/abachaa/MTS-Dialog.git

# Load data
training_data =pd.read_csv('/content/MTS-Dialog/Main-Dataset/MTS-Dialog-TrainingSet.csv')
validation_data = pd.read_csv('/content/MTS-Dialog/Main-Dataset/MTS-Dialog-ValidationSet.csv')
test_data = pd.read_csv('/content/MTS-Dialog/Main-Dataset/MTS-Dialog-TestSet-1-MEDIQA-Chat-2023.csv')

# Rename columns
training_data = training_data.rename(columns={'context': 'input_text', 'target': 'target_text'})

from datasets import Dataset
train_dataset = Dataset.from_pandas(training_data)
val_dataset = Dataset.from_pandas(validation_data)
test_dataset = Dataset.from_pandas(test_data)


Cloning into 'MTS-Dialog'...
remote: Enumerating objects: 98, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 98 (delta 18), reused 3 (delta 3), pack-reused 72 (from 1)[K
Receiving objects: 100% (98/98), 1.19 MiB | 3.13 MiB/s, done.
Resolving deltas: 100% (40/40), done.


# Define Model and Preprocess Data

In [3]:
from transformers import BartTokenizer, BartModel

model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)

def preprocess_data(df):
    """
    Focuse on the five key medical sections for summary:
    - Chief complaint (cc)
    - History of present illness (genhx)
    - Past medical history (pastmedicalhx)
    - Diagnosis (diagnosis)
    - Treatment plan (plan)
    """
    # Define only the sections we care about
    TARGET_SECTIONS = [
        ('chief_complaint', 'cc'),
        ('history_of_present_illness', 'genhx'),
        ('past_medical_history', 'pastmedicalhx'),
        ('diagnosis', 'diagnosis'),
        ('treatment_plan', 'plan')
    ]

    df['dialogue_id'] = df['ID'].astype(str)
    grouped = df.groupby('dialogue_id')

    structured_data = []

    for dialogue_id, group in grouped:
        # Combine all dialogue turns
        full_dialogue = ' '.join(group['dialogue'].tolist())

        # Extract all sections
        sections = {}
        for _, row in group.iterrows():
            section_key = row['section_header'].lower().strip()
            sections[section_key] = row['section_text'].strip()

        # Build target text
        target_parts = []
        for standard_name, source_name in TARGET_SECTIONS:
            if source_name in sections and sections[source_name]:
                target_parts.append(f"<{standard_name}>{sections[source_name]}</{standard_name}>")

        target_text = ' '.join(target_parts)

        # Prompt
        input_prompt = (
            f"Summarize the following doctor-patient dialogue into a clinical note "
            f"focusing on chief complaint, history of present illness, past medical history, "
            f"diagnosis, and treatment plan: {full_dialogue}"
        )

        structured_data.append({
            'input_text': input_prompt,
            'target_text': target_text,
            'dialogue_id': dialogue_id
        })

    return pd.DataFrame(structured_data)

# Apply preprocessing
training_structured = preprocess_data(training_data)
validation_structured = preprocess_data(validation_data)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

In [4]:
from torch.utils.data import Dataset, DataLoader

from torch.utils.data import Dataset

class DoctorPatientDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=512, max_target_length=256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Access data using .iloc to ensure integer-based indexing
        item = self.data.iloc[idx]  # Use .iloc for integer-based indexing
        input_text = item['input_text']
        target_text = item['target_text']

        # Tokenize inputs
        input_encodings = self.tokenizer(
            input_text,
            max_length=self.max_input_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Tokenize targets
        target_encodings = self.tokenizer(
            target_text,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Replace padding token id with -100 for loss calculation
        labels = target_encodings['input_ids']
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': input_encodings['input_ids'].flatten(),
            'attention_mask': input_encodings['attention_mask'].flatten(),
            'labels': labels.flatten()
        }

## Create Train/Val Tokenized Datasets

In [5]:
# Create tokenized datasets
train_tokenized = DoctorPatientDataset(training_structured, tokenizer)
val_tokenized = DoctorPatientDataset(validation_structured, tokenizer)

## Init Model and Lora Config

In [6]:
from transformers import BartForConditionalGeneration
from peft import LoraConfig, get_peft_model, TaskType
from peft import LoraConfig, get_peft_model, TaskType
# Example data preparation

# Initialize model with LoRA
model = BartForConditionalGeneration.from_pretrained(model_name)
lora_config = LoraConfig(
    r=4,
    lora_alpha=32,
    # target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)
# Wrap model with LoRA
model = get_peft_model(model, lora_config)


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [7]:
from transformers import GenerationConfig

generation_config = GenerationConfig(
    temperature=0.9,
    top_k=50,
    top_p=0.95,
    do_sample=True,
    repetition_penalty=2.0,
    no_repeat_ngram_size=4,
    num_beams=1,
    max_length=128
)


# Define Training Args and Metrics

In [8]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
from transformers import GenerationConfig
from transformers.trainer_utils import IntervalStrategy, SaveStrategy
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Replace -100 with the pad token id for decoding labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Ensure token IDs are within valid range
    vocab_size = len(tokenizer)
    predictions = np.where(
        (predictions >= 0) & (predictions < vocab_size),
        predictions,
        tokenizer.unk_token_id  # Replace out-of-range IDs with unknown token
    )

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    return {k: round(v, 4) for k, v in result.items()}

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # Replace 'evaluation_strategy' with 'eval_strategy'
    eval_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    predict_with_generate=True,
    fp16=True,
    generation_max_length=128,
    report_to="none",
    load_best_model_at_end=True,
    logging_strategy ="epoch",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## Train Model

In [10]:
trainer.train()
model.save_pretrained("./clinical_note_model")
tokenizer.save_pretrained("./clinical_note_model")

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,1.8676,1.569056,0.0559,0.0226,0.0346,0.0344
2,1.862,1.621934,0.0851,0.0377,0.0591,0.0599
3,1.8316,1.557555,0.0437,0.0195,0.027,0.0279
4,1.8381,1.5713,0.0715,0.0303,0.0478,0.0477
5,1.7624,1.526974,0.0475,0.0209,0.0304,0.0312
6,1.7483,1.552716,0.0734,0.0308,0.0492,0.0497
7,1.7432,1.569546,0.0807,0.0343,0.0551,0.0565
8,1.7063,1.621934,0.1168,0.0548,0.0861,0.0869
9,1.7748,1.592064,0.1138,0.0505,0.0832,0.0849
10,1.7142,1.592773,0.1123,0.0505,0.0846,0.0848


('./clinical_note_model/tokenizer_config.json',
 './clinical_note_model/special_tokens_map.json',
 './clinical_note_model/vocab.json',
 './clinical_note_model/merges.txt',
 './clinical_note_model/added_tokens.json')

# Generate Summary

In [11]:
from transformers import GenerationConfig

# Define your generation config once
generation_config = GenerationConfig(
    temperature=0.7,
    top_k=60,
    top_p=0.95,
    do_sample=True,
    repetition_penalty=2.4,
    no_repeat_ngram_size=2,
    num_beams=1,
    max_length=128  # You can adjust this
)

# Function to generate notes from dialogue
def generate_note(dialogue):
    inputs = tokenizer(
        dialogue,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        generation_config=generation_config  # ✅ This is where it goes
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage

# List of section headers necessary
options = ["CC", "GENHX", "PASTMEDICALHX", "DIAGNOSIS", "PLAN"]

for example in test_dataset:
    if example['section_header'] in options:
        note = generate_note(example['dialogue'])  # Access the 'dialogue' column
        print(example['section_header'])
        print(note)  # Or store the note for later use
        # print(example['dialogue'])


`generation_config` default values have been modified to match model-specific defaults: {'early_stopping': True, 'num_beams': 4, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2, 'pad_token_id': 1, 'bos_token_id': 0, 'eos_token_id': 2, 'decoder_start_token_id': 2}. If this is not desired, please set these values explicitly.


GENHX

GENHX

GENHX

GENHX

GENHX

CC
<history_of_present_illness>The patient is a 43-year-old female who presents with chest pain for the last few nights.  The patient describes it as gnawing sensation that lasts about 10 to 10 seconds, but she has tried taking anything for pain relief and has not been able to get any help.</History_Of_Present_IllNESS>
PLAN

GENHX

CC

PASTMEDICALHX

GENHX
<history_of_present_illness>The patient is a 58-year-old right leg injury.  The patient states that it was about six months ago that the weakness in the right knee started. She has not really remembered how it happened. He says that he has had this weakness for quite some time now and does not remember what happened to him. This is something that I am very concerned about, but I do remember that she has been having this type of weakness over the last six or seven months."  In addition, she describes that her left leg began to have a little bit of pain as well
PASTMEDICALHX

GENHX
<history_of_present

## Enchanced Summary Creation

In [14]:
# Enhanced generation function that returns structured sections
def generate_structured_note(dialogue, model, tokenizer):
    """
    Generates a structured clinical note from dialogue and returns
    the extracted sections in a dictionary
    """
    target_sections = ["chief_complaint", "history_of_present_illness",
                      "past_medical_history", "diagnosis", "treatment_plan"]

    input_text = f"Summarize the following doctor-patient dialogue into a clinical note focusing on chief complaint, history of present illness, past medical history, diagnosis, and treatment plan: {dialogue}"

    inputs = tokenizer(
        input_text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        generation_config=generation_config
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract the specific sections
    sections = extract_sections(generated_text, target_sections)

    return sections, generated_text

In [15]:
# Example usage
def demo_system(model, tokenizer, test_example):
    """
    Demonstrates the system with a test example
    """
    dialogue = test_example['dialogue']

    sections, full_text = generate_structured_note(dialogue, model, tokenizer)

    print("FULL GENERATED TEXT:")
    print("-" * 80)
    print(full_text)
    print("\nEXTRACTED SECTIONS:")
    print("-" * 80)

    for section_name, content in sections.items():
        print(f"### {section_name.upper().replace('_', ' ')} ###")
        print(content)
        print("-" * 40)



In [17]:
import re
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

# First, let's add some debug printing and error handling
def evaluate_generated_notes(model, tokenizer, test_dataset, target_sections):
    """
    Evaluates the model on test data and provides section-specific metrics
    """
    results = {section: {"present": 0, "total": 0} for section in target_sections}
    all_generated = []
    all_references = []

    print(f"Starting evaluation on {len(test_dataset)} test examples...")

    # Check if test_dataset is a Dataset object or DataFrame
    if hasattr(test_dataset, 'to_pandas'):
        # Convert to DataFrame if it's a Dataset
        test_df = test_dataset.to_pandas()
    else:
        test_df = test_dataset

    # Process each example
    for i, example in enumerate(test_df.itertuples()):
        try:
            # Check if dialogue attribute exists, otherwise try different attribute names
            if hasattr(example, 'dialogue'):
                dialogue = example.dialogue
            elif hasattr(example, 'input_text'):
                dialogue = example.input_text
            else:
                # Try to access by index for tuples
                dialogue = example[test_df.columns.get_loc('dialogue') + 1]

            print(f"Processing example {i+1}/{len(test_df)}, dialogue length: {len(dialogue)}")

            input_text = f"Summarize the following doctor-patient dialogue into a clinical note focusing on chief complaint, history of present illness, past medical history, diagnosis, and treatment plan: {dialogue}"

            inputs = tokenizer(
                input_text,
                max_length=512,
                padding='max_length',
                truncation=True,
                return_tensors="pt"
            )

            # Handle device placement
            if torch.cuda.is_available():
                inputs = {k: v.to('cuda') for k, v in inputs.items()}
                model = model.to('cuda')

            # Generate text with error handling
            try:
                outputs = model.generate(
                    input_ids=inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],
                    max_length=128,
                    do_sample=True,
                    top_p=0.95,
                    top_k=50
                )

                generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
                print(f"Generated text length: {len(generated_text)}")
                all_generated.append(generated_text)

                # Check for presence of each target section
                for section in target_sections:
                    pattern = f"<{section}>.*?</{section}>"
                    if re.search(pattern, generated_text, re.DOTALL):
                        results[section]["present"] += 1
                    results[section]["total"] += 1

                # For ROUGE score, we need references
                # Use section_text if available
                if hasattr(example, 'section_text'):
                    all_references.append(example.section_text)
                else:
                    # If no gold reference is available, add an empty string
                    # This will affect ROUGE scores but prevent crashes
                    all_references.append("")

            except RuntimeError as e:
                print(f"Error during generation for example {i}: {str(e)}")
                continue

        except Exception as e:
            print(f"Error processing example {i}: {str(e)}")
            continue

    # Calculate percentages
    if len(results) > 0 and all(data["total"] > 0 for data in results.values()):
        section_coverage = {section: (data["present"] / data["total"]) * 100
                           for section, data in results.items()}
    else:
        section_coverage = {section: 0 for section in target_sections}

    # Calculate ROUGE scores only if we have both predictions and references
    rouge_scores = {}
    if len(all_generated) > 0 and len(all_references) > 0 and len(all_generated) == len(all_references):
        try:
            import evaluate
            rouge = evaluate.load("rouge")
            rouge_scores = rouge.compute(
                predictions=all_generated,
                references=all_references,
                use_stemmer=True
            )
        except Exception as e:
            print(f"Error computing ROUGE scores: {str(e)}")

    print("Evaluation complete!")
    print(f"Section coverage: {section_coverage}")

    return {
        "section_coverage": section_coverage,
        "rouge_scores": rouge_scores,
        "examples": list(zip(all_generated[:5], all_references[:5]))  # Return a few examples
    }

# Updated function to handle edge cases
def extract_sections(generated_text, target_sections):
    """
    Extracts the specific sections from generated text
    """
    extracted = {}
    for section in target_sections:
        pattern = f"<{section}>(.*?)</{section}>"
        match = re.search(pattern, generated_text, re.DOTALL)
        if match:
            extracted[section] = match.group(1).strip()
        else:
            # Try a more flexible pattern in case the model didn't use exact XML tags
            flexible_pattern = f"{section}[:\s]+(.*?)(?=\s*(?:{section}|$))"
            flexible_match = re.search(flexible_pattern, generated_text, re.IGNORECASE | re.DOTALL)
            if flexible_match:
                extracted[section] = flexible_match.group(1).strip()
            else:
                extracted[section] = ""

    return extracted

# Function to test a single example
def test_single_example(model, tokenizer, dialogue):
    """
    Test the model on a single dialogue example and print detailed output
    """
    target_sections = ["chief_complaint", "history_of_present_illness",
                      "past_medical_history", "diagnosis", "treatment_plan"]

    print("Input dialogue:")
    print("-" * 40)
    print(dialogue[:500] + "..." if len(dialogue) > 500 else dialogue)
    print("-" * 40)

    input_text = f"Summarize the following doctor-patient dialogue into a clinical note focusing on chief complaint, history of present illness, past medical history, diagnosis, and treatment plan: {dialogue}"

    inputs = tokenizer(
        input_text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )

    # Handle device placement
    if torch.cuda.is_available():
        inputs = {k: v.to('cuda') for k, v in inputs.items()}
        model = model.to('cuda')

    # Generate with basic parameters if generation_config is not defined
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=128,
        do_sample=True,
        top_p=0.95,
        top_k=50
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print("\nGenerated text:")
    print("-" * 40)
    print(generated_text)
    print("-" * 40)

    # Extract and display sections
    sections = extract_sections(generated_text, target_sections)

    print("\nExtracted sections:")
    for section, content in sections.items():
        print(f"\n### {section.upper().replace('_', ' ')} ###")
        print(content if content else "[Not found]")

    return generated_text, sections

# Function to process the test dataset with proper error handling
def process_test_dataset(model, tokenizer, test_dataset, num_examples=5):
    """
    Process a few examples from the test dataset to check functionality
    """
    results = []

    # Convert to dataframe if needed
    if hasattr(test_dataset, 'to_pandas'):
        test_df = test_dataset.to_pandas()
    else:
        test_df = test_dataset

    print(f"Test dataset columns: {test_df.columns.tolist()}")

    # Process a limited number of examples
    for i, row in enumerate(test_df.itertuples()):
        if i >= num_examples:
            break

        try:
            # Try to get dialogue field
            if hasattr(row, 'dialogue'):
                dialogue = row.dialogue
            elif hasattr(row, 'input_text'):
                dialogue = row.input_text
            else:
                # Try to access by column index as fallback
                dialogue = row[test_df.columns.get_loc('dialogue') + 1]

            print(f"\nProcessing example {i+1}:")
            generated_text, sections = test_single_example(model, tokenizer, dialogue)
            results.append({
                "dialogue": dialogue,
                "generated_text": generated_text,
                "sections": sections
            })

        except Exception as e:
            print(f"Error processing example {i}: {str(e)}")
            continue

    return results


'\n# 1. First, check if your dataset has the right structure:\nprint(test_dataset.column_names)  # For datasets.Dataset objects\n# OR\nprint(test_data.columns)  # For pandas DataFrames\n\n# 2. Test with a single example first:\nif len(test_dataset) > 0:\n    example = test_dataset[0]\n    dialogue = example[\'dialogue\']  # adjust field name if needed\n    test_single_example(model, tokenizer, dialogue)\n\n# 3. Process a few test examples:\nresults = process_test_dataset(model, tokenizer, test_dataset, num_examples=3)\n\n# 4. Run full evaluation only after verifying the above steps work:\ntarget_sections = ["chief_complaint", "history_of_present_illness", \n                   "past_medical_history", "diagnosis", "treatment_plan"]\neval_results = evaluate_generated_notes(model, tokenizer, test_dataset, target_sections)\n'

In [18]:
# 1. First, check if your dataset has the right structure:
print(test_dataset.column_names)  # For datasets.Dataset objects
# OR
print(test_data.columns)  # For pandas DataFrames

# 2. Test with a single example first:
if len(test_dataset) > 0:
    example = test_dataset[0]
    dialogue = example['dialogue']  # adjust field name if needed
    test_single_example(model, tokenizer, dialogue)

# 3. Process a few test examples:
results = process_test_dataset(model, tokenizer, test_dataset, num_examples=3)

# 4. Run full evaluation only after verifying the above steps work:
target_sections = ["chief_complaint", "history_of_present_illness",
                   "past_medical_history", "diagnosis", "treatment_plan"]
eval_results = evaluate_generated_notes(model, tokenizer, test_dataset, target_sections)

['ID', 'section_header', 'section_text', 'dialogue']
Index(['ID', 'section_header', 'section_text', 'dialogue'], dtype='object')
Input dialogue:
----------------------------------------
Doctor: Good afternoon, sir. Did you just have a birthday? I don't have my chart with me right now, the nurse is bringing it. 
Patient: Good afternoon, sir. Yes, I just turned fifty five. 
Doctor: You identify as African American, correct? 
Patient: Yes, that's right. 
Doctor: When was your last visit, sir? 
Patient: Um, it was on July twenty ninth two thousand eight. 
Doctor: Yes, I see. Did we go over your M R I results? 
Patient: No, I was having those new seizures, remember?
Doctor: Yes, I d...
----------------------------------------

Generated text:
----------------------------------------

----------------------------------------

Extracted sections:

### CHIEF COMPLAINT ###
[Not found]

### HISTORY OF PRESENT ILLNESS ###
[Not found]

### PAST MEDICAL HISTORY ###
[Not found]

### DIAGNOSIS ###
[N