<a href="https://colab.research.google.com/github/Michael-David-Lam/Medical-Dialogue-Summary/blob/save-model-to-hub/Medical_Dialogue_Generation_DEMOFILE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [None]:
!pip install kaggle
!pip install -U transformers
!pip install -U datasets
!pip install -U accelerate
!pip install -U evaluate
!pip install -U rouge_score
!pip install -U peft
!pip install sentencepiece
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

# Import Dataset from GitHub Repo

In [None]:
import kagglehub
import pandas as pd
import re
import numpy as np
!git clone https://github.com/abachaa/MTS-Dialog.git
# Load data
training_data =pd.read_csv('/content/MTS-Dialog/Main-Dataset/MTS-Dialog-TrainingSet.csv')
validation_data = pd.read_csv('/content/MTS-Dialog/Main-Dataset/MTS-Dialog-ValidationSet.csv')
Test_data = pd.read_csv('/content/MTS-Dialog/Main-Dataset/MTS-Dialog-TestSet-1-MEDIQA-Chat-2023.csv')
# Rename columns
training_data = training_data.rename(columns={'context': 'input_text', 'target': 'target_text'})

from datasets import Dataset
train_dataset = Dataset.from_pandas(training_data)
val_dataset = Dataset.from_pandas(validation_data)
test_dataset = Dataset.from_pandas(Test_data)


Cloning into 'MTS-Dialog'...
remote: Enumerating objects: 98, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 98 (delta 18), reused 3 (delta 3), pack-reused 72 (from 1)[K
Receiving objects: 100% (98/98), 1.19 MiB | 15.80 MiB/s, done.
Resolving deltas: 100% (40/40), done.


# Define Model and Preprocess Data

In [None]:
from transformers import BartTokenizer, BartModel

# Hugging Face checkpoint id; reused in a later cell to load BartForConditionalGeneration.
model_name = "facebook/bart-base"
# Tokenizer loaded from the same checkpoint id as the model.
tokenizer = BartTokenizer.from_pretrained(model_name)

def preprocess_data(df):
    """Collapse per-section rows into one (prompt, structured note) pair per dialogue ID.

    Rows are grouped by the string form of ``ID``. For each group the dialogue
    texts are joined with spaces and the section texts are wrapped in
    XML-style tags: mapped sections first, in ``SECTION_ORDER``, followed by
    any unmapped sections under their lower-cased header.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``ID``, ``dialogue``, ``section_header`` and
        ``section_text``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per dialogue ID with the columns ``input_text``
        (``"Generate clinical note: "`` + dialogue), ``target_text`` (the
        tagged sections joined with spaces) and ``dialogue_id`` (``ID`` as str).
    """
    # (tag emitted in the target, lower-cased section header found in the data).
    # Every source header is listed once: a second mapping for 'exam' used to
    # emit the same text twice, under <physical_exam> and again under <exam>.
    SECTION_ORDER = [
        ('chief_complaint', 'cc'),
        ('history_of_present_illness', 'genhx'),
        ('past_medical_history', 'pastmedicalhx'),
        ('past_surgeries', 'pastsurgical'),
        ('medications', 'medications'),
        ('allergies', 'allergy'),
        ('social_history', 'fam/sochx'),
        ('educational_courses', 'edcourse'),
        ('review_of_systems', 'ros'),
        ('physical_exam', 'exam'),
        ('assessment', 'assessment'),
        ('procedures', 'procedures'),
        ('labs', 'labs'),
        ('plan', 'plan'),
        ('disposition', 'disposition')
    ]
    # Built once, not once per unmapped-section check.
    mapped_sources = {source_name for _, source_name in SECTION_ORDER}

    structured_data = []

    # Group on a derived key so the caller's frame does not gain a
    # 'dialogue_id' column as a side effect.
    for dialogue_id, group in df.groupby(df['ID'].astype(str)):
        # Combine all dialogue turns of the group; missing cells are skipped.
        full_dialogue = ' '.join(group['dialogue'].dropna().astype(str))

        # Collect sections; a later row with the same header overrides an earlier one.
        sections = {}
        for header, text in zip(group['section_header'], group['section_text']):
            if pd.isna(header) or pd.isna(text):
                continue  # a missing cell would otherwise crash on .lower()/.strip()
            sections[str(header).lower().strip()] = str(text).strip()

        # Mapped sections first, in canonical order.
        target_parts = [
            f"<{standard_name}>{sections[source_name]}</{standard_name}>"
            for standard_name, source_name in SECTION_ORDER
            if sections.get(source_name)
        ]

        # Unmapped sections go at the end, tagged with their own header.
        target_parts.extend(
            f"<{section_key}>{text}</{section_key}>"
            for section_key, text in sections.items()
            if section_key not in mapped_sources and text
        )

        structured_data.append({
            'input_text': f"Generate clinical note: {full_dialogue}",
            'target_text': ' '.join(target_parts),
            'dialogue_id': dialogue_id
        })

    # Explicit columns keep the schema stable even when df has no rows.
    return pd.DataFrame(structured_data, columns=['input_text', 'target_text', 'dialogue_id'])

# Apply preprocessing: build one (input_text, target_text, dialogue_id) row per
# dialogue ID for the training and validation splits.
training_structured = preprocess_data(training_data)
validation_structured = preprocess_data(validation_data)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

In [None]:
from torch.utils.data import Dataset, DataLoader

from torch.utils.data import Dataset

class DoctorPatientDataset(Dataset):
    """Map-style dataset that tokenizes (input_text, target_text) rows on access.

    ``data`` is indexed positionally through ``.iloc`` and must expose the
    fields ``input_text`` and ``target_text``. Each item is a dict holding the
    flattened ``input_ids``, ``attention_mask`` and ``labels`` tensors, where
    padding positions in ``labels`` are set to -100.
    """

    def __init__(self, data, tokenizer, max_input_length=512, max_target_length=256):
        self.data, self.tokenizer = data, tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.data)

    def _encode(self, text, limit):
        """Tokenize ``text``, padded and truncated to ``limit`` tokens, as 'pt' tensors."""
        return self.tokenizer(
            text,
            max_length=limit,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        # Positional lookup, independent of the frame's index labels.
        row = self.data.iloc[idx]
        source_text, note_text = row['input_text'], row['target_text']

        source = self._encode(source_text, self.max_input_length)
        target = self._encode(note_text, self.max_target_length)

        # Padding positions are set to -100 for the loss calculation.
        label_ids = target['input_ids']
        label_ids.masked_fill_(label_ids == self.tokenizer.pad_token_id, -100)

        return {
            'input_ids': source['input_ids'].reshape(-1),
            'attention_mask': source['attention_mask'].reshape(-1),
            'labels': label_ids.reshape(-1)
        }

## Create Train/Val Tokenized Datasets

In [None]:
# Wrap the preprocessed frames in DoctorPatientDataset (default limits: 512
# input tokens, 256 target tokens); tokenization happens in __getitem__.
train_tokenized = DoctorPatientDataset(training_structured, tokenizer)
val_tokenized = DoctorPatientDataset(validation_structured, tokenizer)

## Initialize Model and LoRA Config

In [None]:
from transformers import BartForConditionalGeneration
from peft import LoraConfig, get_peft_model, TaskType
# Example data preparation

# Initialize model with LoRA: load the base seq2seq checkpoint named by model_name.
model = BartForConditionalGeneration.from_pretrained(model_name)
# LoRA adapter settings: rank 4, alpha 32, 5% dropout on the adapter layers,
# no bias parameters trained, configured for a sequence-to-sequence LM task.
lora_config = LoraConfig(
    r=4,
    lora_alpha=32,
    # target_modules is left unset here.
    # NOTE(review): presumably PEFT then falls back to its built-in default
    # target modules for this architecture - confirm.
    # target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)
# Wrap model with LoRA; from here on `model` is the PEFT-wrapped model.
model = get_peft_model(model, lora_config)


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [None]:
from transformers import GenerationConfig

# Sampling-based decoding settings (no beam search: num_beams=1).
# NOTE(review): this object is not passed to Seq2SeqTrainingArguments or
# Seq2SeqTrainer in the training cell, and the "Generate Summary" cell rebinds
# `generation_config` - confirm whether this cell is still needed.
generation_config = GenerationConfig(
    temperature=0.9,
    top_k=50,
    top_p=0.95,
    do_sample=True,
    repetition_penalty=2.0,
    no_repeat_ngram_size=4,
    num_beams=1,
    max_length=128
)


# Define Training Args and Metrics

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
from transformers import GenerationConfig
from transformers.trainer_utils import IntervalStrategy, SaveStrategy
# ROUGE metric object from the `evaluate` library; used by compute_metrics below.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Decode predicted and reference token ids and score them with ROUGE.

    ``eval_pred`` unpacks into ``(predictions, labels)`` arrays of token ids.
    Prediction ids outside ``[0, len(tokenizer))`` become the unknown-token id
    and label entries equal to -100 become the pad-token id before decoding.
    Returns the ROUGE result dict with every value rounded to 4 decimals.
    """
    generated_ids, reference_ids = eval_pred

    # Anything outside the vocabulary range is swapped for the unknown token
    # so batch_decode only ever sees valid ids.
    upper_bound = len(tokenizer)
    in_vocab = (generated_ids >= 0) & (generated_ids < upper_bound)
    generated_ids = np.where(in_vocab, generated_ids, tokenizer.unk_token_id)

    # -100 marks ignored label positions; map them to the pad token for decoding.
    reference_ids = np.where(reference_ids == -100, tokenizer.pad_token_id, reference_ids)

    hypotheses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    references = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    scores = rouge.compute(
        predictions=hypotheses,
        references=references,
        use_stemmer=True
    )

    rounded = {}
    for metric_name, value in scores.items():
        rounded[metric_name] = round(value, 4)
    return rounded

# Evaluate, checkpoint and log once per epoch. Evaluation decodes with
# generate() (capped at 128 tokens) so compute_metrics receives token ids.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # 'eval_strategy' is the current name of the former 'evaluation_strategy'.
    eval_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    predict_with_generate=True,
    fp16=True,  # mixed precision; NOTE(review): presumably requires a CUDA runtime - confirm
    generation_max_length=128,
    report_to="none",
    load_best_model_at_end=True,
    logging_strategy ="epoch",
    # The PEFT wrapper hides the base model's forward signature, so Trainer
    # cannot infer the label column and warns "No label_names provided";
    # name it explicitly.
    label_names=["labels"],
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## Train Model

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()
# Persist the PEFT-wrapped model and the tokenizer to the same local directory.
# NOTE(review): presumably only the LoRA adapter weights are written here, not
# the full base model - confirm before relying on this directory standalone.
model.save_pretrained("./clinical_note_model")
tokenizer.save_pretrained("./clinical_note_model")

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.9077,2.180598,0.2181,0.0771,0.1636,0.1632
2,2.3518,2.081123,0.3161,0.1125,0.2455,0.2445
3,2.2568,2.041169,0.3264,0.1315,0.2551,0.2553
4,2.2001,2.006791,0.334,0.1413,0.2718,0.2721
5,2.1366,1.978344,0.3699,0.1605,0.3042,0.3042
6,2.11,1.951839,0.3457,0.1503,0.2816,0.2812
7,2.0534,1.972139,0.392,0.181,0.3282,0.3272
8,2.0419,1.953992,0.3744,0.1642,0.3031,0.3043
9,2.0194,1.924632,0.4176,0.1978,0.3507,0.3524
10,1.9988,1.92476,0.4058,0.1981,0.3375,0.3373


('./clinical_note_model/tokenizer_config.json',
 './clinical_note_model/special_tokens_map.json',
 './clinical_note_model/vocab.json',
 './clinical_note_model/merges.txt',
 './clinical_note_model/added_tokens.json')

# Generate Summary

In [None]:
from transformers import GenerationConfig
# Reload the model and tokenizer from the directory written by the training
# cell; this rebinds the global `model` and `tokenizer`.
model = BartForConditionalGeneration.from_pretrained("./clinical_note_model")
tokenizer = BartTokenizer.from_pretrained("./clinical_note_model")
# Define your generation config once. It rebinds `generation_config` from the
# earlier cell and is the object generate_note() reads. Sampling-based decoding
# (num_beams=1), output capped at 128 tokens.
generation_config = GenerationConfig(
    temperature=0.7,
    top_k=60,
    top_p=0.95,
    do_sample=True,
    repetition_penalty=2.4,
    no_repeat_ngram_size=2,
    num_beams=1,
    max_length=128  # You can adjust this
)

# Function to generate notes from dialogue
def generate_note(dialogue, max_input_length=512, prompt_prefix="Generate clinical note: "):
    """Generate a clinical note for one dialogue with the global model.

    The input is built the same way as the training inputs: the prompt prefix
    that preprocess_data() prepends, truncated to the 512-token input limit
    that DoctorPatientDataset uses by default. (The previous version sent the
    bare dialogue truncated to 128 tokens.)

    Parameters
    ----------
    dialogue : str
        Raw dialogue text.
    max_input_length : int, optional
        Token limit applied to the prefixed input (truncation).
    prompt_prefix : str, optional
        Text placed in front of the dialogue; pass "" to send the dialogue as is.

    Returns
    -------
    str
        The first generated sequence, decoded with special tokens removed.

    Reads the module-level ``tokenizer``, ``model`` and ``generation_config``.
    """
    # No padding: a single sequence needs none, and it only adds wasted positions.
    inputs = tokenizer(
        f"{prompt_prefix}{dialogue}",
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        generation_config=generation_config
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage: generate and print a note for every test row whose section
# header is one of the headers of interest.

# Section headers to generate notes for
options = ["CC", "GENHX", "PASTMEDICALHX", "DIAGNOSIS", "PLAN"]

for example in test_dataset:
    header = example['section_header']
    if header not in options:
        continue  # not a requested section
    note = generate_note(example['dialogue'])  # generate from the 'dialogue' column
    print(header)
    print(note)


GENHX
<history_of_present_illness>The patient is a 50-year-old African American woman who comes in today for her last visit.  The patient was last seen on 07/08/07 with her MRSI results, and she did not have her chart with me at that time.</History_Of_Present_Illnesses>
GENHX
<history_of_present_illness>The patient is a 23-year-old female who was sedated with Ativan.  The patient did not give us a history of her lungs, but she had some free air under her right diaphragm. She was very short of breath upon arrival and we immediately had X-ray come in to scan her lung and found that she has free oxygen under the right tracheostomy.</history_{defhistory>
GENHX
<history_of_present_illness>The patient is a 71-year-old African American woman who was evaluated by nurses for height and weight.  The patient weighs approximately 1,077 pounds. She has no major medical conditions.</history/of...present _illnesses>
GENHX
<history_of_present_illness>The patient is a 19-year-old female who has some qu

KeyboardInterrupt: 

In [None]:
from huggingface_hub import login

# Prompt for Hugging Face Hub credentials (renders a login widget in the
# notebook); run this before the push_to_hub calls in the next cell.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Push the model and tokenizer to the Hugging Face Hub

# Re-initialize the model and tokenizer from the locally saved directory
model = BartForConditionalGeneration.from_pretrained("./clinical_note_model")
tokenizer = BartTokenizer.from_pretrained("./clinical_note_model")

# Push the model and tokenizer to the same Hub repository
model.push_to_hub("mdlam/clinical-note-model")
tokenizer.push_to_hub("mdlam/clinical-note-model")

adapter_model.safetensors:   0%|          | 0.00/895k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/mdlam/clinical-note-model/commit/7fc5bdcbf9562c3bef2cd65ff3974ce248c7b2fb', commit_message='Upload tokenizer', commit_description='', oid='7fc5bdcbf9562c3bef2cd65ff3974ce248c7b2fb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mdlam/clinical-note-model', endpoint='https://huggingface.co', repo_type='model', repo_id='mdlam/clinical-note-model'), pr_revision=None, pr_num=None)