## Supervised fine-tuning of Mistral on Medical Report generation datasets

In [4]:
!pip install -q transformers[torch]
!pip install -q datasets
!pip install -q huggingface_hub
!pip install -q accelerate -U
!pip install -q bitsandbytes
!pip install -q peft
!pip install -q trl
!pip install -q huggingface_hub

[0m

In [1]:
import torch
import glob
import pandas as pd
import numpy as np
import re
from peft import get_peft_model, PeftConfig, PeftModel, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, GenerationConfig, pipeline
from trl import SFTTrainer
from datasets import Dataset, load_dataset
import huggingface_hub

In [2]:
# model_name = "mistralai/Mistral-7B-v0.1"
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [4]:
huggingface_hub.login(token="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")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Fine-tuning Mistral 7B on MTS-Dialog

#### Dataset preparation

In [5]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

!git clone https://github.com/abachaa/MTS-Dialog.git

fatal: destination path 'MTS-Dialog' already exists and is not an empty directory.


In [6]:
mts_dialog_files = {"train": "MTS-Dialog/Main-Dataset/MTS-Dialog-TrainingSet.csv", "valid": "MTS-Dialog/Main-Dataset/MTS-Dialog-ValidationSet.csv", "test":  "MTS-Dialog/Main-Dataset/MTS-Dialog-TestSet-1-MEDIQA-Chat-2023.csv"}
mts_dialog_dataset = load_dataset("csv", data_files=mts_dialog_files)

print(mts_dialog_dataset)

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


Generating valid split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'section_header', 'section_text', 'dialogue'],
        num_rows: 1201
    })
    valid: Dataset({
        features: ['ID', 'section_header', 'section_text', 'dialogue'],
        num_rows: 100
    })
    test: Dataset({
        features: ['ID', 'section_header', 'section_text', 'dialogue'],
        num_rows: 200
    })
})


  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


In [7]:
def create_prompt(conversation, summary):
  return f"[INST] Write a summary of the following conversation between a doctor and a patient: {conversation} [/INST]{summary}"

In [8]:
processed_data = []
for line in mts_dialog_dataset["train"]:
  prompt = create_prompt(line["dialogue"], line["section_text"])
  processed_data.append({"text": prompt})

mts_dialog_processed_dataset = Dataset.from_list(processed_data)
print(mts_dialog_processed_dataset)
print(mts_dialog_processed_dataset["text"][0])

Dataset({
    features: ['text'],
    num_rows: 1201
})
[INST] Write a summary of the following conversation between a doctor and a patient: Doctor: What brings you back into the clinic today, miss? 
Patient: I came in for a refill of my blood pressure medicine. 
Doctor: It looks like Doctor Kumar followed up with you last time regarding your hypertension, osteoarthritis, osteoporosis, hypothyroidism, allergic rhinitis and kidney stones.  Have you noticed any changes or do you have any concerns regarding these issues?  
Patient: No. 
Doctor: Have you had any fever or chills, cough, congestion, nausea, vomiting, chest pain, chest pressure?
Patient: No.  
Doctor: Great. Also, for our records, how old are you and what race do you identify yourself as?
Patient: I am seventy six years old and identify as a white female. [/INST]The patient is a 76-year-old white female who presents to the clinic today originally for hypertension and a med check.  She has a history of hypertension, osteoarthr

#### Training Prep

In [9]:
model.config.use_cache = False
model.config_pretraining_tp = 1
model.gradient_checkpointing_enable()

In [10]:
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.add_bos_token = True

In [11]:
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
)
model = get_peft_model(model, peft_config)

#### Training

In [12]:
training_arguments = TrainingArguments(
    output_dir="./mistral-mtsdialog-finetune",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    push_to_hub=True,
)

In [13]:
trainer = SFTTrainer(
    model=model,
    train_dataset=mts_dialog_processed_dataset,
    peft_config=peft_config,
    max_seq_length= None,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)



Map:   0%|          | 0/1201 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [14]:
trainer.train()



Step,Training Loss
25,1.4999
50,1.1158
75,1.4102
100,1.0139
125,1.3303
150,1.0078
175,1.2892
200,0.9741
225,1.299
250,0.9103




TrainOutput(global_step=301, training_loss=1.16919457714423, metrics={'train_runtime': 488.9387, 'train_samples_per_second': 2.456, 'train_steps_per_second': 0.616, 'total_flos': 1.3347844883447808e+16, 'train_loss': 1.16919457714423, 'epoch': 1.0})

## ACI-Bench Supervised finetune

In [15]:
!git clone https://github.com/wyim/aci-bench

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Cloning into 'aci-bench'...
remote: Enumerating objects: 528, done.[K
remote: Counting objects: 100% (528/528), done.[K64% (338/528)[K
remote: Compressing objects: 100% (194/194), done.[K
remote: Total 528 (delta 357), reused 488 (delta 328), pack-reused 0[K
Receiving objects: 100% (528/528), 8.98 MiB | 11.74 MiB/s, done.
Resolving deltas: 100% (357/357), done.


In [16]:
aci_bench_files = {"train": "aci-bench/data/challenge_data/train.csv", "valid": "aci-bench/data/challenge_data/valid.csv", "test":  "aci-bench/data/challenge_data/clinicalnlp_taskB_test1.csv"}
aci_bench_dataset = load_dataset("csv", data_files=aci_bench_files)

print(aci_bench_dataset)

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


Generating valid split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['dataset', 'encounter_id', 'dialogue', 'note'],
        num_rows: 67
    })
    valid: Dataset({
        features: ['dataset', 'encounter_id', 'dialogue', 'note'],
        num_rows: 20
    })
    test: Dataset({
        features: ['dataset', 'encounter_id', 'dialogue', 'note'],
        num_rows: 40
    })
})


  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


In [17]:
processed_aci_bench = []
for line in aci_bench_dataset["train"]:
  prompt = create_prompt(line["dialogue"], line["note"])
  processed_aci_bench.append({"text": prompt})

aci_bench_processed_dataset = Dataset.from_list(processed_aci_bench)
print(aci_bench_processed_dataset)
print(aci_bench_processed_dataset["text"][0])

Dataset({
    features: ['text'],
    num_rows: 67
})
[INST] Write a summary of the following conversation between a doctor and a patient: [doctor] hi , martha . how are you ?
[patient] i'm doing okay . how are you ?
[doctor] i'm doing okay . so , i know the nurse told you about dax . i'd like to tell dax a little bit about you , okay ?
[patient] okay .
[doctor] martha is a 50-year-old female with a past medical history significant for congestive heart failure , depression and hypertension who presents for her annual exam . so , martha , it's been a year since i've seen you . how are you doing ?
[patient] i'm doing well . i've been traveling a lot recently since things have , have gotten a bit lighter . and i got my , my vaccine , so i feel safer about traveling . i've been doing a lot of hiking . uh , went to washington last weekend to hike in northern cascades, like around the mount baker area .
[doctor] nice . that's great . i'm glad to hear that you're staying active , you know . i

In [18]:
training_arguments = TrainingArguments(
    output_dir="./mistral-mtsdialog-acibench-finetune",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=5,
    logging_steps=5,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    push_to_hub=True,
)

In [19]:
trainer = SFTTrainer(
    model=model,
    train_dataset=aci_bench_processed_dataset,
    peft_config=peft_config,
    max_seq_length= None,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)



Map:   0%|          | 0/67 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [20]:
trainer.train()



Step,Training Loss
5,1.8568
10,1.7128
15,1.7041
20,1.6926
25,1.5849
30,1.5942




TrainOutput(global_step=34, training_loss=1.6817880658542408, metrics={'train_runtime': 100.6993, 'train_samples_per_second': 0.665, 'train_steps_per_second': 0.338, 'total_flos': 2965073697964032.0, 'train_loss': 1.6817880658542408, 'epoch': 1.0})

## Evaluation

#### MTS-Dialog

In [None]:
print("MTS-Dialog")

#### ACI-Bench

In [None]:
print("ACI-Bench")