## Fine-tuning Mistral 7B on MTS-Dialog

In [None]:
!pip install -q datasets
!pip install -q accelerate -U
!pip install -q transformers[torch]
!pip install -q bitsandbytes
!pip install -q peft
!pip install -q trl

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━

In [None]:
import torch
import glob
import pandas as pd
import numpy as np
import re
from peft import get_peft_model, PeftConfig, PeftModel, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, GenerationConfig, pipeline
from trl import SFTTrainer
from datasets import Dataset, load_dataset

#### Loading the model

In [None]:
# Hub identifier of the checkpoint that is loaded and fine-tuned below.
# Alternative identifier kept for reference: "mistralai/Mistral-7B-v0.1"
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

In [None]:
# 4-bit NF4 quantisation with double quantisation; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Quantisation is requested exclusively through `quantization_config`.
# Passing `load_in_4bit=True` as an extra keyword at the same time is redundant
# and is rejected by recent transformers releases, so it is not repeated here.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

#### Sanity check

In [None]:
# Text-generation pipeline built around the model and tokenizer loaded above.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

In [None]:
prompt = "As a data scientist, can you explain the concept of regularization in machine learning?"

# Sample one continuation of at most 100 new tokens and print its text.
sequences = pipe(
    prompt,
    max_new_tokens=100,
    num_return_sequences=1,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=50,
)
print(sequences[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


As a data scientist, can you explain the concept of regularization in machine learning?

Regularization is a technique used in machine learning models to prevent overfitting and improve the generalization ability of the model. Overfitting occurs when the model learns the noise in the training data instead of the underlying patterns, resulting in poor performance on new, unseen data.

Regularization adds a penalty term to the loss function, which encourages the model to have simpler coefficients or weights. This penalty term discourages large coefficients, making the model more robust and


#### Dataset preparation

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

!git clone https://github.com/abachaa/MTS-Dialog.git

Cloning into 'MTS-Dialog'...
remote: Enumerating objects: 98, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 98 (delta 18), reused 3 (delta 3), pack-reused 72[K
Receiving objects: 100% (98/98), 1.19 MiB | 7.51 MiB/s, done.
Resolving deltas: 100% (40/40), done.


In [None]:
# Map each split name to its CSV file inside the cloned repository.
data_files = {
    "train": "MTS-Dialog/Main-Dataset/MTS-Dialog-TrainingSet.csv",
    "valid": "MTS-Dialog/Main-Dataset/MTS-Dialog-ValidationSet.csv",
    "test": "MTS-Dialog/Main-Dataset/MTS-Dialog-TestSet-1-MEDIQA-Chat-2023.csv",
}
dataset = load_dataset(path="csv", data_files=data_files)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['ID', 'section_header', 'section_text', 'dialogue'],
        num_rows: 1201
    })
    valid: Dataset({
        features: ['ID', 'section_header', 'section_text', 'dialogue'],
        num_rows: 100
    })
    test: Dataset({
        features: ['ID', 'section_header', 'section_text', 'dialogue'],
        num_rows: 200
    })
})


In [None]:
def create_prompt(conversation, summary):
  """Build one supervised training example in `[INST] ... [/INST]` form.

  Args:
    conversation: Text of the doctor/patient dialogue placed inside the
      instruction.
    summary: Target text appended directly after the closing `[/INST]` tag.

  Returns:
    The formatted prompt string.

  The literal `<s>` prefix is deliberately not included: the tokenizer in this
  notebook is configured with `add_bos_token = True`, so writing the marker in
  the text as well would produce a duplicated beginning-of-sequence token.
  """
  # NOTE(review): "resume" is probably meant as "summary"; the wording is left
  # untouched so prompts stay consistent with any existing inference code.
  return f"[INST] Write a resume of the following conversation between a doctor and a patient: {conversation}[/INST]{summary}"

In [None]:
# One {"text": prompt} record per training row: the dialogue is the input and
# the section text is the target.
processed_data = [
    {"text": create_prompt(row["dialogue"], row["section_text"])}
    for row in dataset["train"]
]

processed_dataset = Dataset.from_list(processed_data)
print(processed_dataset)
print(processed_dataset["text"][0])

Dataset({
    features: ['text'],
    num_rows: 1201
})
<s>[INST] Write a resume of the following conversation between a doctor and a patient: Doctor: What brings you back into the clinic today, miss? 
Patient: I came in for a refill of my blood pressure medicine. 
Doctor: It looks like Doctor Kumar followed up with you last time regarding your hypertension, osteoarthritis, osteoporosis, hypothyroidism, allergic rhinitis and kidney stones.  Have you noticed any changes or do you have any concerns regarding these issues?  
Patient: No. 
Doctor: Have you had any fever or chills, cough, congestion, nausea, vomiting, chest pain, chest pressure?
Patient: No.  
Doctor: Great. Also, for our records, how old are you and what race do you identify yourself as?
Patient: I am seventy six years old and identify as a white female.[/INST]The patient is a 76-year-old white female who presents to the clinic today originally for hypertension and a med check.  She has a history of hypertension, os

#### Training Prep

In [None]:
# Turn off the generation cache for training.
model.config.use_cache = False
# Set the option on the model *config*. The previous spelling
# (`model.config_pretraining_tp`) only created an unrelated attribute on the
# model object and left the configuration unchanged.
model.config.pretraining_tp = 1
# Enable gradient checkpointing on the model.
model.gradient_checkpointing_enable()

In [None]:
# Add both BOS and EOS markers when encoding.
tokenizer.add_bos_token = True
tokenizer.add_eos_token = True
# Reuse the EOS token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Prepare the quantised model for k-bit training before attaching adapters.
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings: rank 64, alpha 16, dropout 0.1, no bias terms, applied
# to the listed projection modules.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
    ],
)

# Wrap the prepared model with the adapters.
model = get_peft_model(model, peft_config)

#### Training

In [None]:
# Training hyper-parameters.
# The original per-device batch of 4 ran out of memory on a ~15 GiB GPU. The
# batch is now 1 with 4 gradient-accumulation steps, which keeps the effective
# batch size at 4 while reducing the activations held in memory at once.
training_arguments = TrainingArguments(
    output_dir="./mistral-mtsdialog-finetune",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,  # -1: the run length is determined by num_train_epochs
    # NOTE(review): warmup is likely unused with the plain "constant"
    # scheduler; switch to "constant_with_warmup" if warmup is intended.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant"
)

In [None]:
# Supervised fine-tuning trainer over the "text" column of the prompt dataset,
# without packing and with no explicit maximum sequence length.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=processed_dataset,
    dataset_text_field="text",
    max_seq_length=None,
    packing=False,
)

Map:   0%|          | 0/1201 [00:00<?, ? examples/s]

In [None]:
# Start fine-tuning with the trainer configured above.
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 101.06 MiB is free. Process 2266 has 14.65 GiB memory in use. Of the allocated memory 14.13 GiB is allocated by PyTorch, and 387.37 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF