## Mistral Unsupervised fine-tuning using depression symptoms classifier

In [1]:
!pip install -q datasets
!pip install -q accelerate
!pip install -q transformers[torch]
!pip install -q bitsandbytes
!pip install -q peft
!pip install -q trl
!pip install -q huggingface_hub

[0m

In [65]:
import os
import csv
import re
from datasets import Dataset, load_dataset
from accelerate import Accelerator
import torch
import glob
import pandas as pd
import numpy as np
import re
from peft import get_peft_model, PeftConfig, PeftModel, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, GenerationConfig, pipeline, AdamW, Trainer
from trl import SFTTrainer
import huggingface_hub
from math import sqrt

In [2]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
huggingface_hub.login(token="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")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


#### Dataset prep

In [8]:
!git clone https://github.com/HikariLight/dataset.git

fatal: destination path 'dataset' already exists and is not an empty directory.


In [13]:
path = "./dataset/transcripts/"
files = os.listdir(path)

dataset = []

ellie_regex = r'\((.*?)\)'

conversations = []
raw_dataset = {"conversations": [], "patient_dialogue": []}

for f in files:
  if f[-4:] == ".csv":
    with open(path + f, 'r') as file:
      csv_reader = csv.reader(file)

      conversation = ""
      patient_dialogue = ""
      for row in csv_reader:
          if(len(row) > 0):
            convo_turn = row[0].split("\t")
            speaker, content = convo_turn[2], convo_turn[3]

            if speaker == "Ellie":
              if re.search(ellie_regex, content):
                ellie_speech = re.search(ellie_regex, content).group(1)
                conversation += "Doctor: " + ellie_speech + "\n"
              else:
                conversation += "Doctor: " + content + "\n"

            if speaker == "Participant":
              conversation += "Patient: " + content + "\n"
              patient_dialogue += "Patient: " + content + "\n"

      raw_dataset["conversations"].append(conversation)
      raw_dataset["patient_dialogue"].append(patient_dialogue)

In [14]:
dataset = Dataset.from_dict(raw_dataset)

print(dataset)

Dataset({
    features: ['conversations', 'patient_dialogue'],
    num_rows: 189
})


In [15]:
def create_prompt(conversation, summary):
  return f"[INST] Write a summary of the following conversation between a doctor and a patient: {conversation} [/INST]{summary}"

In [16]:
processed_daic_woz = []

for line in dataset:
  prompt = create_prompt(line["conversations"], line["patient_dialogue"])
  processed_daic_woz.append({"text": prompt})

daic_woz_processed_dataset = Dataset.from_list(processed_daic_woz)
print(daic_woz_processed_dataset)

Dataset({
    features: ['text'],
    num_rows: 189
})


#### Sanity check

#### Custom training loop

In [15]:
accelerator = Accelerator()

In [18]:
optimizer = AdamW(model.parameters(), lr=3e-5)



In [72]:
class MistralTrainer(SFTTrainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)

    def compute_emotions(self, text):
      result = None
      combined_scores = {}
    
      text_length = len(text)
    
      for start in range(0, text_length, 512):
        end = min(text_length, start+512)
        chunk = text[start: end]
    
        emotions = self.classifier(chunk)[0]
    
      for emotion in emotions:
        label = emotion["label"]
    
        if label not in combined_scores.keys():
          combined_scores[label] = emotion["score"]
        else:
          combined_scores[label] += emotion["score"]

      return [{'label': label, 'score': score} for label, score in combined_scores.items()]
    
    def distance(self, list1, list2):
      if len(list1) != len(list2):
        raise ValueError("Lists must have the same length.")
    
      distance = 0
    
      for i in range(len(list1)):
        dict1 = list1[i]
        dict2 = list2[i]
    
        if set(dict1.keys()) != set(dict2.keys()):
          raise ValueError("Dictionaries must have the same labels.")
    
        for label in dict1.keys():
          if label != 'label':
            distance += (dict1[label] - dict2[label])**2
    
      return sqrt(distance)

    def compute_loss(self, model, inputs):
        generated_ids = model.generate(**inputs, do_sample=True)

        # input = self.tokenizer.batch_decode(generated_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens = True)[0]
        prompt = self.tokenizer.batch_decode(inputs["input_ids"])[0]
        # summary = self.tokenizer.batch_decode(generated_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens = True)[0]
        summary = self.tokenizer.batch_decode(generated_ids, skip_special_tokens = True)[0]
        
        ground_truth = self.compute_emotions(prompt)
        prediction = self.compute_emotions(summary)

        loss = self.distance(ground_truth, prediction)
        loss = torch.tensor(loss, requires_grad=True)
        return loss

In [20]:
model.config.use_cache = False
model.config_pretraining_tp = 1
model.gradient_checkpointing_enable()

tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.add_bos_token = True

In [21]:
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
)
model = get_peft_model(model, peft_config)

In [73]:
training_arguments = TrainingArguments(
    output_dir="./mistral-daic-woz-unsupervised-finetune",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=10,
    logging_steps=10,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    push_to_hub=True,
)

In [74]:
trainer = MistralTrainer(
    model=model,
    train_dataset=daic_woz_processed_dataset,
    peft_config=peft_config,
    max_seq_length= None,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)



Map:   0%|          | 0/189 [00:00<?, ? examples/s]

In [75]:
trainer.train()

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Step,Training Loss
10,0.076
20,0.075
30,0.1568
40,0.0859


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token

TrainOutput(global_step=48, training_loss=0.0869731620574991, metrics={'train_runtime': 116.9012, 'train_samples_per_second': 1.617, 'train_steps_per_second': 0.411, 'total_flos': 8364163118137344.0, 'train_loss': 0.0869731620574991, 'epoch': 1.0})