# Install the packages

In [None]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops
!pip install -q wandb

from datasets import load_dataset
from random import randrange

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM, PeftModel

from trl import SFTTrainer

from huggingface_hub import login

import wandb

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.8/245.8 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.1/315.1 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.4/103.4 kB[0m [31m211.9 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m6.6 MB/s[0m eta [36m0

# Choose the desired model and dataset

In [None]:
# --- Base model to fine-tune (alternatives kept as comments for quick switching) ---
# model_name = "meta-llama/Llama-2-7b-hf"
# model_name = "meta-llama/Meta-Llama-3-8B"
model_name = "bigscience/bloomz-560m"

# --- Dataset and the slice of it used for training ---
dataset_name = "heliosbrahma/mental_health_chatbot_dataset"
# split = "train"  # the whole train split
split = "train[:10%]"  # only the first 10% of the train split

# Name for the model after it is fine-tuned.
finetune_model_name = "bloomz-560m-mentalhealthbot"

# --- Device placement handed to from_pretrained ---
# device_map = {"": 0}  # tries to fit the entire model on device 0
# "auto": only a subset of all layers is allocated to one GPU, which should
# lead to lower GPU consumption.
device_map = "auto"

# Define LoRA Parameters Configuration

In [None]:
# LoRA adapter hyperparameters for causal language-model fine-tuning.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,  # rank of the low-rank update matrices
    lora_alpha=64,  # LoRA scaling numerator (alpha)
    lora_dropout=0.1,  # dropout applied inside the LoRA layers
    bias="none",  # leave bias terms untouched
)

# Define QLoRA Parameters Configuration (BitsAndBytes Library)

In [None]:
# 4-bit (QLoRA-style) quantization settings used when loading the base model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Log in to Hugging Face and W&B (wandb), then **load** the dataset

In [None]:
# Authenticate against the external services used later in this notebook.
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub (the merged model is pushed there at the end).
notebook_login()

# Log in to Weights & Biases; the training arguments use report_to="wandb".
wandb.login()
# IPython magic: set the W&B project name through an environment variable.
%env WANDB_PROJECT=python-fine-tuning

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

[34m[1mwandb[0m: Currently logged in as: [33m0132114[0m ([33m0132114-uow-malaysia[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Load only the configured slice of the dataset.
# (To load the dataset without selecting a split: load_dataset(dataset_name).)
dataset = load_dataset(path=dataset_name, split=split)

Downloading readme:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/102k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/172 [00:00<?, ? examples/s]

# Prompt Instruction (Prompt Engineering)

In [None]:
def prompt_instruction_format(sample):
    """Wrap one dataset record in the instruction prompt used for fine-tuning.

    Args:
        sample: Mapping with a ``'text'`` entry holding the conversation text.

    Returns:
        The prompt string: a fixed instruction header followed by the
        sample's text under an ``### input`` section.
    """
    conversation = sample['text']
    # The four-space indents and the trailing indent are part of the prompt
    # text itself and are kept exactly as the template defines them.
    prompt_lines = (
        "### Instruction:",
        "    Use the text input by <HUMAN> to output the text by <ASSISTANT>, "
        "if you don't know the answer, just say you don't know, "
        "don't try to generate an answer.",
        "",
        "    ### input",
        f"    {conversation}",
        "    ",
    )
    return "\n".join(prompt_lines)

# Load the Chosen Model using AutoModelForCausalLM
- The QLoRA (BitsAndBytes) configuration is passed so that the model is loaded in quantized form

In [None]:
# Load the chosen checkpoint with the BitsAndBytes quantization config applied.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
    use_cache=False,
)

# Keep pretraining_tp at 1. Setting it to a value different than 1 would
# activate the more accurate but slower computation of the linear layers,
# which should better match the original logits.
base_model.config.pretraining_tp = 1

# Define the Tokenizer for the model (AutoTokenizer by HuggingFace)

In [None]:
# Tokenizer belonging to the same checkpoint as the base model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Pad on the right, reusing the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Define the Training Arguments and Train the Model
- These arguments will be used by the trainer to fine-tune the model

In [None]:
# Hyperparameters consumed by the trainer, grouped by purpose.
trainingArgs = TrainingArguments(
    # Output and checkpointing
    output_dir=finetune_model_name,
    save_strategy="epoch",
    # Epochs and batch shape (4 samples x 2 accumulation steps per device)
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    group_by_length=False,
    # Memory-related settings
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    # Optimisation schedule
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # Logging
    logging_steps=5,
    report_to="wandb",
    disable_tqdm=True,
    # Reproducibility
    seed=42,
)

# Create the trainer object
- passing the LoRA configurations here so that the training will be done on the low-rank adapter, not the base model

In [None]:
# Build the supervised fine-tuning trainer. Supplying peft_config means the
# training is done on the low-rank adapter, not on the base model.
# NOTE(review): the recorded run of this cell printed a deprecation warning
# asking for these settings to be supplied through SFTConfig instead;
# confirm against the installed trl version.
trainer = SFTTrainer(
    args=trainingArgs,
    model=base_model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=dataset,
    formatting_func=prompt_instruction_format,
    packing=True,
    max_seq_length=2048,
)



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


# Start training

In [None]:
# Run supervised fine-tuning with the trainer object.
print("Start the supervised Fine tuning")
trainer.train()
print("Done Training")

# Stop reporting to wandb by finishing the current W&B run.
wandb.finish()

# Save the trained model. No path is passed here; the next cell reloads it
# from trainingArgs.output_dir.
trainer.save_model()
print("Model saved")

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112877655556177, max=1.0…



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
total_flos,22404931780608.0
train/epoch,3.0
train/global_step,3.0
train_loss,1.17668
train_runtime,28.4044
train_samples_per_second,0.211
train_steps_per_second,0.106


# Load the trained model from the output directory

In [None]:
# Reload the fine-tuned PEFT model from the training output directory.
trained_model = AutoPeftModelForCausalLM.from_pretrained(
    trainingArgs.output_dir,
    device_map=device_map,
    torch_dtype=torch.float16,  # same as bnb_4bit_compute_dtype
    return_dict=True,
    low_cpu_mem_usage=True,
)


# Merge LoRA into the base model and push to the Hugging Face Hub
- Merge the LoRA layers into the base model

In [None]:
# Local folder for the merged model, and the Hub repository it is pushed to.
merged_dir = "merged"
# The name of the model you want on the Hub.
# NOTE(review): this differs from finetune_model_name
# ("bloomz-560m-mentalhealthbot") - confirm that is intentional.
hub_repo_id = "bloomz-560m-mentalhealthchatbot"

# Merge the LoRA layers with the base model.
lora_merged_model = trained_model.merge_and_unload()

# Save the merged model (safetensors format) and the tokenizer side by side.
lora_merged_model.save_pretrained(merged_dir, safe_serialization=True)
tokenizer.save_pretrained(merged_dir)

# Push the merged model and the tokenizer to the same Hub repository.
lora_merged_model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hezronling/bloomz-560m-mentalhealthchatbot/commit/741b5c8e66fd144d9b551d658a52c855165427f1', commit_message='Upload tokenizer', commit_description='', oid='741b5c8e66fd144d9b551d658a52c855165427f1', pr_url=None, pr_revision=None, pr_num=None)