# 1.0 Install Packages and Import Libraries

In [1]:
!pip install -q -U bitsandbytes transformers peft accelerate datasets scipy einops evaluate trl rouge_score wandb

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.6/40.6 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.4/293.4 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.8 MB/s[0m eta [36m0:

In [2]:
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np
from huggingface_hub import interpreter_login
from huggingface_hub import login
import wandb
import os

# # Login to HuggingFace
# interpreter_login()

# Login to Huggingface
api_token = "<API KEY HERE>"
login(token=api_token)

In [3]:
# Login to weights and biases (to track training metrics)
# # wandb.login()
# # %env WANDB_PROJECT=Fine-Tune-QLoRA

#---------------------------------------------------------------

# Set your W&B API key here
os.environ["WANDB_API_KEY"] = "<API KEY HERE>"

# Log in to W&B
wandb.login()

# Set your W&B project
os.environ["WANDB_PROJECT"] = "Fine-Tune-QLoRA"

print("Successfully logged into Weights & Biases!")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33m0132114[0m ([33m0132114-uow-malaysia[0m). Use [1m`wandb login --relogin`[0m to force relogin


Successfully logged into Weights & Biases!


# 2.0 Load the Processed Dataset

In [4]:
from google.colab import drive
from datasets import load_from_disk

drive.mount('/content/drive')

# Load datasets
train_dataset = load_from_disk('/content/drive/My Drive/mental_health_dataset/hf_train_dataset')
val_dataset = load_from_disk('/content/drive/My Drive/mental_health_dataset/hf_val_dataset')

print("Datasets loaded!")

Mounted at /content/drive
Datasets loaded!


In [5]:
print(train_dataset)
print(val_dataset)

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 561
})
Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 70
})


# 3.0 Configure Bits and Bytes

In [56]:
# Ensure the computation uses 16-bit floating-point (reduce memory usage, speed up training)
compute_dtype = getattr(torch, "float16")

# Configure Bits and Bytes to load the model in 4-bit (quantized)
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, # Load the weights in 4 bit
        bnb_4bit_quant_type='nf4', # Use nf4 datatype
        bnb_4bit_compute_dtype=compute_dtype, # Uses 16-bit floating-point (float16)
        bnb_4bit_use_double_quant=True, # Enable double quantization
    )

# 4.0 Load the Pretrained Model in 4-bit (Quantized)

In [57]:
# Load the pretrained model, 'meta-llama/Llama-3.2-1B-Instruct' required authorization
base_model_name = 'Qwen/Qwen2.5-0.5B-Instruct'
device_map = "auto" #{"": 0}
base_model = AutoModelForCausalLM.from_pretrained(base_model_name,
                                                      device_map=device_map,
                                                      quantization_config=bnb_config, # To load in 4-bit and double quantization
                                                      trust_remote_code=True,
                                                      use_cache = False,
                                                      use_auth_token=True)

# 5.0 Configure the Tokenizer

In [58]:
# Configure the tokenizer, use left-padding to optimize memory usage during training.
tokenizer = AutoTokenizer.from_pretrained(base_model_name,
                                          trust_remote_code=True,
                                          padding_side="left",
                                          add_eos_token=True,
                                          add_bos_token=True,
                                          use_fast=False)

tokenizer.pad_token = tokenizer.eos_token

# 6.0 Test the Base Model's Response

In [59]:
# Insert prompt
prompt = "Something happened this summer that I cannot forgive myself for. When I think about what happened, I feel ashamed and guilty even though my loved ones forgave me."

# Format the prompt
messages = [
    {"role": "system", "content": "You are a helpful mental health therapist."},
    {"role": "user", "content": prompt}
]

# Apply chat template
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False, # Keep text as string
    add_generation_prompt=True # Adds additional instructions (if needed)
)

# Tokenize the text
model_inputs = tokenizer([text], return_tensors="pt").to(base_model.device)

# Generate response
generated_ids = base_model.generate(
    **model_inputs,
    max_new_tokens=512,
    temperature=0.1
)

# Get the generated tokens
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

# Decode the tokens into text
base_response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print("BASE MODEL RESPONSE \n============================================== \n", base_response)

BASE MODEL RESPONSE 
 I'm here to listen. It's important to acknowledge your feelings and work through them in a safe space. Can you tell me more about what happened? What was the situation like? How did it make you feel? And how does it affect you now?


# 8.0 Fine Tuning

## 8.1 Configure LoRA and Initialize LoRA adapter (LoRA trainable version of the model)
- LoRA adapter: 2 smaller matrices that are fine tuned

In [43]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Configure the LoRA parameters
config = LoraConfig(
    r=64, # Rank, no. of parameters trained (E.g., for a 512x512 (262144) matrix, if rank = 64, the LoRA adapter uses 512x64 and 64x512 parameters.)
    lora_alpha=128, # Alpha, how much the model adapts to the new training data.
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj'
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

# Enable gradient checkpointing to reduce memory usage during fine-tuning
base_model.gradient_checkpointing_enable()

# Prepare the base model for QLoRA
base_model = prepare_model_for_kbit_training(base_model)

# Get the LoRA trainable version of the model (LoRA adapter)
peft_model = get_peft_model(base_model, config)

# Check the no. of trainable parameters
peft_model.print_trainable_parameters()

trainable params: 5,898,240 || all params: 499,931,008 || trainable%: 1.1798


## 8.2 Define 'TrainingArguments' and Create 'Trainer' Instance

In [44]:
import transformers

# Define the output directory
output_model_name = f'Qwen2.5-Mental-Health-Bot-0.5B-{time.strftime("%Y%m%d")}'
output_dir = f'./{output_model_name}'

# Define the training arguments
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=25, # For the first n steps, learning rate slowly increases
    per_device_train_batch_size=4,
    # per_device_eval_batch_size=2, # evaluation batch size
    gradient_accumulation_steps=4, # Updates model every n batch
    # max_steps=1500, # maximum no. of steps
    num_train_epochs=10,
    learning_rate=2e-5, #(0.00002)
    optim="paged_adamw_8bit", # Optimizer type used to update weights
    logging_steps=10, # Log the loss output every n steps
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=10, # Save model every 500 steps
    # eval_strategy="steps", # evaluation strategy (High GPU RAM)
    # eval_steps = 500, # evaluation steps (High GPU RAM)
    do_eval=True,
    gradient_checkpointing=True,
    report_to="wandb",
    overwrite_output_dir = 'True',
    group_by_length=True,
    # max_eval_samples=1000, # no. of evaluation samples (High GPU RAM)
    # fp16=True,
)

# Disable caching to save memory
peft_model.config.use_cache = False

# Create the 'Trainer' instance
peft_trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

## 8.3 Start Training

In [45]:
# To save memory
del base_model
del bnb_config
torch.cuda.empty_cache()

In [46]:
# Start training the model
peft_trainer.train()

# Stop reporting to wandb
wandb.finish()

Step,Training Loss
10,3.08
20,3.0531
30,2.9898
40,2.8501
50,2.759
60,2.6441
70,2.6752
80,2.555
90,2.6238
100,2.6008


0,1
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▂▂▂▂▁▂▁▂▁▁▁▁▁▂▁▁▁█▁▁▁▁▁▂▁▂▁▁▁▁▁▂▁▂▁
train/learning_rate,▄▇███▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▁▁▁
train/loss,██▇▅▄▃▄▂▃▃▂▃▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▂▂▂▁▂▁▁

0,1
total_flos,3836708377468416.0
train/epoch,9.73759
train/global_step,350.0
train/grad_norm,0.89866
train/learning_rate,0.0
train/loss,2.4754
train_loss,2.58979
train_runtime,1052.8733
train_samples_per_second,5.328
train_steps_per_second,0.332


In [60]:
# Free memory for merging weights
# del peft_trainer
torch.cuda.empty_cache()

# 9.0 Merge Fine Tuned LoRA Adapter to the Base Model

In [61]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Configure Bits and Bytes to load the model in 4-bit (quantized)
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, # Load the weights in 4 bit
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype, # Uses 16-bit floating-point (float16)
        bnb_4bit_use_double_quant=True, # Enable double quantization
    )

base_model_name = 'Qwen/Qwen2.5-0.5B-Instruct'
base_model = AutoModelForCausalLM.from_pretrained(base_model_name,
                                                      device_map='auto',
                                                      quantization_config=bnb_config,
                                                      trust_remote_code=True,
                                                      use_auth_token=True)

In [62]:
eval_tokenizer = AutoTokenizer.from_pretrained(base_model_name, add_bos_token=True, trust_remote_code=True, use_fast=False)
eval_tokenizer.pad_token = eval_tokenizer.eos_token

In [63]:
from peft import PeftModel

final_dir = f'/content/{output_model_name}/checkpoint-350'

# Get the LoRA adapter
ft_model = PeftModel.from_pretrained(base_model, final_dir, torch_dtype=torch.float16, is_trainable=False)

# Merge the LoRA adapter with the base model and save the merged model
lora_merged_model = ft_model.merge_and_unload()

# 10.0 Test the Fine Tuned Model

In [65]:
# Insert prompt
prompt = "Something happened this summer that I cannot forgive myself for. When I think about what happened, I feel ashamed and guilty even though my loved ones forgave me."

# Format the prompt
messages = [
    {"role": "system", "content": "You are a helpful mental health therapist."},
    {"role": "user", "content": prompt}
]

# Apply chat template
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False, # Keep text as string
    add_generation_prompt=True # Adds additional instructions (if needed)
)

# Tokenize the text
model_inputs = tokenizer([text], return_tensors="pt").to(lora_merged_model.device)

# Generate response
generated_ids = lora_merged_model.generate(
    **model_inputs,
    max_new_tokens=512,
    temperature=0.1
)

# Get the generated tokens
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

# Decode the tokens into text
ft_response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print("BASE RESPONSE \n============================================== \n", base_response)
print("\n--------------------------------------------------------------------------------------------------------------------- \n")
print("FINE TUNED RESPONSE \n============================================== \n", ft_response)

BASE RESPONSE 
 I'm here to listen. It's important to acknowledge your feelings and work through them in a safe space. Can you tell me more about what happened? What was the situation like? How did it make you feel? And how does it affect you now?

--------------------------------------------------------------------------------------------------------------------- 

FINE TUNED RESPONSE 
 I'm sorry to hear that you're feeling this way. It's important to remember that everyone makes mistakes and it's okay to feel shame or guilty when we do. However, it's also important to take responsibility for our actions and work on learning from them.

One way to start is by acknowledging your feelings of shame and guilt. You can write down your thoughts and feelings in a journal or talk to someone who understands what you're going through.

Another option is to seek support from friends, family members, or professionals like a therapist. They can provide guidance and help you process your emotions a

# 11.0 Push to HuggingFace

In [66]:
lora_merged_model.save_pretrained("merged",safe_serialization=True)
tokenizer.save_pretrained("merged")

#push merged model to the hub
lora_merged_model.push_to_hub("Qwen2.5-Mental-Health-Bot-0.5B-v1.0") # the name of the model you want
tokenizer.push_to_hub("Qwen2.5-Mental-Health-Bot-0.5B-v1.0")

model.safetensors:   0%|          | 0.00/457M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hezronling/Qwen2.5-Mental-Health-Bot-0.5B-v1.0/commit/fffe7ba0bd957634b0dc83e077c3aea25768e9ec', commit_message='Upload tokenizer', commit_description='', oid='fffe7ba0bd957634b0dc83e077c3aea25768e9ec', pr_url=None, repo_url=RepoUrl('https://huggingface.co/hezronling/Qwen2.5-Mental-Health-Bot-0.5B-v1.0', endpoint='https://huggingface.co', repo_type='model', repo_id='hezronling/Qwen2.5-Mental-Health-Bot-0.5B-v1.0'), pr_revision=None, pr_num=None)

In [None]:
from google.colab import runtime
runtime.unassign()