# 1.0 Install Packages and Import Libraries

In [1]:
!pip install -q -U bitsandbytes transformers peft accelerate datasets scipy einops evaluate trl rouge_score wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m250.7 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.6/40.6 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.4/293.4 kB[0m [31m14.7 MB/s[0m eta [3

In [2]:
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np
from huggingface_hub import interpreter_login
from huggingface_hub import login
import wandb
import os

# Log in to the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable instead of
# being hardcoded, so the secret is never stored in the saved notebook.
# When the variable is not set, fall back to the interactive login prompt.
api_token = os.environ.get("HF_TOKEN")
if api_token:
    login(token=api_token)
else:
    interpreter_login()

# 2.0 Load Dataset

In [4]:
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd

# Download the raw counseling-conversation dataset from the Hub
dataset = load_dataset("Amod/mental_health_counseling_conversations")

# Work in pandas to drop rows that repeat a 'Context' (the first occurrence is kept)
df = (
    dataset['train']
    .to_pandas()
    .drop_duplicates(subset='Context', keep='first')
    .reset_index(drop=True)
)

# Back to a Hugging Face Dataset for splitting
filtered_dataset = Dataset.from_pandas(df)

# First cut: 80% train / 20% held out; second cut: held-out half test, half validation
train_testvalid = filtered_dataset.train_test_split(test_size=0.2, seed=42)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

# Reassemble the three splits under one DatasetDict (this rebinds `dataset`)
dataset = DatasetDict(
    dict(
        train=train_testvalid['train'],
        test=test_valid['test'],
        validation=test_valid['train'],
    )
)

# Report how many rows ended up in each split
print(f"Train size: {len(dataset['train'])}")
print(f"Test size: {len(dataset['test'])}")
print(f"Validation size: {len(dataset['validation'])}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

combined_dataset.json:   0%|          | 0.00/4.79M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3512 [00:00<?, ? examples/s]

Train size: 796
Test size: 100
Validation size: 99


In [5]:
# Show the split names, column names, and row counts of the DatasetDict
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Context', 'Response'],
        num_rows: 796
    })
    test: Dataset({
        features: ['Context', 'Response'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['Context', 'Response'],
        num_rows: 99
    })
})


In [6]:
# Inspect the first training example (a dict with 'Context' and 'Response')
print(dataset['train'][0])

{'Context': "My son stole my debit card and lied about it. It's not the first time he has lied. I don't know what to do anymore. I don't know if I should punish him or make him do something. I've tried talking to him and asking if anything was wrong. I have grounded him, but nothing works. What should I do?", 'Response': 'A lot depends on the age of your child, but given that it was a mis-used debit card, I am going to guess he\'s a teen?\xa0 Assuming that, there are a couple of important things to keep in mind...1.\xa0 That he lied about the misuse indicates he knows he was in the wrong.\xa0 That\'s a dreadful feeling - getting caught out- and an impulsive lie may have popped out of his mouth before he thought it through (after all a debit spend is pretty easy to track).\xa0\xa02. Once he lied he stuck with the lie.\xa0 \xa0 Somewhere along the line, even though he knew that you knew he wasn\'t telling the truth, he stuck with his story.\xa0 And as you said, it\'s not the first time h

# 3.0 Configure Bits and Bytes
- Allows the model to be loaded in quantized (and optionally double-quantized) form.
- Quantization: representing a set of numbers with less information, trading precision for memory (e.g. 1.5 --> 1).


In [7]:
# Compute dtype handed to bitsandbytes: 16-bit floats
compute_dtype = torch.float16

# Quantization settings used when loading the model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load the weights in 4-bit
    bnb_4bit_use_double_quant=True,        # enable double quantization
    bnb_4bit_quant_type='nf4',             # 4-bit quantization type
    bnb_4bit_compute_dtype=compute_dtype,  # float16, as set above
)

# 4.0 Load the Pretrained Model in 4-bit (Quantized)

In [8]:
# Load the pretrained base model in 4-bit using the bitsandbytes config.
# (The alternative 'meta-llama/Llama-3.2-1B-Instruct' requires authorization.)
base_model_name = 'Qwen/Qwen2.5-0.5B-Instruct'
device_map = "auto" #{"": 0}
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map=device_map,
    quantization_config=bnb_config,  # 4-bit load with double quantization
    trust_remote_code=True,
    use_cache=False,
    # `token` replaces the deprecated `use_auth_token`; True means "use the
    # token stored by the Hub login".
    token=True,
)



config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

# 5.0 Configure the Tokenizer

In [9]:
# Tokenizer for the base model, padding on the left.
# NOTE(review): confirm left padding is what the training collator expects.
# NOTE(review): confirm add_eos_token / add_bos_token are honored by this
# model's tokenizer class.
tokenizer_kwargs = dict(
    trust_remote_code=True,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
    use_fast=False,
)
tokenizer = AutoTokenizer.from_pretrained(base_model_name, **tokenizer_kwargs)

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

# 6.0 Test the Base Model's Response

In [10]:
# User message used to probe the untuned base model
prompt = "Something happened this summer that I cannot forgive myself for. When I think about what happened, I feel ashamed and guilty even though my loved ones forgave me."

# Chat-style conversation: a system persona followed by the user's message
messages = [
    {"role": "system", "content": "You are a helpful mental health therapist."},
    {"role": "user", "content": prompt},
]

# Render the conversation with the model's chat template. The result stays a
# string (tokenize=False) and ends with the generation prompt, so the model
# continues with the assistant's reply.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Encode the rendered prompt and move the tensors to the model's device
model_inputs = tokenizer([text], return_tensors="pt").to(base_model.device)

# Generate up to 512 new tokens
generated_ids = base_model.generate(**model_inputs, max_new_tokens=512)

# Each output sequence starts with its prompt tokens; slice those off so only
# the newly generated tokens remain
prompt_lengths = [len(input_ids) for input_ids in model_inputs.input_ids]
generated_ids = [
    output_ids[prompt_len:] for prompt_len, output_ids in zip(prompt_lengths, generated_ids)
]

# Decode the first (only) sequence back into text, dropping special tokens
base_response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print("BASE MODEL RESPONSE \n============================================== \n", base_response)

BASE MODEL RESPONSE 
 It's natural to feel shame or guilt after an event that has occurred, but it is important to remember that we are human beings and each of us carries our own weight in life. It is also essential to focus on the positive aspects of our lives, such as our skills, talents, and accomplishments.

Here are some steps you can take to move forward:

1. Identify the reasons why you are feeling shame or guilt: Are there specific situations or people who trigger your feelings? What actions did you do to get away from those situations?

2. Practice self-compassion: Treat yourself with the same kindness and understanding you would want others to treat you with. Avoid criticizing yourself too much and instead focus on how you can make progress towards changing your behavior.

3. Set small goals: Take small steps towards achieving your goals, like reading one book every day, taking time out of your schedule to exercise, or practicing gratitude.

4. Seek support: Talk to someone 

# 7.0 Dataset Preprocessing

## 7.1 Define Preprocessing Functions

### 7.1.1 Change Prompt Format in Each Row

In [11]:
from functools import partial

def create_prompt_formats(sample):
    """Attach a formatted training prompt to a single dataset row.

    Args:
        sample: Mapping with 'Context' and 'Response' entries.

    Returns:
        The same ``sample`` object, modified in place, with a new 'text' entry
        made of the instruction, the user context, and the response, each on
        its own line.
    """
    preamble = "Below is a conversation where the user shares a personal experience or feeling, and the response provides guidance, empathy, and actionable advice to support the user's mental health."

    # One line per section: instruction, tagged user context, tagged response
    sections = (
        preamble,
        f"<USER_CONTEXT>: {sample['Context']}",
        f"<RESPONSE>: {sample['Response']}",
    )
    sample["text"] = "\n".join(sections)

    return sample

### 7.1.2 Get the Model's Maximum Token Limit and Tokenize Each Batch


In [12]:
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
# To get the model's maximum token limit
def get_max_length(model):
    """Return the maximum sequence length advertised by a model's config.

    The config attributes ``n_positions``, ``max_position_embeddings`` and
    ``seq_length`` are checked in that order; the first one holding a truthy
    value wins. If none is set, 1024 is used as the default.

    Args:
        model: Object exposing a ``config`` attribute.

    Returns:
        The maximum length found on the config, or 1024 when none is found.
    """
    config = model.config
    max_length = None
    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(config, length_setting, None)
        if max_length:
            # Message text is kept verbatim so previously logged output still matches
            print(f"Found max lenth: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length

# Tokenize each batch
def preprocess_batch(batch, tokenizer, max_length):
    """Tokenize the 'text' entry of a batch.

    Args:
        batch: Mapping whose 'text' entry holds the prompts to tokenize.
        tokenizer: Callable invoked as ``tokenizer(texts, max_length=..., truncation=True)``.
        max_length: Length limit passed to the tokenizer; truncation is enabled.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, max_length=max_length, truncation=True)
    return encoded

### 7.1.3 Process Entire Dataset
- combines previous functions
- processes entire dataset

In [13]:
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
# To process the entire dataset
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset):
    """Format, tokenize, filter, and shuffle a dataset split.

    Steps, in order:
      1. Add a 'text' prompt to every row via ``create_prompt_formats``.
      2. Tokenize in batches via ``preprocess_batch`` and drop the original
         'Context' and 'Response' columns.
      3. Keep only rows whose ``input_ids`` are strictly shorter than
         ``max_length``.
      4. Shuffle with the given ``seed``.

    Args:
        tokenizer: Tokenizer forwarded to ``preprocess_batch``.
        max_length: Tokenization limit and filter threshold.
        seed: Seed passed to ``shuffle``.
        dataset: Dataset split with 'Context' and 'Response' columns.

    Returns:
        The processed dataset.
    """
    # Step 1: one formatted prompt per row
    with_prompts = dataset.map(create_prompt_formats)#, batched=True)

    # Step 2: batched tokenization; the raw text columns are no longer needed
    tokenize_fn = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    tokenized = with_prompts.map(
        tokenize_fn,
        batched=True,
        remove_columns=['Context', 'Response'],
    )

    # Step 3: drop rows that reach the limit (tokenization truncates to
    # max_length, so such rows may have been cut off)
    def _is_below_limit(sample):
        return len(sample["input_ids"]) < max_length

    within_limit = tokenized.filter(_is_below_limit)

    # Step 4: shuffle with the provided seed
    return within_limit.shuffle(seed=seed)

## 7.2 Start Processing
- process the training and validation dataset

In [14]:
# Fix the random seed (cell timing via %%time is left disabled)
# %%time
from transformers import set_seed

seed = 42
set_seed(seed)

# Sequence-length limit taken from the base model's config
max_length = get_max_length(base_model)
print(max_length)

# Run the preprocessing pipeline on the train split, then the validation split
train_dataset = preprocess_dataset(
    tokenizer=tokenizer,
    max_length=max_length,
    seed=seed,
    dataset=dataset['train'],
)
val_dataset = preprocess_dataset(
    tokenizer=tokenizer,
    max_length=max_length,
    seed=seed,
    dataset=dataset['validation'],
)

Found max lenth: 32768
32768


Map:   0%|          | 0/796 [00:00<?, ? examples/s]

Map:   0%|          | 0/796 [00:00<?, ? examples/s]

Filter:   0%|          | 0/796 [00:00<?, ? examples/s]

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Filter:   0%|          | 0/99 [00:00<?, ? examples/s]

# 8.0 Save the Processed Datasets

In [15]:
from google.colab import drive

# Mount Google Drive so the processed splits can be saved there
drive.mount('/content/drive')

# Target directories on Drive
train_save_path = '/content/drive/My Drive/hf_train_dataset_v4'
val_save_path = '/content/drive/My Drive/hf_val_dataset_v4'

# Write both splits to disk (train first, then validation)
for _split, _target in ((train_dataset, train_save_path), (val_dataset, val_save_path)):
    _split.save_to_disk(_target)

# Report where each split was written
print(f"Train dataset saved to {train_save_path}")
print(f"Validation dataset saved to {val_save_path}")


Mounted at /content/drive


Saving the dataset (0/1 shards):   0%|          | 0/796 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/99 [00:00<?, ? examples/s]

Train dataset saved to /content/drive/My Drive/hf_train_dataset_v4
Validation dataset saved to /content/drive/My Drive/hf_val_dataset_v4


In [16]:
# Show the columns and row count of the preprocessed training split
print(train_dataset)

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 796
})
