<a href="https://colab.research.google.com/github/IvanBenedictus/mental_health/blob/main/model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Start from a clean checkout: remove any previous copy of the project
# directory, then clone the repository that holds the training data.
!rm -rf mental_health
!git clone https://github.com/IvanBenedictus/mental_health.git

Cloning into 'mental_health'...
remote: Enumerating objects: 79, done.[K
remote: Counting objects: 100% (79/79), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 79 (delta 35), reused 48 (delta 12), pack-reused 0 (from 0)[K
Receiving objects: 100% (79/79), 8.11 MiB | 13.00 MiB/s, done.
Resolving deltas: 100% (35/35), done.


In [2]:
!pip install datasets peft transformers trl



[0m

In [None]:
!pip install trl --upgrade

In [3]:
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import PeftConfig, PeftModel, LoraConfig, TaskType
from datasets import load_dataset
from trl import DPOTrainer
from tqdm import tqdm

ImportError: cannot import name 'DPOTrainer' from 'trl' (/usr/local/lib/python3.10/dist-packages/trl/__init__.py)

### DPO training of the Mental Mistral SFT model on our dataset

In [None]:
# Read the Hugging Face access token from Colab's secret store so that it
# never appears in the notebook itself.
from google.colab import userdata

HUGGING_FACE_TOKEN = userdata.get('HUGGING_FACE_TOKEN')

In [None]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token fetched from the
# Colab secret store.
login(token=HUGGING_FACE_TOKEN)

In [None]:
# Hub identifiers of the base checkpoint and of the fine-tuned PEFT adapter
base_model = "mistralai/Mistral-7B-Instruct-v0.2"
adapter = "GRMenon/mental-health-mistral-7b-instructv0.2-finetuned-V2"

# Use the GPU when one is visible to torch, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer of the base checkpoint: BOS token added, padding on the left
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    add_bos_token=True,
    trust_remote_code=True,
    padding_side='left',
)

# Policy model: load the checkpoint named in the adapter's config, then wrap
# it with the adapter weights and move it to the selected device
config = PeftConfig.from_pretrained(adapter)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype='auto',
)
model = PeftModel.from_pretrained(model, adapter)
model.to(device)

# Reference model for DPO: the base checkpoint without the adapter
model_ref = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype='auto',
)
model_ref.to(device)

In [None]:
def prompt_responses(samples) -> dict[str, list[str]]:
    """
    Format a batch of preference pairs into prompt/chosen/rejected columns.

    Each question is wrapped in the Mistral instruction markers. The prompt
    deliberately ends at ``[/INST]``: the end-of-sequence marker ``</s>``
    belongs after the model's answer, so placing it at the end of the prompt
    would condition every completion on an end-of-sequence token.

    Args:
        samples: A batch in column layout - a mapping whose "prompt",
            "chosen" and "rejected" entries are sequences of strings.

    Returns:
        A dict with the keys "prompt", "chosen" and "rejected". "prompt"
        holds the list of formatted prompts; "chosen" and "rejected" are
        passed through unchanged.
    """
    return {
        "prompt": [
            "<s> [INST] Read the user query and give solutions that can be implemented \n user:" + question + " [/INST]"
            for question in samples["prompt"]
        ],
        "chosen": samples["chosen"],
        "rejected": samples["rejected"],
    }

def paired_dataset(data_file="data/train.csv", sanity_check=False, cache_dir=None, num_proc=4):
    """
    Read a CSV file and turn it into a prompt/chosen/rejected dataset.

    Args:
        data_file: Path of the CSV file to load.
        sanity_check: When true, keep only the first 1000 rows (or fewer if
            the file is shorter) for a quick trial run.
        cache_dir: Cache directory forwarded to ``load_dataset``.
        num_proc: Number of processes used for the ``map`` step.

    Returns:
        The 'train' split after ``prompt_responses`` has been applied in
        batched mode, with all of the original columns removed.
    """
    all_splits = load_dataset('csv', data_files=data_file, cache_dir=cache_dir)
    rows = all_splits['train']

    if sanity_check:
        subset_size = min(len(rows), 1000)
        rows = rows.select(range(subset_size))

    return rows.map(
        prompt_responses,
        batched=True,
        num_proc=num_proc,
        remove_columns=rows.column_names,
    )

def chars_token_ratio(dataset, tokenizer, nb_examples=None):
    """
    Estimate the average number of characters per token of the 'prompt' field.

    Args:
        dataset: Iterable of examples, each indexable by 'prompt'. It must
            support ``len()`` only when ``nb_examples`` is None.
        tokenizer: Object with an ``is_fast`` attribute. When it is truthy
            the tokenizer is called on the text and ``.tokens()`` of the
            result is counted; otherwise ``tokenizer.tokenize(text)`` is
            counted.
        nb_examples: Maximum number of examples to inspect, taken from the
            start of the dataset. None means the whole dataset.

    Returns:
        Total number of characters divided by total number of tokens.

    Raises:
        ZeroDivisionError: If no token was counted, e.g. for an empty
            dataset or ``nb_examples=0``.
    """
    if nb_examples is None:
        nb_examples = len(dataset)
    total_characters, total_tokens = 0, 0
    # Pairing the examples with range(nb_examples) ends the loop after
    # nb_examples items, so the limit bounds the work and not only the
    # progress bar total.
    for _, example in tqdm(zip(range(nb_examples), dataset), total=nb_examples):
        text = example['prompt']
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))
    return total_characters / total_tokens

In [None]:
# Build the train and eval datasets, keeping only the pairs whose prompt plus
# answer fit into 1024 characters for both the chosen and the rejected answer.
def _within_char_budget(example):
    """Return True when prompt+chosen and prompt+rejected are each at most 1024 characters."""
    prompt_chars = len(example["prompt"])
    if prompt_chars + len(example["chosen"]) > 1024:
        return False
    return prompt_chars + len(example["rejected"]) <= 1024

train_dataset = paired_dataset('/content/mental_health/data/train_data.csv')
train_dataset = train_dataset.filter(_within_char_budget)

# NOTE(review): the eval split is read from the same CSV as the train split -
# confirm whether a separate evaluation file was intended.
eval_dataset = paired_dataset('/content/mental_health/data/train_data.csv')
eval_dataset = eval_dataset.filter(_within_char_budget)

In [None]:
# Tokenize the dataset
# ConstantLengthDataset is not part of the notebook's import cell; import it
# here so this cell does not stop with a NameError.
from trl.trainer import ConstantLengthDataset

# Average characters per token over the training prompts
chars_per_token = chars_token_ratio(train_dataset, tokenizer)

# NOTE(review): train_data is not passed to the DPOTrainer below (it receives
# train_dataset), and prompt_responses is written for column batches, not for
# single examples - confirm whether this packed dataset is still needed.
train_data = ConstantLengthDataset(
    tokenizer,
    train_dataset,
    formatting_func=prompt_responses,
    infinite=True,
    seq_length=1024,
    chars_per_token=chars_per_token,
)

# Prepare training args and PEFT config
# Effective train batch per device: 4 samples x 4 accumulation steps.
# NOTE(review): save_steps is presumably ignored while save_strategy="no" -
# confirm which of the two settings is intended.
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    save_steps=10,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    max_steps=100,
    save_strategy="no",
    logging_steps=1,
    output_dir="new_model/",
    optim="paged_adamw_32bit",
    warmup_steps=10,
    fp16=True,
    report_to="wandb",
    evaluation_strategy="steps",
    eval_steps=20,
)

# PEFT config for LoRA: rank-8 adapters on the listed projection modules
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']
)

# Create DPO Trainer
# NOTE(review): some TRL releases reject an explicit reference model combined
# with peft_config unless force_use_ref_model=True is given - confirm against
# the installed TRL version.
dpo_trainer = DPOTrainer(
    model,
    model_ref,
    args=training_args,
    beta=0.1,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
    max_prompt_length=512,
    max_length=1024,
)