In [None]:
# Install Unsloth for new Colab runtimes
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Install other necessary libraries
!pip install -q "transformers>=4.41.0" "datasets>=2.16.1" "trl>=0.8.3"

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-om1hwrsh/unsloth_4757c3a2ae8749f9a872bb254d191d60
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-om1hwrsh/unsloth_4757c3a2ae8749f9a872bb254d191d60
  Resolved https://github.com/unslothai/unsloth.git to commit e6d0877695452b1291ffa17f961397a3a4c21941
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.7.10 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.7.10-py3-none-any.whl.metadata (8.1 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.g

In [None]:
import torch
from unsloth import FastLanguageModel

# Maximum sequence length to use
max_seq_length = 2048
# The data type for the model. float16 is default, but bfloat16 is better on modern GPUs.
# None means Unsloth will choose the best automatically.
dtype = None
# Set to True to use 4-bit quantization
load_in_4bit = True

# Load the model and tokenizer from Hugging Face
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2b-it-bnb-4bit", # Use Unsloth's pre-quantized model
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.9: Fast Gemma patching. Transformers: 4.53.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.07G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset

# The prompt format we will use for training
# Gemma's instruction-tuned template is ideal for this
prompt_template = """<start_of_turn>user
{}<end_of_turn>
<start_of_turn>model
{}<end_of_turn>"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    contexts   = examples["Context"]
    responses  = examples["Response"]
    texts = []
    for context, response in zip(contexts, responses):
        # Format the text with the prompt template and EOS token
        text = prompt_template.format(context, response) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

# Load the dataset from Hugging Face
dataset = load_dataset("Amod/mental_health_counseling_conversations", split = "train")

# Apply the formatting function
dataset = dataset.map(formatting_prompts_func, batched = True,)

dataset = dataset.shuffle(seed=42)

print("Dataset prepared. Example format:")
print(dataset[0]['text'])

README.md: 0.00B [00:00, ?B/s]

combined_dataset.json: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3512 [00:00<?, ? examples/s]

Map:   0%|          | 0/3512 [00:00<?, ? examples/s]

Dataset prepared. Example format:
<start_of_turn>user
I didn't trust my wife when I found out that she had a new guy friend that she was texting and calling. I investigated him before I found out that he was gay and that there was nothing going on. Now all my wife and I do is fight about trust.<end_of_turn>
<start_of_turn>model
Instead of fighting about trust, is it possible for you and your wife to talk with other about areas which upset each of you?Whenever feelings are hurt, knowing what exactly is problematic and being heard and understood by the partner, goes a long way to building trust.These type of discussions are hard to have, and especially for the first time.  A lot of emotions arise and often people lose their conversation focus from this.If you and your wife have a tough time opening up to each other, consider scheduling time with a couples therapist.Just by each of you committing time to invest in the relationship will show faith in the relationship, and this may contribu

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Add LoRA adapters to the model
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Rank of the LoRA matrices. A higher rank means more parameters to train.
    lora_alpha = 16, # A scaling factor for the LoRA updates.
    lora_dropout = 0.05, # Dropout probability for LoRA layers.
    bias = "none",
    use_gradient_checkpointing = "unsloth", # Crucial for saving memory
    random_state = 3407,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

# Set up training arguments
training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4, # Effective batch size = 2 * 4 = 8
    warmup_steps = 10,
    # Set a small number of steps for a quick demonstration.
    # For a real fine-tune, you might use 100-200 or more.
    max_steps = 100,
    learning_rate = 1e-4,
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "cosine",
    seed = 3407,
    output_dir = "outputs",
)

# Create the SFTTrainer
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training faster but can be tricky. False is safer.
    args = training_args,
)

# Start the training!
trainer_stats = trainer.train()

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.7.9 patched 18 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Unsloth: Tokenizing ["text"]:   0%|          | 0/3512 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,512 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 19,611,648 of 2,525,784,064 (0.78% trained)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmohammad-2024csit1046[0m ([33mmohammad-2024csit1046-kiet-group-of-institutions[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,4.2161
2,4.7035
3,4.551
4,4.7494
5,4.3401
6,4.1668
7,4.0455
8,3.8731
9,3.7889
10,3.4338




In [None]:
from transformers import pipeline
import torch

# Use a pipeline for easy text generation
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.bfloat16)

# A test prompt reflecting a user seeking help
test_prompt = "I don't know how to tell someone how I feel about them. How can I get better at expressing how I feel?"

# Format the prompt using the Gemma template
formatted_prompt = f"<start_of_turn>user\n{test_prompt}<end_of_turn>\n<start_of_turn>model\n"

# Generate a response
outputs = pipe(formatted_prompt, max_new_tokens=256, pad_token_id=tokenizer.eos_token_id)
generated_text = outputs[0]['generated_text']

print("--- MODEL RESPONSE ---")
# Clean up and show only the model's generated part
print(generated_text.split("<start_of_turn>model\n")[1])

Device set to use cuda:0


--- MODEL RESPONSE ---
One way to start is to start by doing some work on yourself to understand how you feel.  The first step is to realize what your needs are and what you need from others. In order for others to be able to understand your needs, they need to know your level of self-awareness.  Once you understand your needs, you can start to communicate them to others in a respectful and honest way.  It is important for both you and the other person to be able to grow and learn from being in a couple. If you are not able to communicate your needs, you may find that the other person does not understand your feelings and may not be able to offer the support you need. You will also find that you are not able to grow and learn from being in a relationship.  If you are able to communicate your needs, you will find that you and your partner have a greater chance of understanding each other's feelings and being able to grow and learn from your relationship.  Another way to get better at ex

In [None]:
# Use a pipeline for easy text generation
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.bfloat16)

# A test prompt reflecting a user seeking help
test_prompt = "I am in my early 20s and I still live with my parents because I can't afford to live alone. My mother says that if I live under her roof I have to follow her rules. She is trying to control my life. What should I do?"

# Format the prompt using the Gemma template
formatted_prompt = f"<start_of_turn>user\n{test_prompt}<end_of_turn>\n<start_of_turn>model\n"

# Generate a response
outputs = pipe(formatted_prompt, max_new_tokens=256, pad_token_id=tokenizer.eos_token_id)
generated_text = outputs[0]['generated_text']

print("--- MODEL RESPONSE ---")
# Clean up and show only the model's generated part
print(generated_text.split("<start_of_turn>model\n")[1])

Device set to use cuda:0


--- MODEL RESPONSE ---
It's important to have an open communication with your parents.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      


In [None]:
# Skip the merge step and directly save to GGUF format
# This is the most reliable method for Ollama

# Save in GGUF format with LoRA weights included
model.save_pretrained_gguf(
    "gemma_mental_health",
    tokenizer,
    quantization_method="q4_k_m"
)

print("GGUF file created: gemma_mental_health-unsloth.Q4_K_M.gguf")

# Also try other quantization methods if needed
model.save_pretrained_gguf("gemma_mental_health", tokenizer, quantization_method="q5_k_m")
model.save_pretrained_gguf("gemma_mental_health", tokenizer, quantization_method="q8_0")

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.
Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.9 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 18/18 [00:01<00:00, 17.12it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving gemma_mental_health/pytorch_model-00001-of-00002.bin...
Unsloth: Saving gemma_mental_health/pytorch_model-00002-of-00002.bin...
Done.


Unsloth: Converting gemma model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at gemma_mental_health into f16 GGUF format.
The output location will be /content/gemma_mental_health/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: gemma_mental_health
INFO:hf-to-gguf:Model architecture: GemmaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-0000

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.
Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Unsloth: Conversion completed! Output location: /content/gemma_mental_health/unsloth.Q4_K_M.gguf
GGUF file created: gemma_mental_health-unsloth.Q4_K_M.gguf
Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.94 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 18/18 [00:00<00:00, 24.75it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving gemma_mental_health/pytorch_model-00001-of-00002.bin...
Unsloth: Saving gemma_mental_health/pytorch_model-00002-of-00002.bin...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q5_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at gemma_mental_health into f16 GGUF format.
The output location will be /content/gemma_mental_health/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: gemma_mental_health
INFO:hf-to-gguf:Model architecture: GemmaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: l

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.
Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Unsloth: Conversion completed! Output location: /content/gemma_mental_health/unsloth.Q5_K_M.gguf
Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.93 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 18/18 [00:00<00:00, 26.19it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving gemma_mental_health/pytorch_model-00001-of-00002.bin...
Unsloth: Saving gemma_mental_health/pytorch_model-00002-of-00002.bin...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at gemma_mental_health into q8_0 GGUF format.
The output location will be /content/gemma_mental_health/unsloth.Q8_0.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: gemma_mental_health
INFO:hf-to-gguf:Model architecture: GemmaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: l

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.


Unsloth: Conversion completed! Output location: /content/gemma_mental_health/unsloth.Q8_0.gguf


In [None]:
# List all GGUF files created
import os
gguf_files = [f for f in os.listdir('/content/gemma_mental_health/') if f.endswith('.gguf')]
print("Created GGUF files:")
for file in gguf_files:
    size_mb = os.path.getsize(f'/content/gemma_mental_health/{file}') / 1024 / 1024
    print(f"- {file} ({size_mb:.1f} MB)")

Created GGUF files:
- unsloth.Q4_K_M.gguf (1554.7 MB)
- unsloth.Q5_K_M.gguf (1754.4 MB)
- unsloth.F16.gguf (4786.0 MB)
- unsloth.Q8_0.gguf (2545.4 MB)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Create directory
!mkdir -p /content/drive/MyDrive/gemma_mental_health_ollama

# Copy the recommended versions (Q4_K_M and Q8_0)
print("Copying files to Google Drive...")
!cp /content/gemma_mental_health/unsloth.Q4_K_M.gguf /content/drive/MyDrive/gemma_mental_health_ollama/
!cp /content/gemma_mental_health/unsloth.Q8_0.gguf /content/drive/MyDrive/gemma_mental_health_ollama/

# Optional: Copy all versions if you have space
# !cp /content/gemma_mental_health/*.gguf /content/drive/MyDrive/gemma_mental_health_ollama/

print("✓ Files saved to Google Drive!")

Mounted at /content/drive
Copying files to Google Drive...
✓ Files saved to Google Drive!


In [None]:
from google.colab import files

# Download the Q8_0 version (2.5GB)
print("Downloading unsloth.Q8_0.gguf (2.5GB)...")
print("This may take a few minutes due to the file size...")
files.download('/content/gemma_mental_health/unsloth.Q8_0.gguf')

Downloading unsloth.Q8_0.gguf (2.5GB)...
This may take a few minutes due to the file size...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>