In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
# ==========================================================================================
#
#  NOTEBOOK FOR FINE-TUNING GPT-OSS 20B
#
# ------------------------------------------------------------------------------------------
#
#  **DISCLAIMER**
#
#  This notebook is intended for educational purposes only.
#
#  - Date: Aug 2025
#  - Not suitable for production environments.
#  - Use at your own risk.
#  - This notebook is an adaptation of the original Unsloth team notebook that runs on public Colab with T4 GPUs: https://docs.unsloth.ai/get-started/unsloth-notebooks (all credit to them!).
#    Some minor changes were made to how the required packages are installed, because Vertex AI Colab Enterprise manages environments differently than local or public Colab environments.
#
# ==========================================================================================
#
#  Overview:
#
#  This notebook provides a step-by-step guide to fine-tuning the GPT-OSS 20B model using Unsloth.
#  The process involves:
#
#      01. Installing the required libraries.
#      02. Loading the GPT-OSS 20B model.
#      03. Adding LoRA adapters to the model for fine-tuning.
#      04. Preparing the dataset for fine-tuning.
#      05. Fine-tuning the model on the dataset.
#
# ------------------------------------------------------------------------------------------
#
#  Requirements:
#  - A Vertex AI Colab Enterprise environment running on a runtime that has a GPU (e.g., NVIDIA A100).
#
# ==========================================================================================

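Fine-tune GPT-OSS 20B with an A100 40GB using Vertex AI Colab Enterprise
<br/>
To run this, press "*Runtime*" and then "*Run all*" on a Vertex AI Colab Enterprise runtime that has an A100 40GB! Note that the installation section ends by restarting the kernel, so continue from the cell after the restart once the runtime is back.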
<br/>
Thanks to:
<div class="align-center">
<a href="https://unsloth.ai/"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
<a href="https://discord.gg/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord%20button.png" width="145"></a>
<a href="https://docs.unsloth.ai/"><img src="https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true" width="125"></a> Feel free to join their Discord if you need help and ⭐ <i>Star Unsloth on <a href="https://github.com/unslothai/unsloth">GitHub</a></i> ⭐ <br/>it helps support their efforts!
</div>

### Installation

In [1]:
# 1. Upgrade uv, the fast package installer, using pip (quiet output).
!pip install --upgrade -qqq uv

# Pin setuptools to 69.5.1 before the bulk install below.
!pip install --upgrade setuptools==69.5.1

# 2. Use uv to install all packages in a single, consolidated command.
#    NumPy is constrained to < 2.0 to solve the TensorFlow conflict.
#    unsloth, unsloth_zoo, peft, accelerate and transformers are installed
#    straight from their GitHub repositories without a pinned revision, so
#    the versions that get installed will change over time.
print("⏳ Installing all required libraries with NumPy compatibility fix...")
!uv pip install --system --upgrade \
    "numpy<2.0" \
    "torch>=2.8.0" \
    "triton>=3.4.0" \
    torchvision \
    bitsandbytes \
    "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" \
    "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" \
    "peft @ git+https://github.com/huggingface/peft.git" \
    "accelerate @ git+https://github.com/huggingface/accelerate.git" \
    "transformers @ git+https://github.com/huggingface/transformers.git" \
    "protobuf<=3.20.3" \
    "wandb"

print("\n✅✅✅ Installation complete! The NumPy conflict has been resolved.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[2mnvidia-nccl-cu12        [0m [32m-[2m-----------------------------[0m[0m 6.87 MiB/307.43 MiB
[2K[23A   [36m[1mBuilding[0m[39m transformers[2m @ git+https://github.com/huggingface/transformers.git@f4d57
      [32m[1mBuilt[0m[39m accelerate[2m @ git+https://github.com/huggingface/accelerate.git@23cf4ef8a
      [32m[1mBuilt[0m[39m unsloth-zoo[2m @ git+https://github.com/unslothai/unsloth-zoo@26615eb3021b9
[37m⠼[0m [2mPreparing packages...[0m (56/76)
[2msympy                   [0m [32m-----------------[2m-------------[0m[0m 3.24 MiB/6.01 MiB
[2mpillow                  [0m [32m------------------------------[2m[0m[0m 6.32 MiB/6.32 MiB
[2mtorchvision             [0m [32m---------------------------[2m---[0m[0m 7.24 MiB/8.23 MiB
[2mnvidia-cuda-cupti-cu12  [0m [32m----------------------[2m--------[0m[0m 7.07 MiB/9.77 MiB
[2mpandas                  [0m [32m-------------------[

In [None]:
# Force-reinstall setuptools 69.5.1, the version this notebook treats as compatible.
# NOTE(review): presumably needed because the uv install above can replace the
# earlier pin — confirm before removing.
!pip install "setuptools==69.5.1" --force-reinstall

In [None]:
# Restart the notebook kernel by sending signal 9 to the kernel's own process.
# Presumably this is done so that the packages installed above are the ones
# imported by the cells that follow. The process is killed immediately, so
# nothing after this line runs; execute the following cells once the runtime
# has come back.
import os
os.kill(os.getpid(), 9)

### OpenAI GPT-OSS 20B finetuning on Vertex AI Colab Enterprise with Unsloth!



In [1]:
from unsloth import FastLanguageModel
import torch

# Maximum sequence length handed to the loader, and the dtype to load with
# (None asks for auto detection).
max_seq_length = 1024
dtype = None

# Reference list of checkpoints that could be used as `model_name` below.
# It is informational only: nothing in this cell reads it.
# More models at https://huggingface.co/unsloth
fourbit_models = [
    "unsloth/gpt-oss-20b-unsloth-bnb-4bit",   # 20B model using bitsandbytes 4bit quantization
    "unsloth/gpt-oss-120b-unsloth-bnb-4bit",
    "unsloth/gpt-oss-20b",                    # 20B model using MXFP4 format
    "unsloth/gpt-oss-120b",
]

# Load the base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gpt-oss-20b",
    max_seq_length=max_seq_length,  # choose any value for long context
    dtype=dtype,                    # None for auto detection
    load_in_4bit=True,              # 4 bit quantization to reduce memory
    full_finetuning=False,          # LoRA adapters are added in a later cell instead
    # token="hf_...",               # pass one when using gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.4: Fast Gpt_Oss patching. Transformers: 4.56.0.dev0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.495 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

We now add LoRA adapters for parameter-efficient finetuning - this allows us to train only a small fraction of all parameters (about 0.02% in the run shown below).

In [2]:
# Attach LoRA adapters to the attention and MLP projection modules listed
# below; `model` is rebound to the adapter-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=8,  # choose any number > 0; suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, but 0 is optimized
    bias="none",     # any value is supported, but "none" is optimized
    # "unsloth" uses 30% less VRAM and fits 2x larger batch sizes;
    # use True or "unsloth" for very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,   # rank stabilized LoRA is also supported
    loftq_config=None,  # and so is LoftQ
)

Unsloth: Making `model.base_model.model.model` require gradients


<a name="Data"></a>
### Data Prep

The `HuggingFaceH4/Multilingual-Thinking` dataset will be utilized as our example. This dataset, available on Hugging Face, contains reasoning chain-of-thought examples derived from user questions that have been translated from English into four other languages. It is also the same dataset referenced in OpenAI's [cookbook](https://cookbook.openai.com/articles/gpt-oss/fine-tune-transfomers) for fine-tuning. The purpose of using this dataset is to enable the model to learn and develop reasoning capabilities in these four distinct languages.

In [3]:
def formatting_prompts_func(examples):
    """Render every conversation of a batch into one chat-template string.

    Args:
        examples: A batch mapping whose ``"messages"`` entry is a list of
            conversations.

    Returns:
        A dict with a single ``"text"`` key holding one rendered string per
        conversation, in input order.

    Each conversation is passed to ``apply_chat_template`` of the
    module-level ``tokenizer`` with tokenization and the generation prompt
    both disabled.
    """
    rendered = []
    for conversation in examples["messages"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Load the "train" split of the HuggingFaceH4/Multilingual-Thinking dataset.
from datasets import load_dataset
dataset = load_dataset("HuggingFaceH4/Multilingual-Thinking", split="train")
dataset  # bare expression so the notebook displays the dataset summary

Dataset({
    features: ['reasoning_language', 'developer', 'user', 'analysis', 'final', 'messages'],
    num_rows: 1000
})

To format our dataset, we will apply our version of the GPT OSS prompt

In [4]:
# Pass the dataset through Unsloth's standardize_sharegpt helper, then map
# formatting_prompts_func over it in batches, which adds the "text" column
# that function returns.
# NOTE(review): `dataset` is rebound in place, so re-running this cell works
# on the already-processed dataset rather than the freshly loaded one.
from unsloth.chat_templates import standardize_sharegpt
dataset = standardize_sharegpt(dataset)
dataset = dataset.map(formatting_prompts_func, batched = True,)

Let's take a look at the dataset, and check what the 1st example shows

In [5]:
# Print the rendered chat-template text of the first example.
print(dataset[0]['text'])

<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: 2025-08-09

Reasoning: medium

# Valid channels: analysis, commentary, final. Channel must be included for every message.
Calls to these tools must go to the commentary channel: 'functions'.<|end|><|start|>developer<|message|># Instructions

reasoning language: French

You are an AI chatbot with a lively and energetic personality.<|end|><|start|>user<|message|>Can you show me the latest trends on Twitter right now?<|end|><|start|>assistant<|channel|>analysis<|message|>D'accord, l'utilisateur demande les tendances Twitter les plus récentes. Tout d'abord, je dois vérifier si j'ai accès à des données en temps réel. Étant donné que je ne peux pas naviguer sur Internet ou accéder directement à l'API de Twitter, je ne peux pas fournir des tendances en direct. Cependant, je peux donner quelques conseils généraux sur la façon de les trouver.

Je devrais préciser que les 

What is unique about GPT-OSS is that it uses the OpenAI [Harmony](https://github.com/openai/harmony) format, which supports conversation structures, reasoning output, and tool calling.

<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 30 steps to speed things up; for a full run, set `num_train_epochs=1` and remove the `max_steps = 30` setting.

In [6]:
# Build the supervised fine-tuning trainer over the formatted dataset.
from trl import SFTConfig, SFTTrainer
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    args = SFTConfig(
        # Batch size 1 with 4 accumulation steps: 4 examples per optimizer step.
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 30, # Short demo run; remove this when enabling num_train_epochs above.
        learning_rate = 2e-4,
        logging_steps = 1, # Log the training loss at every step.
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # "none" disables experiment-tracker reporting; set e.g. "wandb" to log to WandB
    ),
)

In [7]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.495 GB.
19.354 GB of memory reserved.


In [8]:
# Run the fine-tuning loop and keep the result; its `.metrics` are read by the final stats cell.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 3,981,312 of 20,918,738,496 (0.02% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.1283
2,2.891
3,2.4719
4,2.2563
5,1.9805
6,2.1168
7,1.7968
8,1.6571
9,1.8773
10,1.6982


In [9]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

449.0342 seconds used for training.
7.48 minutes used for training.
Peak reserved memory = 19.354 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 49.004 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [None]:
# For comparison, stats recorded on a T4 GPU:
# 851.6585 seconds used for training.
# 14.19 minutes used for training.
# Peak reserved memory = 12.842 GB.
# Peak reserved memory for training = 0.781 GB.
# Peak reserved memory % of max memory = 87.118 %.
# Peak reserved memory for training % of max memory = 5.298 %.

<a name="Inference"></a>
### Inference
Let's run the model! You can change the system prompt and the user message in the cell below.

In [10]:
from transformers import TextStreamer

# Conversation to complete: a system instruction followed by the user's question.
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant that can solve mathematical problems.",
    },
    {
        "role": "user",
        "content": "Solve x^5 + 3x^4 - 10 = 3.",
    },
]

# Render the conversation with the chat template, ending with the generation
# prompt, and move the resulting tensors to the model's device.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="medium",
)
inputs = inputs.to(model.device)

# Generate up to 128 new tokens, streaming the text to the cell output.
streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, max_new_tokens=128, streamer=streamer)

<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: 2025-08-09

Reasoning: medium

# Valid channels: analysis, commentary, final. Channel must be included for every message.
Calls to these tools must go to the commentary channel: 'functions'.<|end|><|start|>developer<|message|># Instructions

You are a helpful assistant that can solve mathematical problems.<|end|><|start|>user<|message|>Solve x^5 + 3x^4 - 10 = 3.<|end|><|start|>assistant<|channel|>analysis<|message|>The user says: Solve x^5 + 3x^4 - 10 = 3.

We need to solve for x. Likely the request is to find roots of the equation x^5 + 3x^4 - 10 = 3. So let's rewrite the equation: x^5 + 3x^4 - 10 = 3 => x^5 + 3x^4 - 10 - 3 = 0 => x^5 + 3x^4 - 13 = 0.

This is a fifth-degree equation, which generally cannot be solved exactly


We just saw how to fine-tune GPT-OSS 20B with an A100 40GB on Vertex AI Colab Enterprise using Unsloth.
Unsloth has a [Discord](https://discord.gg/unsloth) channel if you need help.
If you like Unsloth's optimizations, show your support and ⭐️ <i>Star Unsloth on <a href="https://github.com/unslothai/unsloth">GitHub</a></i> ⭐️
