In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Fine-tuning GPT OSS 20B with Unsloth on Vertex AI Colab Enterprise with an NVIDIA A100 40GB GPU

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/open-models/fine-tuning/gpt_oss_20B_finetuning_with_unsloth.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fopen-models%2Ffine-tuning%2Fgpt_oss_20B_finetuning_with_unsloth.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/open-models/fine-tuning/gpt_oss_20B_finetuning_with_unsloth.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/fine-tuning/gpt_oss_20B_finetuning_with_unsloth.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/fine-tuning/gpt_oss_20B_finetuning_with_unsloth.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/fine-tuning/gpt_oss_20B_finetuning_with_unsloth.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/fine-tuning/gpt_oss_20B_finetuning_with_unsloth.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/fine-tuning/gpt_oss_20B_finetuning_with_unsloth.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/fine-tuning/gpt_oss_20B_finetuning_with_unsloth.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author(s) |
| --- |
| [Fred Molina](https://github.com/mltuto) |

## Overview

This notebook provides a step-by-step guide to fine-tuning the GPT-OSS 20B model using Unsloth.
  The process involves:

      01. Installing the required libraries.
      02. Loading the GPT-OSS 20B model.
      03. Adding LoRA adapters to the model for fine-tuning.
      04. Preparing the dataset for fine-tuning.
      05. Fine-tuning the model on the dataset.

 **DISCLAIMER**
  This notebook is intended for educational purposes only.

  - Date: Aug 2025
  - Not suitable for production environments.
  - Use at your own risk.
  - This notebook is an adaptation of the original Unsloth team notebook that runs on public Colab with T4 GPUs (https://docs.unsloth.ai/get-started/unsloth-notebooks). All credit goes to them!
  Some minor changes were made to how the required packages are installed, because Vertex AI Colab Enterprise manages environments differently than a local setup or public Colab.
  
Requirements:
A Vertex AI Colab Enterprise environment running on a runtime that has a GPU (e.g., NVIDIA A100).

## Get started

### Install Google Gen AI SDK and other required packages


In [None]:
%pip install --upgrade --quiet google-genai

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# The notebook treats an already-loaded google.colab module as the signal that it
# is running on Google Colab; only then is interactive user authentication run.
running_on_colab = "google.colab" in sys.modules
if running_on_colab:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Resolve the Google Cloud project and region, then build the Gen AI client
# configured for Vertex AI.
import os

from google import genai

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}

# An untouched placeholder or an empty value means "read the project from the
# environment instead".
# NOTE(review): str() turns a missing GOOGLE_CLOUD_PROJECT into the literal
# string "None" rather than failing here.
if PROJECT_ID == "[your-project-id]" or not PROJECT_ID:
    PROJECT_ID = str(os.getenv("GOOGLE_CLOUD_PROJECT"))

# Region is taken from the environment and defaults to "global".
LOCATION = os.getenv("GOOGLE_CLOUD_REGION", "global")

client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

### Install Unsloth and the fine-tuning libraries

In [None]:
# Install the fine-tuning stack for this runtime. The next cell restarts the
# kernel so that the versions installed here are the ones that get imported.
from IPython.display import Markdown, display
# 1. Upgrade uv, the fast package installer
!pip install --upgrade -qqq uv
# 2. Use uv to install all packages in a single, consolidated command.
#    THIS VERSION FORCES NUMPY to a version < 2.0 to solve the TensorFlow conflict.
#    --system makes uv install into the system Python environment rather than a
#    virtual environment.
#    unsloth, unsloth_zoo, peft, accelerate and transformers are pinned to exact
#    git commits; the remaining packages are pinned or bounded by version.
print("⏳ Installing all required libraries with NumPy compatibility fix...")
!uv pip install --system --upgrade \
    "numpy<2.0" \
    "torch>=2.8.0" \
    "triton>=3.4.0" \
    "torchvision==0.23.0" \
    "bitsandbytes==0.46.1" \
    "unsloth @ git+https://github.com/unslothai/unsloth.git@79b46f71b249600488842511c9ee40f27a3989f2" \
    "unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo@26615eb3021b92abbfc8f895da4cd6803322b658" \
    "peft @ git+https://github.com/huggingface/peft.git@a90003f0edd6353f489f48bd2c35080d27bb6974" \
    "accelerate @ git+https://github.com/huggingface/accelerate.git@23cf4ef8a3b58f016f63eeb158b4aa2c3e79fe6f" \
    "transformers @ git+https://github.com/huggingface/transformers.git@f4d57f2f0cdff0f63ee74a1f16f442dfaf525231" \
    "protobuf<=3.20.3" \
    "setuptools==69.5.1" \
    "wandb==0.21.1"
print("✅✅✅ Installation complete!")

In [None]:
# Restart Notebook Kernel
# Sends signal 9 (SIGKILL on POSIX systems) to the current kernel process so the
# process ends immediately and later imports start from a fresh interpreter.
# NOTE(review): this relies on the notebook runtime relaunching the kernel on its
# own — confirm for your environment. Presumably any cells queued after this one
# (e.g. via "Run all") have to be re-run manually once the kernel is back.
import os

os.kill(os.getpid(), 9)

## OpenAI GPT-OSS 20B fine-tuning on Vertex AI Colab Enterprise with Unsloth!

In [None]:
import torch
from unsloth import FastLanguageModel

# Loader settings: sequence-length cap, and dtype (None lets the loader auto-detect).
max_seq_length, dtype = 1024, None

# Load unsloth/gpt-oss-20b together with its tokenizer.
# For gated models, additionally pass token="hf_...".
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gpt-oss-20b",
    max_seq_length=max_seq_length,  # any value can be chosen for long context
    dtype=dtype,  # None -> auto detection
    full_finetuning=False,  # full fine-tuning is available; not used here
    load_in_4bit=True,  # 4-bit quantization to reduce memory use
)

We now add LoRA adapters for parameter-efficient fine-tuning. This lets us train only about 1% of all parameters.

In [None]:
# Attach LoRA adapters to the loaded model for parameter-efficient fine-tuning.
model = FastLanguageModel.get_peft_model(
    model,
    # --- Adapter size and scaling ---
    r=8,  # any number > 0; suggested values: 8, 16, 32, 64, 128
    lora_alpha=16,
    # --- Modules that receive adapters ---
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # --- Settings for which Unsloth has an optimized path ---
    lora_dropout=0,  # any value is supported, 0 is the optimized one
    bias="none",  # any value is supported, "none" is the optimized one
    # "unsloth" checkpointing uses 30% less VRAM and fits 2x larger batch sizes;
    # use True or "unsloth" for very long context.
    use_gradient_checkpointing="unsloth",
    # --- Optional variants, left disabled ---
    use_rslora=False,  # rank-stabilized LoRA
    loftq_config=None,  # LoftQ
    # --- Reproducibility ---
    random_state=3407,
)

### Data Prep

The HuggingFaceH4/Multilingual-Thinking dataset will be utilized as our example. This dataset, available on Hugging Face, contains reasoning chain-of-thought examples derived from user questions that have been translated from English into four other languages. It is also the same dataset referenced in OpenAI's cookbook for fine-tuning. The purpose of using this dataset is to enable the model to learn and develop reasoning capabilities in these four distinct languages.

In [None]:
def formatting_prompts_func(examples):
    """Render every conversation of a batch through the tokenizer's chat template.

    Args:
        examples: Batch mapping whose "messages" entry holds one conversation
            per example.

    Returns:
        A dict with a single "text" key: a list with one rendered string per
        conversation, produced with ``tokenize=False`` and
        ``add_generation_prompt=False``.
    """
    rendered = []
    for conversation in examples["messages"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=False
            )
        )
    return {"text": rendered}


from datasets import load_dataset

# Fetch the "train" split of the HuggingFaceH4/Multilingual-Thinking dataset.
dataset = load_dataset(path="HuggingFaceH4/Multilingual-Thinking", split="train")
dataset  # trailing expression so the cell displays the dataset summary

To format our dataset, we will apply our version of the GPT-OSS prompt.

In [None]:
from unsloth.chat_templates import standardize_sharegpt

# Pass the dataset through Unsloth's standardize_sharegpt, then add the rendered
# "text" column by mapping formatting_prompts_func over it in batches.
dataset = standardize_sharegpt(dataset).map(formatting_prompts_func, batched=True)

Let's take a look at the dataset and check what the first example shows.

In [None]:
# Show the rendered prompt text of the first training example.
first_example = dataset[0]
print(first_example["text"])

What is unique about GPT-OSS is that it uses the OpenAI Harmony format, which supports conversation structures, reasoning output, and tool calling.

### Train the model

Now let's use Hugging Face TRL's SFTTrainer! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 30 steps to speed things up, but for a full run you can set num_train_epochs=1 and remove the step limit with max_steps=None.

In [None]:
from trl import SFTConfig, SFTTrainer

# Hyperparameters for a short demonstration run.
training_args = SFTConfig(
    # Run length: 30 optimizer steps. For one full pass over the data, set
    # num_train_epochs=1 instead of limiting the steps.
    max_steps=30,
    warmup_steps=5,
    # Batching: one sample per device, gradients accumulated over 4 steps.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # Optimizer and learning-rate schedule.
    optim="adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    # Reproducibility, logging and output location.
    seed=3407,
    logging_steps=1,
    report_to="none",
    output_dir="outputs",
)

# Supervised fine-tuning trainer over the formatted dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    args=training_args,
)

# Show current memory stats

In [None]:
# @title Show current memory stats
# Record the GPU's total memory and the peak reserved memory so far, both in GB
# (bytes / 1024**3, rounded to 3 decimals). start_gpu_memory and max_memory are
# read again by the final-stats cell after training.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run the fine-tuning loop. The returned object is kept because the final-stats
# cell reads trainer_stats.metrics["train_runtime"] from it.
trainer_stats = trainer.train()

# Show final memory and time stats

In [None]:
# @title Show final memory and time stats
def _percent_of_max_memory(gigabytes):
    """Express a GB figure as a percentage of max_memory, rounded to 3 decimals."""
    return round(gigabytes / max_memory * 100, 3)


# Peak reserved GPU memory in GB, and its increase over the pre-training reading.
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# The same two figures relative to the GPU's total memory.
used_percentage = _percent_of_max_memory(used_memory)
lora_percentage = _percent_of_max_memory(used_memory_for_lora)

# Training time as reported by the trainer, in seconds and in minutes.
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")

# Memory summary.
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

## Inference
Let's run the model! You can change the instruction and input - leave the output blank!

In [None]:
from transformers import TextStreamer

# Conversation to send: a system instruction followed by a single user question.
system_turn = {
    "role": "system",
    "content": "You are a helpful assistant that can solve mathematical problems.",
}
user_turn = {"role": "user", "content": "Solve x^5 + 3x^4 - 10 = 3."}
messages = [system_turn, user_turn]

# Render the conversation with the chat template (generation prompt appended,
# medium reasoning effort) as PyTorch tensors, then move them to the model's device.
inputs = tokenizer.apply_chat_template(
    messages,
    reasoning_effort="medium",
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device)

# Generate at most 128 new tokens; the streamer handles displaying the output,
# so the return value is discarded.
streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, max_new_tokens=128, streamer=streamer)

We just saw how to fine-tune GPT-OSS 20B with an A100 40GB on Vertex AI Colab Enterprise using Unsloth. Unsloth has a Discord channel. If you like Unsloth optimizations, show your support and ⭐️ Star Unsloth on GitHub ⭐️