In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Handling Reasoning with MaaS Models on Vertex AI using vLLM

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/open-models/get_started_with_oss_maas_reasoning_open_ai_sdk.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fopen-models%2Fget_started_with_oss_maas_reasoning_open_ai_sdk.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/open-models/get_started_with_oss_maas_reasoning_open_ai_sdk.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/get_started_with_oss_maas_reasoning_open_ai_sdk.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/get_started_with_oss_maas_reasoning_open_ai_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/get_started_with_oss_maas_reasoning_open_ai_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/get_started_with_oss_maas_reasoning_open_ai_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/get_started_with_oss_maas_reasoning_open_ai_sdk.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/get_started_with_oss_maas_reasoning_open_ai_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author |
| --- |
| [Ivan Nardini](https://github.com/inardini)|

## Overview

This tutorial explains how to use reasoning capabilities with Model-as-a-Service (MaaS) models on Google Cloud Vertex AI. These models leverage vLLM's reasoning infrastructure to provide both reasoning steps and final conclusions in their outputs.

## What is Reasoning in LLMs?

Reasoning models like DeepSeek R1 are trained to go through a "thinking process" before providing an answer. Depending on the inference framework you use, they usually return:

- **`reasoning_content`**: The internal reasoning steps that led to the conclusion  
- **`content`**: The final answer or conclusion

## Supported MaaS Models on Vertex AI

Currently, Vertex AI supports the following reasoning-capable MaaS models:

| Model | Model ID | Location | Reasoning Support |
| :---- | :---- | :---- | :---- |
| DeepSeek R1 | `deepseek-ai/deepseek-r1-0528-maas` | us-central1 | ✅ Full reasoning |
| DeepSeek v3.1 | `deepseek-ai/deepseek-v3.1-maas` | us-west2 | ✅ With thinking parameter |
| GPT-OSS 20B | `openai/gpt-oss-20b-maas` | us-central1 | ✅ Full reasoning |

### Install required packages


In [None]:
%pip install --upgrade --quiet openai

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# Authenticate only when the Colab package has already been loaded into this
# interpreter; in every other environment this cell is a no-op.
if "google.colab" in sys.modules:
    # Imported inside the branch so the cell never tries to import the Colab
    # package when the check above fails.
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os
import warnings

# fmt: off
PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
# fmt: on
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Default to "" instead of wrapping the lookup in str(): str(None) is the
    # literal string "None", which would silently end up in every endpoint URL.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")

if not PROJECT_ID:
    # Surface the misconfiguration here, where it is easy to fix, rather than
    # as an opaque HTTP error from a later request.
    warnings.warn(
        "PROJECT_ID is not set. Fill in the form field above or set the "
        "GOOGLE_CLOUD_PROJECT environment variable before calling Vertex AI."
    )

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "global")

### Import libraries

Let's import our dependencies. We're grabbing google.auth to handle the login and, of course, the `openai` library that we'll use to actually build and send our prompts.

In [None]:
import openai
from google.auth import default
from google.auth.transport.requests import Request

### Set Up Authentication

Here's where we actually grab the auth token. `default()` finds the credentials you provided earlier, and `credentials.refresh()` trades them in for a fresh, ready-to-use access token.

This token is our ticket for accessing the Vertex AI API.

In [None]:
# Get default credentials
# `default()` returns a (credentials, project) pair. `project` is bound here,
# but the later cells build their endpoint URLs from PROJECT_ID instead.
credentials, project = default(
    scopes=["https://www.googleapis.com/auth/cloud-platform"]
)
# Kept as a named variable so the same transport can be reused for a later
# `credentials.refresh(request)` call.
request = Request()
# Refresh so that `credentials.token` is populated; that token is what the
# OpenAI client receives as its `api_key` in the cells below.
credentials.refresh(request)

## Basic Usage

### Initialize the OpenAI Client

We're creating an instance of the `openai` client and we're setting the `base_url` to our own Vertex AI endpoint. Then, for the `api_key`, we just pass in that Google Cloud access token we just generated. And just like that, the openai SDK is now a client for Vertex AI.


In [None]:
# Region that serves the model used in the next request.
MODEL_LOCATION = "us-central1"

# Vertex AI endpoint the OpenAI client will talk to (regional host + project path).
vertex_endpoint_url = (
    f"https://{MODEL_LOCATION}-aiplatform.googleapis.com/v1beta1/"
    f"projects/{PROJECT_ID}/locations/{MODEL_LOCATION}/endpoints/openapi"
)

# The Google Cloud access token stands in for the OpenAI API key.
client = openai.OpenAI(api_key=credentials.token, base_url=vertex_endpoint_url)

### Make a Request with Reasoning

Time to ask our first question! We're calling the deepseek-r1 model and asking it to "Think step by step."

Check out the response. The model literally shows its work inside `<think>` tags before giving the final answer. This is great for debugging or for building apps where you want to show users how the AI got to its conclusion.

In [None]:
# Single, non-streaming chat completion against DeepSeek R1.
response = client.chat.completions.create(
    model="deepseek-ai/deepseek-r1-0528-maas",
    stream=False,
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "How many r's are in strawberry? Think step by step"},
    ],
)

# Leave the completion object as the cell's last expression so it is displayed.
response

As you saw, the DeepSeek model gives us everything in one big string, and it is up to you to unpack it. It takes just some simple Python string handling to pull out the text between the `<think>` tags. This lets you separate the "behind-the-scenes" reasoning from the clean, final answer you'd show to a user.

> **Note**: You will see later that, for the GPT-OSS family, the reasoning and the final response are populated in `reasoning_content` and `content` respectively.

In [None]:
# `content` is optional on a chat message; fall back to "" so the string
# operations below cannot fail on None.
response_text = response.choices[0].message.content or ""

start_tag = "<think>"
end_tag = "</think>"

# Split once on the closing tag. partition() returns (text, "", "") when the
# tag is missing, which avoids slicing from `-1 + len(end_tag)` and cutting the
# first characters off the answer.
reasoning_part, closing_tag, answer_part = response_text.partition(end_tag)

if closing_tag:
    # Keep only what follows the opening tag. If the opening tag is absent,
    # everything before the closing tag is treated as reasoning.
    reasoning_content = reasoning_part.split(start_tag, 1)[-1].strip()
    final_answer = answer_part.strip()
    print("Extracted Reasoning:\n", reasoning_content)
else:
    # No closing tag at all: the whole response is the final answer.
    reasoning_content = ""
    final_answer = response_text.strip()
    print("No reasoning content found in the expected format.")

print("\nExtracted Final Answer:\n", final_answer)

## Advanced Features

### Controlling Reasoning Behavior

At this point, we know how the model can think but we can be more explicit about wanting it to think.

To achieve that, we pass an `extra_body` parameter with `{"chat_template_kwargs": {"thinking": True}}`. It's a good way to be sure the model knows you want to see its thought process.


#### For DeepSeek Models

Use the `thinking` parameter to enable/disable reasoning.

In [None]:
# The next model is served from a different region, so switch location.
MODEL_LOCATION = "us-west2"

# Same endpoint pattern as before, rebuilt for the new region.
vertex_endpoint_url = (
    f"https://{MODEL_LOCATION}-aiplatform.googleapis.com"
    f"/v1beta1/projects/{PROJECT_ID}"
    f"/locations/{MODEL_LOCATION}/endpoints/openapi"
)

# Rebind `client` so the following requests go to this region's endpoint.
client = openai.OpenAI(api_key=credentials.token, base_url=vertex_endpoint_url)

In [None]:
# Ask DeepSeek v3.1 with the `thinking` chat-template flag switched on.
response = client.chat.completions.create(
    model="deepseek-ai/deepseek-v3.1-maas",
    extra_body={"chat_template_kwargs": {"thinking": True}},
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "How many r's are in strawberry? Think step by step"},
    ],
)

# Leave the completion object as the cell's last expression so it is displayed.
response

#### For GPT-OSS Models

Now let's chat with the GPT-OSS model. This one is slightly different.  It has a `reasoning_effort` parameter you can set to control reasoning depth.

The response object also separates the model's thoughts into a `reasoning_content` field and the final answer into `content`.

In [None]:
# GPT-OSS is served from us-central1, so point the client back at that region.
MODEL_LOCATION = "us-central1"

# Same endpoint pattern, rebuilt for this region.
vertex_endpoint_url = (
    f"https://{MODEL_LOCATION}-aiplatform.googleapis.com"
    f"/v1beta1/projects/{PROJECT_ID}"
    f"/locations/{MODEL_LOCATION}/endpoints/openapi"
)

# Rebind `client` so the following requests go to this region's endpoint.
client = openai.OpenAI(api_key=credentials.token, base_url=vertex_endpoint_url)

In [None]:
# Ask GPT-OSS and set how much reasoning effort it should spend.
response = client.chat.completions.create(
    model="openai/gpt-oss-20b-maas",
    reasoning_effort="high",  # Options: "low", "medium", "high"
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "How many r's are in strawberry? Think step by step"},
    ],
)

# Leave the completion object as the cell's last expression so it is displayed.
response

### Streaming Responses with Reasoning

By setting `stream=True`, you get the response back token-by-token.

This example below loops through the chunks as they arrive and prints them out. You can see the model "thinking" in real-time as the `reasoning_content` streams in, followed by the final answer.

In [None]:
# Request a streamed completion; the call returns an iterable of chunks.
stream = client.chat.completions.create(
    model="openai/gpt-oss-20b-maas",
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {
            "role": "user",
            "content": "How many r's are in strawberry? Think step by step",
        },
    ],
    stream=True,
)

# Accumulators for the full reasoning text and the full answer text.
reasoning_content = ""
final_content = ""
# Remembers which label was printed last so each label is emitted once per
# section rather than in front of every single token.
current_section = None

for chunk in stream:
    # Skip chunks that carry no choice or no delta.
    if not (chunk.choices and chunk.choices[0].delta):
        continue
    delta = chunk.choices[0].delta

    # Read both fields independently with getattr(). Chaining them with
    # hasattr()/elif would drop `content` whenever the delta merely has a
    # `reasoning_content` attribute, even an empty one.
    reasoning_chunk = getattr(delta, "reasoning_content", None)
    if reasoning_chunk:
        if current_section != "thinking":
            print("\nThinking: ", end="", flush=True)
            current_section = "thinking"
        reasoning_content += reasoning_chunk
        print(reasoning_chunk, end="", flush=True)

    content_chunk = getattr(delta, "content", None)
    if content_chunk:
        if current_section != "answer":
            print("\nAnswer: ", end="", flush=True)
            current_section = "answer"
        final_content += content_chunk
        print(content_chunk, end="", flush=True)

### Structured Output with Reasoning

How many times have you asked an LLM for JSON and gotten back a broken string? With `guided_json`, you define a Pydantic schema, pass it to the model, and it's guaranteed to return valid JSON that matches that schema.

And you still get the `reasoning_content` to see how it came up with the JSON fields and values.


In [None]:
from pydantic import BaseModel


# Schema the generated JSON must follow. Documented with a comment rather than
# a class docstring so the JSON schema generated below stays unchanged.
class Person(BaseModel):
    name: str
    age: int


# JSON-schema dict derived from the model; this is what constrains the output.
json_schema = Person.model_json_schema()

response = client.chat.completions.create(
    model="openai/gpt-oss-20b-maas",
    messages=[{"role": "user", "content": "Generate a JSON with a person's details"}],
    extra_body={"guided_json": json_schema},
)

# `reasoning_content` may be absent from the message, so read it with a default
# instead of direct attribute access (which would raise AttributeError).
print(
    "Reasoning:",
    getattr(response.choices[0].message, "reasoning_content", None),
)
print("JSON Output:", response.choices[0].message.content)

## Complete Example: Comparing Models

Let's wrap it up with a model showdown that brings everything together. It loops through all the models we've looked at, makes sure to set up the client for the correct region for each one, and asks them all the same question.

It's a perfect template for running your own model evaluations to see which one works best for your specific use case.

In [None]:
# Models to compare, each paired with the region that serves it.
models = [
    {"id": "deepseek-ai/deepseek-r1-0528-maas", "location": "us-central1"},
    {"id": "deepseek-ai/deepseek-v3.1-maas", "location": "us-west2"},
    {"id": "openai/gpt-oss-20b-maas", "location": "us-central1"},
]

# Every model receives the same prompt so the answers are comparable.
question = "What is 9.11 vs 9.8? Which is greater?"

for model_info in models:
    # The endpoint is regional, so the client is rebuilt for each model.
    vertex_endpoint_url = (
        f"https://{model_info['location']}-aiplatform.googleapis.com"
        f"/v1beta1/projects/{PROJECT_ID}"
        f"/locations/{model_info['location']}/endpoints/openapi"
    )
    client = openai.OpenAI(api_key=credentials.token, base_url=vertex_endpoint_url)

    # One non-streaming completion per model.
    response = client.chat.completions.create(
        model=model_info["id"],
        stream=False,
        messages=[
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": question},
        ],
    )

    # Header block identifying which model produced the output below.
    print("\n" + "=" * 50)
    print(f"Model: {model_info['id']}")
    print("=" * 50)

    # Show a 200-character preview of the reasoning only when the message
    # carries a non-empty `reasoning_content` field.
    message = response.choices[0].message
    reasoning = getattr(message, "reasoning_content", None)
    if reasoning:
        print(f"Reasoning: {reasoning[:200]}...")
    print(f"Answer: {message.content}")

## Best Practices

### Token Management

- Reasoning content adds to token usage  
- Monitor `response.usage` for token counts  
- Consider disabling reasoning for simple queries

### Error Handling

```py
try:
    response = client.chat.completions.create(...)
except Exception as e:
    # Refresh token if expired
    credentials.refresh(request)
    client.api_key = credentials.token
    response = client.chat.completions.create(...)
```

### Performance Optimization

- Use streaming for long reasoning chains  
- Cache credentials to avoid repeated authentication  
- Choose appropriate model locations for latency

### When to Use Reasoning

✅ **Use reasoning for:**

- Complex problem-solving  
- Mathematical calculations  
- Step-by-step analysis  
- Debugging assistance

❌ **Skip reasoning for:**

- Simple factual queries  
- Quick responses  
- High-volume requests


## Conclusion

MaaS models on Vertex AI with vLLM reasoning provide powerful capabilities for transparent AI decision-making. By understanding how to properly configure and use these models, you can build applications that not only provide answers but also explain their reasoning process.

## Additional Resources

- [Vertex AI Open models documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/maas/use-open-models)  
- [vLLM Reasoning Documentation](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#reasoning-with-llms)  
- [OpenAI Python SDK](https://github.com/openai/openai-python)
