In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI LLM and streaming prediction

<table align="left">
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/llm_streaming_prediction.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
        <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/llm_streaming_prediction.ipynb">
        <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
        </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/prediction/llm_streaming_prediction.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>
<br/><br/><br/>

## Overview


This tutorial demonstrates how to use Vertex AI LLM for making streaming predictions on large language models.

Learn more about [Vertex AI Language Models](https://cloud.google.com/python/docs/reference/aiplatform/latest/vertexai.language_models.TextGenerationModel#vertexai_language_models_TextGenerationModel_predict_streaming).

### Objective

In this tutorial, you learn how to use Vertex AI LLM to download a pretrained LLM model, make predictions, and fine-tune the model.

This tutorial uses the following Google Cloud ML services:

- `Vertex AI LLM`
- `Vertex AI Prediction`

The steps performed include:

- Load a pretrained text generation model.
- Make a non-streaming prediction
- Load a pretrained text generation model, which supports streaming.
- Make a streaming prediction
- Load a pretrained chat model.
- Do a local interactive chat session.
- Do a batch prediction with a text generation model.
- Do a batch prediction with a text embedding model.

### Model

The pretrained models used for this tutorial are from the Vertex AI LLM repository. The models used are:

- text-bison
- chat-bison
- text-embedding-gecko

### Costs
This tutorial uses billable components of Google Cloud:

- Vertex AI
- Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage pricing](https://cloud.google.com/storage/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Installations

Install the packages required for executing this notebook.

In [None]:
! pip3 install --upgrade --quiet google-cloud-aiplatform

### Colab only: Uncomment the following cell to restart the kernel

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## Before you begin

### Set your project ID

**If you don't know your project ID**, try the following:
* Run `gcloud config list`.
* Run `gcloud projects list`.
* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
# Replace the placeholder with your Google Cloud project ID; later cells read
# this value (bucket name, gsutil commands, vertexai.init).
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Set the project id for the gcloud CLI. `{PROJECT_ID}` is interpolated from
# the Python variable above before the shell command runs.
! gcloud config set project {PROJECT_ID} --quiet

#### Region

You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
# Region used when creating the Cloud Storage bucket and as the `location`
# passed to vertexai.init().
REGION = "us-central1"  # @param {type: "string"}

### Authenticate your Google Cloud account

Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below.

**1. Vertex AI Workbench**
* Do nothing as you are already authenticated.

**2. Local JupyterLab instance, uncomment and run:**

In [None]:
# ! gcloud auth login

**3. Colab, uncomment and run:**

In [None]:
# from google.colab import auth
# auth.authenticate_user()

**4. Service account or other**
* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples.

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [None]:
# Cloud Storage URI used as the SDK staging bucket and as the prefix for batch
# prediction output. The name embeds PROJECT_ID; edit the placeholder parts.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket at BUCKET_URI, passing REGION as the location (-l) and
# PROJECT_ID as the owning project (-p).
! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

### Set up variables

Next, set up some variables used throughout the tutorial.
### Import libraries and define constants

In [None]:
import vertexai
from vertexai.preview.language_models import TextGenerationModel

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [None]:
# Point the SDK at the project, region and staging bucket configured above.
vertexai.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_URI,
)

## Text generation

First, you load the text-bison model from Vertex AI LLM. Once the model is loaded, you make a prediction.

In [None]:
model = TextGenerationModel.from_pretrained("google/text-bison@001")

# Alternative prompt to try: "Brainstorm some ideas combining VR and fitness:"
# Optional keyword arguments that can be added to predict():
#   max_output_tokens=128, temperature=0, top_p=1, top_k=5
print(model.predict("What is the best recipe for banana bread? Recipe:"))

## Streaming prediction

Next, you load from Vertex AI LLM a version of the text-bison model that supports streaming prediction. Once the model is loaded, you make a prediction. Note how, on each iteration, the model returns the next successive output in the prediction response.

In [None]:
import datetime

text_generation_model = TextGenerationModel.from_pretrained("text-bison")

# Print a timestamp before each streamed chunk so the incremental arrival of
# the response is visible between the Start and End markers.
print("Start: ", datetime.datetime.now())
for response in text_generation_model.predict_streaming(
    prompt="Count to 100",
    max_output_tokens=1000,
):
    print(datetime.datetime.now(), response, sep="\n")
print("End: ", datetime.datetime.now())

## Chat

Next, you load a version of the chat-bison model from Vertex AI LLM. Once the model is loaded, you do an interactive chat session.

In [None]:
from vertexai.language_models import ChatModel, InputOutputTextPair

chat_model2 = ChatModel.from_pretrained("google/chat-bison@001")

# Optional priming for the session: a context string plus example
# question/answer exchanges.
chat2 = chat_model2.start_chat(
    context="My name is Ned. You are my personal assistant. My favorite movies are Lord of the Rings and Hobbit.",
    examples=[
        InputOutputTextPair(input_text=question, output_text=answer)
        for question, answer in (
            ("Who do you work for?", "I work for Ned."),
            ("What do I like?", "Ned likes watching movies."),
        )
    ],
)

# First turn of the conversation.
print(chat2.send_message("Are my favorite movies based on a book series?"))

In [None]:
# Follow-up question sent on the same chat session object as the previous cell.
print(chat2.send_message("When were these books published?"))

## Text embedding

Next, you load a version of the text-embedding-gecko model from Vertex AI LLM, and then make an embedding request.

In [None]:
from vertexai.language_models import TextEmbeddingModel

# NOTE(review): `model` was bound to the text generation model in an earlier
# cell; from here on it refers to the embedding model instead.
model = TextEmbeddingModel.from_pretrained("google/textembedding-gecko@001")

# One input text is sent; the loop below handles however many embeddings come back.
embeddings = model.get_embeddings(["What is life?"])
for embedding in embeddings:
    vector = embedding.values
    # Print the number of values in the embedding vector.
    print(len(vector))

## Batch prediction

### text-bison model

Now, you do a batch prediction job with a pretrained text-bison model.

In [None]:
# Input file (JSONL) in the public samples bucket, and the prefix inside
# BUCKET_URI passed to the batch job as its destination.
dataset = "gs://cloud-samples-data/vertex-ai/prediction/llm/test_table.jsonl"
destination_uri_prefix = f"{BUCKET_URI}/text-bison@001_"
# NOTE(review): copies the sample `text-bison@001_/` folder from the public
# bucket to the destination prefix before the job runs — presumably to make
# sure the prefix exists; confirm this step is required.
! gsutil cp -r gs://cloud-samples-data/vertex-ai/prediction/llm/text-bison@001_/ {destination_uri_prefix}


# NOTE(review): this rebinds TextGenerationModel to the non-preview class; the
# earlier import cell took it from vertexai.preview.language_models.
from vertexai.language_models import TextGenerationModel

text_generation_model = TextGenerationModel.from_pretrained("text-bison")
# Keep the returned job handle: the Cleanup cell calls batch_job_1.delete().
batch_job_1 = text_generation_model.batch_predict(
    dataset=dataset,
    destination_uri_prefix=destination_uri_prefix,
    model_parameters={},
)

### text-embedding-gecko model

Next, you do a batch prediction job with a pretrained text-embedding-gecko model.

In [None]:
# Input file (JSONL) in the public samples bucket. This reuses the `dataset`
# and `destination_uri_prefix` names from the text-bison batch cell.
dataset = "gs://cloud-samples-data/vertex-ai/prediction/llm/embedding_input.jsonl"

# Prefix inside BUCKET_URI passed to the batch job as its destination.
destination_uri_prefix = f"{BUCKET_URI}/textembedding-gecko@001_"

# NOTE(review): this rebinds TextEmbeddingModel to the preview class; the
# embedding cell above imported it from vertexai.language_models.
from vertexai.preview.language_models import TextEmbeddingModel

text_embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")
# Keep the returned job handle: the Cleanup cell calls batch_job_2.delete().
batch_job_2 = text_embedding_model.batch_predict(
    dataset=dataset,
    destination_uri_prefix=destination_uri_prefix,
    # Optional:
    model_parameters={},
)

## Tuning

Now, you fine-tune a pretrained text-bison model, and make a prediction with the fine-tuned model. Your input data is in JSONL format and stored in a Cloud Storage location.

In [None]:
# Base text-bison model used by the tuning cells that follow.
model3 = TextGenerationModel.from_pretrained("google/text-bison@001")

# Shown as the cell output: the tuned model names reported for this base model
# before the tuning job below is started.
model3.list_tuned_model_names()

In [None]:
# Start a tuning job on the base model using a JSONL training file stored in
# Cloud Storage. The returned job object is used in the next cell to fetch the
# tuned model.
tuning_job = model3.tune_model(
    training_data="gs://cloud-samples-data/vertex-ai/prediction/llm/q_a_train_with_context.jsonl",
    # Optional:
    train_steps=1,  # presumably kept at 1 to keep the demo short — TODO confirm
    # The job location and the tuned-model location are set to different regions.
    tuning_job_location="europe-west4",
    tuned_model_location="us-central1",
)

In [None]:
# Retrieve the tuned model produced by the tuning job started in the previous cell.
tuned_model = tuning_job.get_tuned_model()

# Send a prompt to the tuned model and print its response.
print(tuned_model.predict("Tell me some ideas combining VR and fitness:"))

In [None]:
# List the tuned model names again, now that a tuning job has been run.
model3.list_tuned_model_names()

In [None]:
# Load a tuned model by name instead of through the tuning job object.
# NOTE(review): `[0]` takes the first listed name, which raises IndexError if
# the list is empty and is not guaranteed to be the model tuned above — confirm.
tuned_model4 = model3.get_tuned_model(
    tuned_model_name=model3.list_tuned_model_names()[0]
)

In [None]:
# Send a prompt to the tuned model that was loaded by name and print its response.
print(tuned_model4.predict("Brainstorm some ideas combining VR and fitness:"))

## Tuning from pandas DataFrame

Now, you fine-tune a pretrained text-bison model, and make a prediction with the fine-tuned model. Your input data is an in-memory pandas DataFrame.

In [None]:
import pandas

# Ten placeholder training rows, numbered 1 through 10, each pairing an
# "input_text" with its matching "output_text".
training_data = pandas.DataFrame(
    data=[
        {"input_text": f"Input {row}", "output_text": f"Output {row}"}
        for row in range(1, 11)
    ]
)

training_data

In [None]:
# Fresh handle on the base text-bison model for the DataFrame-based tuning run.
model4 = TextGenerationModel.from_pretrained("google/text-bison@001")

# Start a tuning job using the in-memory DataFrame as the training data.
# This rebinds `tuning_job`, replacing the job object from the earlier tuning section.
tuning_job = model4.tune_model(
    training_data=training_data,
    # Optional:
    train_steps=10,
    # The job location and the tuned-model location are set to different regions.
    tuning_job_location="europe-west4",
    tuned_model_location="us-central1",
)

In [None]:
# Retrieve the model produced by the DataFrame-based tuning job; this rebinds
# `tuned_model` from the earlier tuning section.
tuned_model = tuning_job.get_tuned_model()

# Send a prompt to the tuned model and print its response.
print(tuned_model.predict("Tell me some ideas combining VR and fitness:"))

## Cleanup

In [None]:
import os

# Delete the two batch prediction jobs created in the Batch prediction section.
# NOTE(review): the tuned models created in the tuning sections are not
# deleted by this cell.
batch_job_1.delete()
batch_job_2.delete()

# Set delete_bucket to True to also remove the Cloud Storage bucket and its
# contents. The bucket is likewise removed when the IS_TESTING environment
# variable is set to a non-empty value.
delete_bucket = False
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil rm -rf {BUCKET_URI}