In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI LLM and streaming prediction

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official//prediction/llm_streaming_prediction.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fprediction%2Fllm_streaming_prediction.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/prediction/llm_streaming_prediction.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/llm_streaming_prediction.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

## Overview


This tutorial demonstrates how to use Vertex AI LLM for making streaming predictions on large language models.

Learn more about [Vertex AI Language Models](https://cloud.google.com/python/docs/reference/aiplatform/latest/vertexai.language_models.TextGenerationModel#vertexai_language_models_TextGenerationModel_predict_streaming).

### Objective

In this tutorial, you learn how to use Vertex AI LLM to load a pretrained LLM, make predictions, and fine-tune the model.

This tutorial uses the following Google Cloud ML services:

- `Vertex AI LLM`
- `Vertex AI Prediction`

The steps performed include:

- Load a pretrained text generation model.
- Make a non-streaming prediction.
- Load a pretrained text generation model, which supports streaming.
- Make a streaming prediction.
- Load a pretrained chat model.
- Do a local interactive chat session.
- Do a batch prediction with a text generation model.
- Do a batch prediction with a text embedding model.

### Model

The pre-trained models used for this tutorial are from the Vertex AI LLM repository. The models used are:

- text-bison
- chat-bison
- textembedding-gecko

### Costs
This tutorial uses billable components of Google Cloud:

- Vertex AI
- Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage pricing](https://cloud.google.com/storage/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Get Started

### Install Vertex AI SDK for Python and other required packages


In [None]:
! pip3 install --upgrade --quiet google-cloud-aiplatform \
                                pandas

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Restart the kernel, but only when the notebook is running on Google Colab,
# so that the newly installed packages can be used.
if "google.colab" in sys.modules:
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

# Authenticate the user when running on Google Colab; in any other
# environment this cell does nothing.
running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK for Python

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Google Cloud project ID used throughout the notebook; replace the placeholder
# with your own project ID before running the following cells.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
# Region passed to the bucket creation command and to vertexai.init().
LOCATION = "us-central1"  # @param {type:"string"}

#### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [None]:
# Cloud Storage bucket URI. It is passed to vertexai.init() as the staging
# bucket and used as the prefix for copied sample data and batch prediction
# destinations. The name embeds PROJECT_ID.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**If your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket BUCKET_URI in region LOCATION under project PROJECT_ID.
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

#### Set Google Cloud project

In [None]:
import vertexai

# Initialize the SDK with the project, region and staging bucket set earlier.
vertexai.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET_URI,
)

## Import libraries

In [None]:
from vertexai.preview.language_models import TextGenerationModel

## Text generation

First, you load the text-bison model from Vertex AI LLM. Once loaded, you make a prediction.

In [None]:
# Load the text-bison@001 text generation model.
model = TextGenerationModel.from_pretrained("google/text-bison@001")

# Alternative prompt to try: "Brainstorm some ideas combining VR and fitness:"
# Optional keyword arguments for predict():
#   max_output_tokens=128, temperature=0, top_p=1, top_k=5
banana_bread_response = model.predict(
    "What is the best recipe for banana bread? Recipe:",
)
print(banana_bread_response)

## Streaming prediction

Next, you load from Vertex AI LLM a version of the text-bison model that supports streaming prediction. Once loaded, you make a prediction. Note how, on each iteration, the model returns the next successive output in the prediction response.

In [None]:
import datetime

text_generation_model = TextGenerationModel.from_pretrained("text-bison")

# Print a timestamp before, during and after the stream so the arrival time of
# each successive response is visible.
print("Start: ", datetime.datetime.now())
response_stream = text_generation_model.predict_streaming(
    prompt="Count to 100",
    max_output_tokens=1000,
)
for partial_response in response_stream:
    print(datetime.datetime.now())
    print(partial_response)
print("End: ", datetime.datetime.now())

## Chat

Next, you load from Vertex AI LLM a version of the chat-bison model. Once loaded, you do an interactive chat session.

In [None]:
from vertexai.language_models import ChatModel, InputOutputTextPair

chat_model2 = ChatModel.from_pretrained("google/chat-bison@001")

# Optional context and example exchanges supplied when the chat is started.
chat_context = "My name is Ned. You are my personal assistant. My favorite movies are Lord of the Rings and Hobbit."
chat_examples = [
    InputOutputTextPair(
        input_text="Who do you work for?",
        output_text="I work for Ned.",
    ),
    InputOutputTextPair(
        input_text="What do I like?",
        output_text="Ned likes watching movies.",
    ),
]

chat2 = chat_model2.start_chat(context=chat_context, examples=chat_examples)

first_reply = chat2.send_message("Are my favorite movies based on a book series?")
print(first_reply)

In [None]:
# Follow-up question sent on the same chat session object (chat2) as the
# previous message. The prompt typo "where" is corrected to "were".
print(chat2.send_message("When were these books published?"))

## Text embedding

Next, you load from Vertex AI LLM a version of the textembedding-gecko model, and then make an embedding request.

In [None]:
from vertexai.language_models import TextEmbeddingModel

model = TextEmbeddingModel.from_pretrained("google/textembedding-gecko@001")
embeddings = model.get_embeddings(["What is life?"])

# Print the length of the vector of each returned embedding.
for embedding in embeddings:
    print(len(embedding.values))

## Batch prediction

### text-bison model

Now, you do a batch prediction job with a pretrained text-bison model.

In [None]:
dataset = "gs://cloud-samples-data/vertex-ai/prediction/llm/test_table.jsonl"
destination_uri_prefix = f"{BUCKET_URI}/text-bison@001_"
! gsutil cp -r gs://cloud-samples-data/vertex-ai/prediction/llm/text-bison@001_/ {destination_uri_prefix}


from vertexai.language_models import TextGenerationModel

text_generation_model = TextGenerationModel.from_pretrained("text-bison")
batch_job_1 = text_generation_model.batch_predict(
    dataset=dataset,
    destination_uri_prefix=destination_uri_prefix,
    model_parameters={},
)

### text-embedding-gecko model

Next, you do a batch prediction job with a pretrained textembedding-gecko model.

In [None]:
from vertexai.preview.language_models import TextEmbeddingModel

# Input JSONL dataset and the destination prefix handed to batch_predict().
dataset = "gs://cloud-samples-data/vertex-ai/prediction/llm/embedding_input.jsonl"
destination_uri_prefix = f"{BUCKET_URI}/textembedding-gecko@001_"

text_embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")
batch_job_2 = text_embedding_model.batch_predict(
    destination_uri_prefix=destination_uri_prefix,
    dataset=dataset,
    # Optional:
    model_parameters={},
)

## Tuning

Now, you fine-tune a pretrained text-bison model, and make a prediction with the fine-tuned model. Your input data is in JSONL format and stored in a Cloud Storage location.

In [None]:
# Load the text-bison@001 model that the following cells tune.
model3 = TextGenerationModel.from_pretrained("google/text-bison@001")

# Bare last expression: the returned tuned model names are shown as the cell output.
model3.list_tuned_model_names()

In [None]:
# Tuning model
! gsutil cp gs://cloud-samples-data/vertex-ai/prediction/llm/q_a_train_with_context.jsonl {BUCKET_URI}/q_a_train_with_context.jsonl

tuning_job = model3.tune_model(
    training_data=f"{BUCKET_URI}/q_a_train_with_context.jsonl",
    # Optional:
    train_steps=1,
    tuning_job_location="europe-west4",
    tuned_model_location="us-central1",
)

In [None]:
# Make a prediction with the fine tuned model
tuned_model = tuning_job.get_tuned_model()

print(tuned_model.predict("Tell me some ideas combining VR and fitness:"))

In [None]:
# List tuned model names
model3.list_tuned_model_names()

In [None]:
# Get tuned model
tuned_model4 = model3.get_tuned_model(
    tuned_model_name=model3.list_tuned_model_names()[0]
)

In [None]:
# Make a prediction with the fine tuned model
print(tuned_model4.predict("Brainstorm some ideas combining VR and fitness:"))

## Tuning from pandas DataFrame

Now, you fine-tune a pretrained text-bison model, and make a prediction with the fine-tuned model. Your input data is an in-memory pandas DataFrame.

In [None]:
import pandas

# Ten toy training rows: "Input k" paired with "Output k" for k = 1..10.
training_data = pandas.DataFrame(
    data=[
        {"input_text": f"Input {row_number}", "output_text": f"Output {row_number}"}
        for row_number in range(1, 11)
    ]
)

training_data

In [None]:
# Tune pretrained model
model4 = TextGenerationModel.from_pretrained("google/text-bison@001")

tuning_job = model4.tune_model(
    training_data=training_data,
    # Optional:
    train_steps=10,
    tuning_job_location="europe-west4",
    tuned_model_location="us-central1",
)

In [None]:
# Make a prediction with the fine tuned model
tuned_model = tuning_job.get_tuned_model()

print(tuned_model.predict("Tell me some ideas combining VR and fitness:"))

## Cleanup

In [None]:
delete_bucket = False

# Delete job
batch_job_1.delete()
batch_job_2.delete()

if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI