In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Get started with Llama 3.1 models

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_openai_api_llama3_1.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_openai_api_llama3_1.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_openai_api_llama3_1.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_openai_api_llama3_1.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

## Overview

This notebook demonstrates how to get started with the OpenAI library on Vertex AI and how to use Llama 3.1 models as Model-as-a-Service (MaaS) to build a translation chain and a document question-answering application.

### Objective

- Configure the OpenAI SDK for the Llama 3.1 Chat Completions API
- Chat with Llama 3.1 models with different prompts and model parameters
- Build with Llama 3.1 models
  - Translation Chain.
  - A RAG application using LlamaIndex on Vertex AI.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Get started

### Install Vertex AI SDK for Python and other required packages


In [None]:
! pip3 install --upgrade --quiet google-cloud-aiplatform[langchain] openai
! pip3 install --upgrade --quiet langchain-openai

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# The restart is only performed on Colab, detected by the presence of the
# already-loaded `google.colab` module.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    import IPython

    # Ask the current kernel to shut down with restart=True so the newly
    # installed packages become importable.
    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

# Interactive sign-in is only triggered on Colab, detected by the presence of
# the already-loaded `google.colab` module.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    from google.colab import auth

    # Opens the Colab sign-in flow for the current user.
    auth.authenticate_user()

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Google Cloud project ID used for Vertex AI initialization, bucket creation,
# and the model endpoint URL. Replace the placeholder before running.
PROJECT_ID = "<your-project-id>"  # @param {type:"string"}

# Region used for Vertex AI initialization and for the Cloud Storage bucket.
LOCATION = "us-central1"  # @param {type:"string"}

### Create a Cloud Storage bucket

Create a storage bucket to store tutorial artifacts.

In [None]:
# Bare bucket name (no `gs://` prefix). Replace the placeholder before running.
BUCKET_NAME = "<your-bucket-name>"  # @param {type:"string"}

# Fully qualified bucket URI, used as the Vertex AI staging bucket.
BUCKET_URI = f"gs://{BUCKET_NAME}"

**If your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket in LOCATION under PROJECT_ID. Skip this cell if the bucket
# already exists.
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

### Initialize Vertex AI SDK for Python

In [None]:
import vertexai

# Bind the SDK to the tutorial's project and region, and use the tutorial
# bucket as the staging location.
vertexai.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET_URI,
)

### Import libraries

Import libraries to use in this tutorial.

In [None]:
# Chat completions API
import google.auth.transport.requests
import openai
from google.auth import default, transport
from langchain import PromptTemplate
# Build
from langchain_openai import ChatOpenAI
from vertexai.preview import rag

### Configure OpenAI SDK for the Llama 3.1 Chat Completions API

To configure the OpenAI SDK for the Llama 3.1 Chat Completions API, you need to request the access token and initialize the client pointing to the Llama 3.1 endpoint.


#### Authentication

You can request an access token from the default credentials for the current environment. Note that the access token lives for [1 hour by default](https://cloud.google.com/docs/authentication/token-types#at-lifetime); after expiration, it must be refreshed.


In [None]:
# Load Application Default Credentials. The cloud-platform scope is requested
# explicitly so that credential types which require scopes (for example
# service accounts) can mint a token that Vertex AI accepts.
credentials, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"])

# Refresh once so `credentials.token` is populated; the token is later passed
# to the OpenAI-compatible clients as their API key.
auth_request = transport.requests.Request()
credentials.refresh(auth_request)

Then configure the OpenAI SDK to point to the Llama 3.1 Chat Completions API endpoint.

Note that `us-central1` is the only supported region for Llama 3.1 models using Model-as-a-Service (MaaS).

In [None]:
# Region of the Llama 3.1 MaaS endpoint; kept separate from LOCATION because
# the model is only served from us-central1.
MODEL_LOCATION = "us-central1"

# The OpenAI SDK appends the route (for example `/chat/completions`) to
# `base_url` on its own, so the base URL must stop at the `openapi` endpoint
# rather than embedding the route and a trailing `?`.
# The short-lived access token is used as the API key; recreate the client
# after refreshing the credentials once the token has expired.
client = openai.OpenAI(
    base_url=(
        f"https://{MODEL_LOCATION}-aiplatform.googleapis.com/v1beta1/"
        f"projects/{PROJECT_ID}/locations/{MODEL_LOCATION}/endpoints/openapi"
    ),
    api_key=credentials.token,
)

#### Llama 3.1 models

You can experiment with various supported Llama 3.1 models.

This tutorial uses Llama 3.1 405B through Model-as-a-Service (MaaS) only. Using Model-as-a-Service (MaaS), you can access Llama 3.1 models in just a few clicks without any setup or infrastructure hassles. You can also access Llama models for self-service in Vertex AI Model Garden, allowing you to choose your preferred infrastructure.

[Check out the Llama 3 model card](https://console.cloud.google.com/vertex-ai/publishers/meta/model-garden/llama3) to learn how to deploy Llama 3.1 models on Vertex AI.

In [None]:
# Identifier of the MaaS model; passed as the `model` argument of every
# request in this notebook.
MODEL_ID = "meta/llama3-405b-instruct-maas"  # @param {type:"string"} ["meta/llama3-405b-instruct-maas"]

### Chat with Llama 3.1

Use the Chat Completions API to send a request to the Llama 3.1 model.

#### Hello, Llama 3.1!

In [None]:
# Minimal single-turn request: one user message, no sampling overrides.
response = client.chat.completions.create(
    messages=[
        {"role": "user", "content": "Hello, Llama 3.1!"},
    ],
    model=MODEL_ID,
)

In [None]:
# Print the text of the first choice in the response.
print(response.choices[0].message.content)

#### Ask Llama 3.1 using different model configuration

Use the following parameters to generate different answers:

*   `temperature` to control the randomness of the response
*   `max_tokens` to limit the response length
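*   `top_p` to control the diversity of the response through nucleus sampling (only the most likely tokens whose cumulative probability reaches `top_p` are considered)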
*   `stream` to stream the response back or not


In [None]:
# Generation settings forwarded unchanged to the chat completion request in
# the next cell.
temperature = 1.0  # @param {type:"number"}
max_tokens = 50  # @param {type:"integer"}
top_p = 1.0  # @param {type:"number"}
# Also selects how the response is printed below: chunk by chunk when True,
# all at once when False.
stream = True  # @param {type:"boolean"}

Get the answer.

In [None]:
# Same endpoint as before, now with the sampling and streaming settings chosen
# in the previous cell. The conversation ends on an assistant turn, which
# presumably acts as the opening of the model's reply -- confirm against the
# model's behavior.
response = client.chat.completions.create(
    model=MODEL_ID,
    stream=stream,
    temperature=temperature,
    top_p=top_p,
    max_tokens=max_tokens,
    messages=[
        {"role": "user", "content": "What is Vertex AI?"},
        {"role": "assistant", "content": "Sure, Vertex AI is:"},
    ],
)

Depending on whether the `stream` parameter is enabled, you can print the response chunk by chunk or all at once.

In [None]:
if stream:
    # A streamed response is an iterator of chunks. Some chunks carry no text
    # (their delta content is None, e.g. the closing chunk) or no choices at
    # all, so skip those instead of printing the literal string "None" or
    # failing on an empty list.
    for chunk in response:
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:
            print(piece, end="")
else:
    # Non-streamed responses hold the full text on the first choice.
    print(response.choices[0].message.content)

#### Use Llama 3.1 with different tasks

In this section, you will use Llama 3.1 to perform different tasks including text generation, text summarization, and code generation.

For each task, you'll define a different prompt and submit a request to the model as you did before.

##### Text Generation

In [None]:
# Prompt for the text generation task; sent as the user message in the next cell.
prompt = "Write a poem about a cat who loves to code"

In [None]:
# Send the text generation prompt as a single user turn with default settings.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model=MODEL_ID,
)

In [None]:
# Print the generated text from the first choice.
print(response.choices[0].message.content)

##### Text summarization

In [None]:
# Sample article used as the input of the summarization task.
article = """
Vertex AI: Google's Unified Platform for Machine Learning

Google Cloud's Vertex AI is a comprehensive platform that simplifies the process of building, deploying, and managing machine learning (ML) models and AI applications. It provides a single environment for all your AI needs, from data preparation to model deployment and monitoring.

Vertex AI offers a range of features to cater to various user levels, including:

AutoML: This feature allows you to train models on tabular, image, text, or video data without writing code. It's ideal for users without extensive ML expertise.
Custom Training: For advanced users, Vertex AI provides custom training options, allowing you to use your preferred ML framework and write your own code.
Model Garden: This feature lets you discover, test, and deploy pre-trained models from Vertex AI and open-source sources.
Generative AI: Access Google's powerful large language models (LLMs) to generate text, code, images, and speech, which can be customized and deployed for your applications.
Vertex AI seamlessly integrates with other Google Cloud services like BigQuery for data warehousing, Cloud Storage for data management, and Cloud AI Platform for custom model training. It provides managed infrastructure that can be tailored to your performance and budget needs.

Whether you're a seasoned data scientist or just starting out with AI, Vertex AI simplifies the entire ML lifecycle and empowers you to build and deploy AI solutions effectively.
"""

# Flatten the prompt to a single line. Every run of whitespace (including the
# newlines between the title, paragraphs, and list items) becomes one space;
# simply deleting the newlines would fuse the last word of a line with the
# first word of the next one.
prompt = " ".join(
    ("Summarize the following article in one sentence: " + article).split()
)

In [None]:
# Send the summarization prompt as a single user turn with default settings.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model=MODEL_ID,
)

In [None]:
# Print the summary text from the first choice.
print(response.choices[0].message.content)

##### Code generation

In [None]:
# Prompt for the code generation task; sent as the user message in the next cell.
prompt = "Write a Python function that takes a list of numbers and returns the average. Include error handling for empty lists."

In [None]:
# Send the code generation prompt as a single user turn with default settings.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model=MODEL_ID,
)

In [None]:
# Print the generated code (with any surrounding explanation) from the first choice.
print(response.choices[0].message.content)

### Build with Llama 3.1

In this section, you use Llama 3.1 to build two simple applications. In order:

1.   **Translation Chain** to translate text across multiple languages using Llama 3.1 and LangChain Expression Language (LCEL).

2.   **Document Q&A with RAG using LlamaIndex on Vertex AI** to answer questions about documents with retrieval augmented generation, powered by Llama 3.1.

#### Translation chain

In this scenario, you use LangChain Expression Language (LCEL) to build a simple chain which translates some `text_to_translate` to the specified `target_language`.

##### Initialize the chat interface and the translation prompt template using LangChain

In [None]:
# LangChain chat model backed by the same OpenAI-compatible Vertex AI endpoint.
# The underlying OpenAI client appends the route (e.g. `/chat/completions`) to
# `base_url` on its own, so the base URL must stop at the `openapi` endpoint
# rather than embedding the route and a trailing `?`.
llm = ChatOpenAI(
    model=MODEL_ID,
    base_url=(
        f"https://{MODEL_LOCATION}-aiplatform.googleapis.com/v1beta1/"
        f"projects/{PROJECT_ID}/locations/{MODEL_LOCATION}/endpoints/openapi"
    ),
    api_key=credentials.token,
)

# Prompt template with two placeholders that are filled in when the chain is invoked.
template = """Translate the following {text} to {target_language}:"""

prompt = PromptTemplate(input_variables=["text", "target_language"], template=template)

##### Initialize the chain

In [None]:
# Compose the chain with LCEL: the formatted prompt is piped into the chat model.
chain = prompt | llm

##### Translate a text

In [None]:
# Inputs for the chain; the dictionary keys passed to `invoke` below must match
# the placeholders of the prompt template (`text` and `target_language`).
text_to_translate = "Hello Llama 3.1!"  # @param {type:"string"}
target_language = "Italian"  # @param {type:"string"}

# Fill the template and send the resulting prompt to the model.
response = chain.invoke({"text": text_to_translate, "target_language": target_language})

In [None]:
# The chain returns a message object; its text is on `.content`.
print(response.content)

#### Document Q&A using LlamaIndex on Vertex AI for RAG

In this scenario, you'll use LlamaIndex on Vertex AI for RAG to build a document Q&A.

LlamaIndex on Vertex AI helps you with the end-to-end process of building and deploying context-augmented large language model (LLM) applications, including retrieval-augmented generation (RAG): ingesting data from various sources, transforming it for indexing, and creating numerical representations (embeddings) for semantic understanding. Then, when a user provides a query, LlamaIndex on Vertex AI retrieves relevant information and uses it as context to generate accurate and relevant responses.

[Refer to the documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/llamaindex-on-vertexai) to find detailed instructions on how to get started with the RAG API with LlamaIndex on Vertex AI.

##### Create a RAG corpus

You'll start by creating an index (i.e., corpus) using one of the supported embedding models, for example `text-embedding-004`.

In [None]:
# Currently supports Google first-party embedding models
embedding_model_config = rag.EmbeddingModelConfig(
    publisher_model="publishers/google/models/text-embedding-004"
)

# Name your corpus
DISPLAY_NAME = "<your-corpus-display-name>"  # @param {type:"string"}

rag_corpus = rag.create_corpus(
    display_name=DISPLAY_NAME, embedding_model_config=embedding_model_config
)

In [None]:
# Check the corpus just created
rag.list_corpora()

##### Upload a local document to an existing corpus

For the purpose of this notebook, you'll create a document of interesting facts about llamas and upload it directly to the newly created corpus.

In [None]:
%%writefile llama_facts.txt
Beyond the Spit: Unpacking the Wondrous World of Llamas
They're the fluffy, doe-eyed denizens of the Andes, known for their luxurious wool and, let's face it, their rather pungent spitting habits. But beyond these surface-level quirks lies a fascinating world of adaptation, intelligence, and surprising utility. Buckle up, folks, because we're diving deep into the wondrous world of llamas!

From Mountain Tops to Your Backyard: Llama Evolution is No Myth
Forget unicorns and dragons, the llama's origin story is where the real magic lies. Evolving in the unforgiving terrains of the North American plains millions of years ago, these camelids (yes, they're related to camels!) migrated southwards, eventually conquering the challenging Andean highlands. Their impressive adaptation to high altitude, with their unique blood composition that efficiently transports oxygen, is a testament to their evolutionary prowess.

Pack Animal Extraordinaire: More Than Just a Pretty Fleece
While their luxurious wool, prized for its warmth and softness, is a major draw, llamas are much more than walking sweaters. For centuries, indigenous communities have relied on these gentle giants as pack animals, capable of carrying impressive loads (up to 100 pounds!) across treacherous mountain paths. Forget horsepower, in the Andes, it's all about llama-power!

Social Butterflies with a Side of Spit: Decoding Llama Communication
Llamas are highly social creatures, living in herds led by a dominant male. Their communication is a fascinating mix of soft hums, alarm calls, and yes, the infamous spit. But here's the kicker: llamas primarily reserve their spitting for each other, often to settle disputes or establish dominance within the herd. So, unless you're challenging a llama to a staring contest (not recommended!), you're unlikely to be on the receiving end of that projectile saliva.

Guardian of the Flock: The Llama's Unexpected Talent
Move over, sheepdogs, there's a new sheriff in town! Farmers have discovered the llama's remarkable ability to protect livestock from predators. Their size, assertive nature, and surprisingly powerful kicks are enough to deter coyotes, foxes, and even stray dogs. This unexpected talent has earned them the title of "Guardian Llamas," a role they take very seriously, patrolling their territory and fiercely defending their woolly companions.

The Future is Fuzzy: Llamas Beyond the Andes
With their gentle nature, intelligence, and surprising versatility, llamas are steadily gaining popularity beyond their traditional Andean home. From therapy animals providing comfort and emotional support to eco-friendly lawnmowers, the possibilities seem endless. So, the next time you encounter a llama, remember, there's more to them than meets the eye (or the spit!). They are a testament to nature's ingenuity, captivating us with their unique charm and leaving us in awe of their remarkable journey through time.

Upload the text file to the RAG corpus.

In [None]:
# Upload the local `llama_facts.txt` file written by the previous cell into the
# corpus created earlier.
rag_file = rag.upload_file(
    path="llama_facts.txt",
    corpus_name=rag_corpus.name,
    description="Llama Facts for RAG test",
    display_name="llama_facts",
)

In [None]:
# Check the files just uploaded.
list(rag.list_files(corpus_name=rag_corpus.name))

##### Generate Content

You can provide a query and use the RAG API to retrieve relevant content from the corpus, then concatenate it as the context of the input prompt to generate accurate and relevant responses with Llama 3.1.

In [None]:
# Question used both as the retrieval query and as the user message sent to the model.
question = "What about llama spitting?"  # @param {type:"string"}

Retrieve relevant content from the corpus and concatenate it as the context.

In [None]:
# Query the corpus with the question and flatten the retrieved chunks into one
# single-line string.
# - `similarity_top_k=1` asks for the single closest chunk only.
# - `vector_distance_threshold=0.5` presumably drops chunks whose distance
#   exceeds the threshold (the result may therefore be empty) -- confirm
#   against the RAG API reference.
# Newlines are replaced by a space rather than removed, so a heading and the
# paragraph that follows it are not fused into one word. The loop variable has
# its own name to avoid confusion with the resulting `context` string.
context = " ".join(
    retrieved.text
    for retrieved in rag.retrieval_query(
        rag_resources=[
            rag.RagResource(
                rag_corpus=rag_corpus.name,
            )
        ],
        text=question,
        similarity_top_k=1,
        vector_distance_threshold=0.5,
    ).contexts.contexts
).replace("\n", " ")

In [None]:
# Display the retrieved context to inspect what will be handed to the model.
context

Generate content with the retrieved context.

In [None]:
# Ground the answer in the retrieved context. The context is supplied inside
# the user turn, next to the question, so the model treats it as reference
# material to answer from. Sending it as an `assistant` turn would present it
# as something the model had already said and end the conversation on the
# assistant's side.
response = client.chat.completions.create(
    model=MODEL_ID,
    messages=[
        {
            "role": "system",
            "content": """You are an AI assistant. Your goal is to answer questions using the pieces of context. If you don't know the answer, say that you don't know.""",
        },
        {
            "role": "user",
            "content": f"Context:\n{context}\n\nQuestion: {question}",
        },
    ],
)

In [None]:
# Print the context-grounded answer from the first choice.
print(response.choices[0].message.content)

For comparison, generate content without the context.

In [None]:
# Baseline for comparison: same system instruction and question, but no
# retrieved context is supplied.
response = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": (
                "You are an AI assistant. Your goal is to answer questions "
                "using the pieces of context. If you don't know the answer, "
                "say that you don't know."
            ),
        },
        {"role": "user", "content": question},
    ],
    model=MODEL_ID,
)

In [None]:
# Print the answer produced without retrieved context, for comparison.
print(response.choices[0].message.content)

## Cleaning up

Clean up resources created in this notebook.

To delete a search engine in Vertex AI, check out the following [documentation](https://cloud.google.com/generative-ai-app-builder/docs/delete-engine).

In [None]:
delete_rag_corpus = False  # @param {type:"boolean"}
delete_bucket = False  # @param {type:"boolean"}

if delete_rag_corpus:
    rag_corpus_list = rag.list_corpora()
    for rag_corpus in rag_corpus_list:
        rag.delete_corpus(name=rag_corpus.name)

if delete_bucket:
    ! gsutil rm -r gs://{BUCKET_NAME}