In [1]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Fine-tuning Mistral-7b on Vertex AI

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/model_garden_pytorch_mistral_peft_tuning.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/model_garden_pytorch_mistral_peft_tuning.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/model_garden_pytorch_mistral_peft_tuning.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>                                                                                               
</table>

Note that this could be used for any model that supports device_map (i.e. loading the model with accelerate).

# Overview
In this notebook you will learn how to fine-tune Mistral-7B with QLoRA.

## Objective

*   **Step 1** Load quantized Mistral-7B model with bnb and run Local inference
*   **Step 2** Fine tune Mistral-7B model with PEFT
    -   Option 1: Finetune and merge Mistral-7B model with peft train docker image (maintained by Vertex AI Model Garden). Optionally run Hyperparameter tuning to find the best parameters.
    -   Option 2: Manually fine-tune Mistral-7B with bnb, peft and SFTTrainer. Merge the LoRA weights with the base Mistral-7B model with peft train.
*   **Step 3** Deploy the finetuned model with vLLM docker image on a Vertex AI Endpoint
*   **Step 4** Run inference to evaluate the finetuned model and compare with the initial local inference with the base model
    -   Option 1: Run inference with Merged model
    -   Option 2: Run inference with Adapter model

# Step 0 - Initialise the notebook
## Define some helper functions and variables:

0. Define some variables and APIs


In [2]:
# Cloud project id.
PROJECT_ID = ""  # @param {type:"string"}

# The region you want to launch jobs in.
REGION = ""  # @param {type:"string"}

# The Cloud Storage bucket for storing experiments output.
# Start with gs:// prefix, e.g. gs://foo_bucket.
BUCKET_URI = ""  # @param {type:"string"}

# Point gcloud at the project above and enable the API named in the command below.
! gcloud config set project $PROJECT_ID
! gcloud services enable language.googleapis.com

import os

# Cloud Storage layout derived from BUCKET_URI:
#   <BUCKET_URI>/temporal    -> STAGING_BUCKET (staging bucket for the Vertex AI SDK and jobs)
#   <BUCKET_URI>/peft        -> EXPERIMENT_BUCKET
#   <BUCKET_URI>/peft/data   -> DATA_BUCKET
#   <BUCKET_URI>/peft/model  -> MODEL_BUCKET (LoRA adapters and merged models)
STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
EXPERIMENT_BUCKET = os.path.join(BUCKET_URI, "peft")
DATA_BUCKET = os.path.join(EXPERIMENT_BUCKET, "data")
MODEL_BUCKET = os.path.join(EXPERIMENT_BUCKET, "model")

# The service account looks like:
# '<account_name>@<project_id>.iam.gserviceaccount.com'
# Please go to https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console
# and create service account with `Vertex AI User` and `Storage Object Admin` roles.
# The service account for deploying fine tuned model.
SERVICE_ACCOUNT = ""  # @param {type:"string"}

Updated property [core/project].


## Installation : *Vertex AI API*


In [3]:
from google.cloud import aiplatform

# Initialise the Vertex AI SDK with the project, region and staging bucket defined above.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

## Define *constants*

In [4]:
# Container images referenced later in this notebook.
# Default vLLM serving image used by deploy_model_vllm.
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20231127_0916_RC00"
# PEFT training image used for both the fine-tuning job and the LoRA merge job.
TRAIN_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-train:20240126_0936_RC00"
# vLLM serving image selected by deploy_model_vllm when quantization_method == "gptq".
VLLM_GPTQ_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:gptq"

## Define common functions
1. Define a wrapper function that passes your query to the model for inference and returns the model's decoded completion (response).

In [5]:
from datetime import datetime


def get_completion(query: str, model, tokenizer) -> str:
    """Generates a completion for ``query`` using an instruction-style prompt.

    Args:
        query: The question inserted into the prompt template.
        model: Object exposing ``generate(**inputs, ...)``. When it has a
            ``device`` attribute the encoded inputs are moved there; otherwise
            ``"cuda:0"`` is used.
        tokenizer: Callable tokenizer that also provides ``eos_token_id`` and
            ``batch_decode``.

    Returns:
        The first element of ``tokenizer.batch_decode`` applied to the ids
        returned by ``model.generate``.
    """
    # Follow the model's own placement instead of assuming the first GPU, so the
    # helper also works when the model was loaded onto another device.
    device = getattr(model, "device", "cuda:0")

    prompt_template = """
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  ### Question:
  {query}

  ### Answer:
  """
    prompt = prompt_template.format(query=query)

    encoded = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

    model_inputs = encoded.to(device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=250,
        do_sample=True,
        # The EOS token id is reused as the padding id for generation.
        pad_token_id=tokenizer.eos_token_id,
    )
    decoded = tokenizer.batch_decode(generated_ids)
    return decoded[0]

2. Define model deployment functions

In [6]:
from typing import Tuple


def create_name_with_datetime(prefix: str) -> str:
    """Returns ``prefix`` followed by a ``_YYYYMMDD_HHMMSS`` timestamp of the current time.

    Used to name training and deployment jobs in Vertex AI at the moment they
    are triggered.
    """
    timestamp_suffix = datetime.now().strftime("_%Y%m%d_%H%M%S")
    return "".join((prefix, timestamp_suffix))


# Can add precision as parameter


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "n1-standard-8",
    accelerator_type: str = "NVIDIA_TESLA_V100",
    accelerator_count: int = 1,
    quantization_method: str = "",
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a model served by vLLM and deploys it to a new Vertex AI endpoint.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            ``<model_name>-endpoint``.
        model_id: Value passed to the vLLM server as ``--model``.
        service_account: Service account used for the deployment.
        machine_type: Machine type of the serving node.
        accelerator_type: Accelerator type attached to the serving node.
        accelerator_count: Number of accelerators; also used as vLLM's
            ``--tensor-parallel-size``.
        quantization_method: Optional value for vLLM's ``--quantization`` flag.
            ``"gptq"`` additionally switches to the GPTQ serving image.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    # The endpoint is created first, before the model is uploaded.
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # Only the GPTQ method needs the dedicated serving image.
    serving_image_uri = (
        VLLM_GPTQ_DOCKER_URI if quantization_method == "gptq" else VLLM_DOCKER_URI
    )

    # The quantization flag is appended only when a method was requested.
    quantization_args = (
        [f"--quantization={quantization_method}"] if quantization_method else []
    )
    server_args = [
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        "--gpu-memory-utilization=0.9",
        "--max-num-batched-tokens=4096",
        "--disable-log-stats",
        *quantization_args,
    ]

    uploaded_model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=serving_image_uri,
        serving_container_command=["python", "-m", "vllm.entrypoints.api_server"],
        serving_container_args=server_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
    )

    uploaded_model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    return uploaded_model, endpoint

## Install necessary packages
First, install the dependencies below to get started. As these features are available on the main branches only, you need to install the libraries below from source.

In [None]:
# Using BitsAndBytes Library for quantization
!pip install -q -U bitsandbytes

# Transformers provides all API for downloading and working with pre-trained models that are in the HF hub.
# Installed from the main branch of the GitHub repository.
!pip install -q -U git+https://github.com/huggingface/transformers.git

# This package provides all the APIs you will need to perform the LoRA technique.
# Pinned to version 0.6.2.
!pip install -q peft==0.6.2

# Powerful Huggingface package, that hides the complexity of the developer trying to write/manage code needed to use multi-GPUs/TPU/fp16.
# Installed from the main branch of the GitHub repository.
!pip install -q -U git+https://github.com/huggingface/accelerate.git

# SentencePiece, pinned to 0.1.99 (presumably required by the tokenizer; confirm).
! pip3 install sentencepiece==0.1.99

# This Huggingface package provides access to the various datasets in the Huggingface hub.
!pip install -q datasets

# This library provides access to the Weights and Biases library to capture various metrics, during the fine-tuning process.
!pip install -q wandb

# Step 1 - Load quantized Mistral-7B model with bnb and run local inference
We'll load the model using QLoRA quantization to reduce the usage of memory


In [8]:
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig)

# Choose the 4-bit compute dtype from the GPU's capabilities: bfloat16 where the
# device reports support for it, float16 otherwise (e.g. T4 or V100, whose
# compute capability is below 8).
compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # loading the base model in 4bit quantization. Also need to check model weights in config file.
    bnb_4bit_use_double_quant=True,  # Double Quantization
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,  # dtype used for computation on the 4-bit weights
)

Now you specify the model ID and then you load it with your previously defined quantization configuration.

In [None]:
model_id = "mistralai/Mistral-7B-v0.1"

# Load Mistral-7B quantized with BitsAndBytesConfig defined above.
# device_map={"": 0} maps the whole model to device index 0.

model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map={"": 0}
)

# Define the tokenizer
# Using AutoTokenizers for creating a tokenizer for Mistral-7B
# NOTE(review): add_eos_token=True presumably appends EOS to every encoded text,
# including the inference prompts encoded in get_completion; confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

Run inference on the base model. The model does not seem to understand your instruction and gives you a list of questions related to your query.

In [None]:
# Ask the quantized base model a question through the get_completion helper and
# print the decoded text it returns.
result = get_completion(query="What is Model Garden?", model=model, tokenizer=tokenizer)
print(result)

# Step 2 - Fine tune Mistral-7B model with PEFT
This section demonstrates how to finetune the Mistral-7b model, merge the finetuned LoRA adapter with the base model

Set the base model id.

In [11]:
# Model id passed to the fine-tuning job as --pretrained_model_id.
base_model_id = "mistralai/Mistral-7B-v0.1"  # @param

## **Option 1** Finetune and merge Mistral-7B model with peft train docker image

### Finetune

Use the Vertex AI SDK to create and run the custom training jobs with Vertex AI Model Garden training images.

This example uses the dataset fredmo/vertexai-qna-500 , a small dataset containing questions and answers about GCP Vertex AI Documentation. You can either use a dataset from huggingface or a custom JSONL dataset in Vertex AI text model dataset format stored in Cloud Storage. The template parameter is optional.

In order to make the finetuning efficient, you can enable quantization when loading the pretrained model for finetuning LoRA models. Precision options include "4bit", "8bit", "float16" (default) and "float32", and the precision can be set via "--precision_mode".

In this section, the finetuned LoRA adapter will be saved to a GCS bucket specified by the variable lora_adapter_dir below; and you merge the LoRa adapter with the base model, and save it to a separate GCS bucket specified by merged_model_output_dir below.

### Finetune with a custom dataset

To use a custom dataset, you should supply a gs:// URI to a JSONL file in Vertex text model dataset format in the dataset_name below.

For example, you can download the template file here https://github.com/thomaslemoullec/QLora_vLLM_Mistral7B/blob/main/vertexAI_q%26a_template.json and upload it to your bucket, then reference the gs://

{"input_text":"TRANSCRIPT: \nREASON FOR EVALUATION:,\n\n LABEL:","output_text":"Chiropractic"}

To use this sample dataset that contains input_text and output_text fields, set dataset_name to gs://cloud-samples-data/vertex-ai/model-evaluation/peft_train_sample.jsonl and template to vertex_sample. For advanced usage with custom dataset fields, see the template example and supply your own JSON template as gs:// URIs.

In [None]:
# Huggingface dataset name or gs:// URI to a custom JSONL dataset.
dataset_name = "fredmo/vertexai-qna-500"  # @param {type:"string"}
# Optional. Template name or gs:// URI to a custom template.
template = ""  # @param {type:"string"}

# Precision passed to the training container as --precision_mode.
finetuning_precision_mode = "float16"

# Worker pool spec.
# Finetunes mistral-7B with 2 V100 (16G each).
machine_type = "n1-highmem-16"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 2

# Finetunes mistral-7B with 1 L4 (24G).
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

replica_count = 1


# Setup training job.
job_name = create_name_with_datetime("mistral-lora-train")
train_job = aiplatform.CustomContainerTrainingJob(
    display_name=job_name,
    container_uri=TRAIN_DOCKER_URI,
)

# Create a GCS folder to store the LORA adapter.
# The training container receives the path with gs:// replaced by /gcs/.
lora_adapter_dir = create_name_with_datetime("mistral-lora-adapter")
lora_output_dir = os.path.join(MODEL_BUCKET, lora_adapter_dir)
lora_output_dir_gcsfuse = lora_output_dir.replace("gs://", "/gcs/")

# Create a GCS folder to store the merged model with the base model and the
# finetuned LORA adapter.
merged_model_dir = create_name_with_datetime("mistral-merged-model")
merged_model_output_dir = os.path.join(MODEL_BUCKET, merged_model_dir)
merged_model_output_dir_gcsfuse = merged_model_output_dir.replace("gs://", "/gcs/")

# Pass training arguments and launch job.
# max_steps is set to 10 here, so this is a short demonstration run.
train_job.run(
    args=[
        "--task=causal-language-modeling-lora",
        f"--pretrained_model_id={base_model_id}",
        f"--dataset_name={dataset_name}",
        f"--output_dir={lora_output_dir_gcsfuse}",
        f"--merge_base_and_lora_output_dir={merged_model_output_dir_gcsfuse}",
        "--lora_rank=16",
        "--lora_alpha=32",
        "--lora_dropout=0.05",
        "--warmup_steps=10",
        "--max_steps=10",
        "--learning_rate=2e-4",
        f"--precision_mode={finetuning_precision_mode}",
        f"--template={template}",
    ],
    replica_count=replica_count,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    boot_disk_size_gb=500,
)

print("The finetuned Lora adapter can be found at: ", lora_output_dir)
print(
    "The finetuned Lora adapter merged with the base model can be found at: ",
    merged_model_output_dir,
)

## **Option 2** Manually finetune Mistral-7B and merge the LoRA weights afterwards

### Load dataset for finetuning

Let's load a dataset of questions and answers about Vertex AI.

In [None]:
from datasets import load_dataset

# Load the train split of the same dataset that Option 1 uses by default.
data = load_dataset("fredmo/vertexai-qna-500", split="train")  # Full train split

# Explore the data
# Convert to a pandas DataFrame and display the first 10 rows.
df = data.to_pandas()
df.head(10)

Instruction finetuning - prepare the dataset in a "prompt" format so the model can better understand it:
1. the function generate_prompt takes the instruction and output and generates a prompt
2. shuffle the dataset
3. tokenize the dataset

In [None]:
def generate_prompt(data_point):
    """Builds the instruction-style training text for one dataset record.

    :param data_point: dict: record providing the "input_text" and "output_text" keys
    :return: str: a fixed preamble followed by an "### Instruction:" section
        holding the input text and a "### Response:" section holding the output text
    """
    preamble = (
        "Below is an instruction that describes a question. Write a response that "
        "appropriately answer the request.\n\n"
    )
    instruction_section = f'### Instruction:\n{data_point["input_text"]}\n\n'
    response_section = f'### Response:\n{data_point["output_text"]}'
    return "".join((preamble, instruction_section, response_section))


# add the "prompt" column in the dataset
# One prompt string is generated per record, in dataset order.
text_column = [generate_prompt(data_point) for data_point in data]
data = data.add_column("prompt", text_column)

You need to tokenize your data so the model can understand.


In [None]:
# Shuffle with a fixed seed, then tokenize the "prompt" column in batches with
# the tokenizer loaded in Step 1.
data = data.shuffle(seed=1234)  # Shuffle dataset here
data = data.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

Split dataset into 90% for training and 10% for testing

In [None]:
# Hold out 10% of the dataset for evaluation. The split is seeded (same seed as
# the shuffle) so that re-running the notebook produces the same train/test split.
data = data.train_test_split(test_size=0.1, seed=1234)
train_data = data["train"]
test_data = data["test"]

In [None]:
# Show a summary of the held-out test split.
print(test_data)

### Apply Lora  
Here comes the magic with peft! Let's load a PeftModel and specify that you are going to use low-rank adapters (LoRA) using get_peft_model utility function and  the prepare_model_for_kbit_training method from PEFT.

In [None]:
from peft import prepare_model_for_kbit_training

# Gradient checkpointing trades compute for memory during training.
model.gradient_checkpointing_enable()  # Discarding intermediate activation values during the forward pass, add computation in backward pass
# PEFT helper that readies the quantized model for training.
# NOTE(review): per the PEFT documentation this freezes the base weights and
# upcasts some layers for stability; confirm for the installed PEFT version.
model = prepare_model_for_kbit_training(model)

In [None]:
# Print the module tree of the prepared model to inspect its layers.
print(model)

Use the following function to find out the linear layers for fine tuning.
QLoRA paper : "We find that the most critical LoRA hyperparameter is how many LoRA adapters are used in total and that LoRA on all linear transformer block layers is required to match full finetuning performance."

In [None]:
import bitsandbytes as bnb


def find_all_linear_names(model):
    """Collects the names of the model's 4-bit linear layers to target with LoRA.

    Every submodule that is an instance of ``bnb.nn.Linear4bit`` contributes the
    last component of its dotted name (e.g. ``"a.b.q_proj"`` -> ``"q_proj"``).
    ``"lm_head"`` is excluded from the result.

    Args:
        model: Object exposing ``named_modules()`` yielding ``(name, module)`` pairs.

    Returns:
        A list of unique layer names, in no particular order.
    """
    linear_cls = bnb.nn.Linear4bit
    # rsplit(".", 1)[-1] yields the whole name when there is no dot, so a
    # separate single-component case is not needed.
    lora_module_names = {
        qualified_name.rsplit(".", 1)[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, linear_cls)
    }
    # Dropped once after the scan instead of being re-checked on every iteration.
    lora_module_names.discard("lm_head")  # needed for 16-bit
    return list(lora_module_names)

In [None]:
# Collect the 4-bit linear layer names; they become the LoRA target_modules.
modules = find_all_linear_names(model)
print(modules)

In [None]:
from peft import LoraConfig, get_peft_model

# PEFT library supports various other PEFT methods such as prefix tuning, P-tuning, and Prompt Tuning. etc.
# Since you are using the LoRA method, you are using the LoraConfig class.

lora_config = LoraConfig(
    r=8,  # dimension of the low-rank matrix
    lora_alpha=32,  # adjusts the magnitude of the combined result (base model output + low-rank adaptation)
    target_modules=modules,  # layer names returned by find_all_linear_names
    lora_dropout=0.05,  # 5% dropout neuron probability of the LoRA layers. To avoid overfitting.
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the prepared model with the LoRA configuration defined above.
model = get_peft_model(model, lora_config)

In [None]:
# Report how many parameters are trainable compared with the total count.
trainable, total = model.get_nb_trainable_parameters()
print(
    f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%"
)

### Run the training!

In [None]:
import wandb
from huggingface_hub import notebook_login

# Log in to HF Hub
notebook_login()

# Log in to Weights & Biases and set the WANDB_PROJECT environment variable.
wandb.login()
%env WANDB_PROJECT=python-fine-tuning

Setting the training arguments:
* for the purposes of this demo, you run it for only a few steps (100), just to showcase how to use this integration with existing tools in the HF ecosystem.

In [None]:
import locale

import transformers

# Force the reported preferred encoding to UTF-8.
# NOTE(review): presumably a workaround for shell (`!`) commands failing when the
# runtime's locale is not UTF-8; confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

# TRL ("Transformer Reinforcement Learning") is the library that provides SFTTrainer.
# Despite the library's name, this notebook uses it for supervised fine-tuning:
# you will use your instruction dataset to fine-tune the model with the SFTTrainer object.

!pip install -q trl

In [None]:
# Some parameters to consider:
# gradient_checkpointing (already enabled on the model). Used to reduce mem by re-computing intermediate activations during backwards instead of storing them all.
# weight decay : to prevent overfitting by adding penalty to loss function

trainingArgs = transformers.TrainingArguments(
    per_device_train_batch_size=3,  # Batch size per GPU
    gradient_accumulation_steps=4,  # Number of update steps to accumulate the gradient for
    # warmup_steps is a whole number of optimizer steps. The previous fractional value
    # (0.03) gave effectively no warmup; 3 steps is 3% of max_steps.
    warmup_steps=3,
    max_steps=100,
    learning_rate=2e-4,
    logging_steps=1,  # Frequency of logging
    output_dir="outputs",  # Model predictions and checkpoints storage
    optim="paged_adamw_8bit",  # optimizer is responsible for computing the gradient statistics for back propagation. Done in 8-bit to save memory.
    report_to="wandb",
    save_strategy="epoch",  # save after every epoch
)

In [None]:
from trl import SFTTrainer

# Reuse the EOS token as the padding token so that batches can be padded
# (presumably the tokenizer does not define a pad token of its own; confirm).
tokenizer.pad_token = tokenizer.eos_token
# Release cached GPU memory before the trainer is created.
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,  # the held-out 10% split
    dataset_text_field="prompt",  # column holding the text built by generate_prompt
    peft_config=lora_config,
    args=trainingArgs,
    data_collator=transformers.DataCollatorForLanguageModeling(
        tokenizer, mlm=False
    ),  # mlm=False: causal language modeling (CLM), not masked language modeling (MLM)
)

# MLM is a training method used in models like BERT, where some tokens in the input sequence are masked,
# and the model learns to predict the masked tokens based on the surrounding context.
# MLM has the advantage of bidirectional context, allowing the model to consider both past and future tokens when making predictions.
# This approach is especially useful for tasks like text classification, sentiment analysis, and named entity recognition.
# The model here is loaded as a causal language model, so MLM is disabled in the collator above.

Start the training

In [None]:
print("Start the supervised fine tuning of Mistral-7B")

# The cache is switched off for training; remember to switch it back on before inference.
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

print("Done Training")

### Push the Lora Adapter and tokenizer to Hugging face Hub (or GCS)

In [None]:
# stop reporting to wandb
wandb.finish()

# save model
trainer.save_model()
print("Model saved")

# push to hub the LORA adapter
# NOTE: replace the repository id below with one you have write access to. The same
# id is read back by the merge job and by the Option 2 inference cell of this notebook.
model.push_to_hub("Thomas-lemoullec/mistral_7b_vertexQandA")
tokenizer.push_to_hub("Thomas-lemoullec/mistral_7b_vertexQandA")

### Merge the LORA adapter to the main model with peft train docker image

In [None]:
merge_job_name = create_name_with_datetime(prefix="mistral-peft-merge")

# The base model to be merged upon. It can be a huggingface model id, or a GCS
# path where the base model was stored.
base_model_dir = "mistralai/Mistral-7B-v0.1"  # @param {type:"string"}
# The previously trained LoRA adapter. Here it is the Hugging Face Hub repository the
# adapter was pushed to; the trailing comment shows an example of a GCS path instead.
finetuned_lora_adapter_dir = (
    "Thomas-lemoullec/mistral_7b_vertexQandA"  # "gs://mistral-lora-weights/outputs"
)


print(finetuned_lora_adapter_dir)

# The GCS path to save the merged model
# The container receives the path with gs:// replaced by /gcs/.
merged_model_output_dir = os.path.join(BUCKET_URI, merge_job_name)
merged_model_output_dir_gcsfuse = merged_model_output_dir.replace("gs://", "/gcs/")

# Merge job hardware: 2 V100 GPUs.
machine_type = "n1-highmem-16"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 2

# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"

# Single worker pool running the PEFT training image with the merge task.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": machine_type,
            "accelerator_type": accelerator_type,
            "accelerator_count": accelerator_count,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": TRAIN_DOCKER_URI,
            "command": [],
            "args": [
                "--task=merge-causal-language-model-lora",
                "--merge_model_precision_mode=float16",
                "--pretrained_model_id=%s" % base_model_dir,
                "--finetuned_lora_model_dir=%s" % finetuned_lora_adapter_dir,
                "--merge_base_and_lora_output_dir=%s" % merged_model_output_dir_gcsfuse,
            ],
        },
    }
]

merge_custom_job = aiplatform.CustomJob(
    display_name=merge_job_name,
    project=PROJECT_ID,
    worker_pool_specs=worker_pool_specs,
    staging_bucket=STAGING_BUCKET,
)

merge_custom_job.run()

print("The merged model is stored at: ", merged_model_output_dir)

## Step 3 - Deploy the finetuned model with vLLM docker image

This section uploads the model to Model Registry and deploys it on the Endpoint.
The model deployment step will take ~15 minutes to complete.


NOTE: vLLM requires a merged model with the base model and the finetuned LoRA adapter. Based on your business need, if you need the base model and the finetuned LoRA weight to be served separately, please consider using the regular Vertex AI serving instead.

In [None]:
# Finds Vertex AI prediction supported accelerators and regions in
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

# Sets V100 to deploy Mistral-7B (2 GPUs; deploy_model_vllm passes the accelerator
# count to vLLM as --tensor-parallel-size).
# V100 serving has better throughput and latency performance than L4 serving.
machine_type = "n1-highmem-8"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 2

# Sets L4 to deploy Mistral-7B
# L4 serving is more cost efficient than V100 serving.
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

# merged_model_output_dir is whichever merged-model path was set most recently
# (by the Option 1 fine-tuning cell or by the Option 2 merge cell).
model_with_peft, endpoint_with_peft = deploy_model_vllm(
    model_name=create_name_with_datetime(prefix="mistral-peft-serve-vllm"),
    model_id=merged_model_output_dir,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
)

print("endpoint_name:", endpoint_with_peft.name)

## Cleaning the memory

In [None]:
import gc

import torch

# clear the VRAM


def memory_stats():
    """Prints the CUDA memory currently allocated and reserved ("cached"), in MiB."""
    print("allocated:")
    print(torch.cuda.memory_allocated() / 1024**2)
    print("cached:")
    # memory_reserved() replaces the deprecated memory_cached().
    print(torch.cuda.memory_reserved() / 1024**2)


# Memory usage before the cleanup.
memory_stats()

# Uncomment the objects that exist in your session to release their references
# before the cache is emptied.
# del trained_model
# del lora_merged_model
# del trainer
# del model
# del tokenizer

# Empty the CUDA cache, run the garbage collector, then report usage again.
torch.cuda.empty_cache()
gc.collect()
memory_stats()

## Step 4 Run inference to evaluate the finetuned model


### **Option 1** Run inference with the Merged finetuned Model (vLLM Vertex endpoint)

In [None]:
# Request sent to the deployed vLLM endpoint.
# NOTE(review): the prompt is sent as-is, without the instruction template that
# get_completion and generate_prompt wrap around the question; confirm this is intended.
instance = {
    "prompt": "What is Model Garden?",
    "n": 1,
    "max_tokens": 250,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 10,
}
response = endpoint_with_peft.predict(instances=[instance])
print(response.predictions[0])

### **Option 2** Run inference with the adapter model hosted in the Hugging Face hub (uploaded in finetuning option 2)

In [None]:
# Install the libraries needed to load the adapter for inference.
# NOTE(review): peft is installed from the main branch here, whereas the earlier
# install cell pins peft==0.6.2; confirm which version is intended.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

Load directly adapters from the Hub using the command below

In [None]:
# Based on your business need, if you need the base model and the finetuned LoRA weight to be served separately

import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub repository holding the LoRA adapter pushed after manual fine-tuning.
peft_model_id = "Thomas-lemoullec/mistral_7b_vertexQandA"
# The adapter config records which base model the adapter was trained on.
config = PeftConfig.from_pretrained(peft_model_id)
# Load the base model in 4-bit onto device 0, the same placement used for the base
# model in Step 1 (`device_map` was previously an undefined name in this cell).
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    load_in_4bit=True,
    device_map={"": 0},
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

You can then directly use the trained model that you have loaded from the 🤗 Hub for inference as you would do it usually in transformers.

In [None]:
# Ask the adapter-augmented model the same question that was put to the base model in Step 1.
result = get_completion(query="What is Model Garden?", model=model, tokenizer=tokenizer)
print(result)