In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Deepspeed-chat

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_deepspeed_chat.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_deepspeed_chat.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_pytorch_deepspeed_chat.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
Open in Vertex AI Workbench
    </a>
    (a Python-3 CPU notebook is recommended)
  </td>
</table>

## Overview

This notebook demonstrates training [DeepSpeed-Chat](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat) with [OPT model](https://github.com/facebookresearch/metaseq) using [RLHF](https://arxiv.org/abs/2203.02155) and deploying it on Vertex AI for online prediction.

### Objective

- Train DeepSpeed-Chat with the three steps of RLHF using [Vertex AI custom training](https://cloud.google.com/vertex-ai/docs/training/overview) to generate a ChatGPT-like model.
- Upload the model to [Vertex AI Model Registry](https://cloud.google.com/vertex-ai/docs/model-registry/introduction).
- Deploy the model to a [Vertex AI Endpoint resource](https://cloud.google.com/vertex-ai/docs/predictions/using-private-endpoints).
- Run online predictions to serve the chatbot model.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Setup environment

**NOTE**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

### Colab only

In [None]:
!pip3 install --upgrade google-cloud-aiplatform

In [None]:
# Colab only: the `google.colab` package is not available in other runtimes.
# Authenticates the current notebook user.
from google.colab import auth as google_auth

google_auth.authenticate_user()

### Setup Google Cloud project

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Vertex AI API and Compute Engine API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component).

1. [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs.

1. [Create a service account](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console) with `Vertex AI User` and `Storage Object Admin` roles for deploying fine tuned model to Vertex AI endpoint.

Fill in the following variables for the experiment environment:

In [None]:
# Google Cloud project ID used for all Vertex AI calls in this notebook.
PROJECT_ID = ""  # @param {type:"string"}

# The region to launch training jobs and create endpoints in.
REGION = "us-central1"  # @param {type:"string"}

# The Cloud Storage bucket for storing experiment outputs. Fill it in WITHOUT
# the 'gs://' prefix: later cells build both "/gcs/<bucket>/..." and
# "gs://<bucket>/..." paths from this bare bucket name.
GCS_BUCKET = ""  # @param {type:"string"}

# The service account used when deploying the fine-tuned model to an endpoint.
SERVICE_ACCOUNT = ""  # @param {type:"string"}

Initialize Vertex-AI API:

In [None]:
from google.cloud import aiplatform

# Configure the SDK with the project, region and staging bucket chosen above.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=GCS_BUCKET)

### Define constants

In [None]:
# TODO(b/315894399): The train docker has been removed. If this notebook is
# going to be published, go through OSS legal review, and push the docker image
# to vertex-ai.

# The pre-built training docker image. It contains training scripts and models.
# NOTE: this is currently empty (see the TODO above). The training cells pass it
# as `container_uri`, so it must be set to a valid image before they are run.
TRAIN_DOCKER_URI = ""

# The pre-built serving docker image. It contains serving scripts and models.
# Used by `deploy_model` as the serving container image.
SERVE_DOCKER_URI = "us-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/pytorch-deepspeed-chat-serve"

### Define common functions

In [None]:
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=GCS_BUCKET)

import os
import re
from datetime import datetime


def create_job_name(prefix):
    """Build a timestamped display name for a training job.

    Args:
        prefix: Short label identifying the kind of job, e.g. "deepspeed-chat".

    Returns:
        A string of the form "<user>-<prefix>-<YYYYmmdd_HHMMSS>", where <user>
        is the value of the USER environment variable, or "unknown" when that
        variable is unset or empty.
    """
    # USER is not guaranteed to be set; without a fallback the name would
    # start with the literal text "None".
    user = os.environ.get("USER") or "unknown"
    now = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{user}-{prefix}-{now}"


def deploy_model(model_id, task):
    """Create an endpoint, upload the serving model and deploy it.

    Args:
        model_id: Value passed to the serving container as the MODEL_ID
            environment variable.
        task: Value passed to the serving container as the TASK environment
            variable; also embedded in the endpoint display name.

    Returns:
        A `(model, endpoint)` tuple holding the uploaded model and the endpoint
        it was deployed to.
    """
    display_name = "deepspeed-chat"

    # The endpoint is created before the model is uploaded.
    endpoint = aiplatform.Endpoint.create(
        display_name=f"{display_name}-{task}-endpoint"
    )

    model = aiplatform.Model.upload(
        display_name=display_name,
        serving_container_image_uri=SERVE_DOCKER_URI,
        serving_container_ports=[7080],
        serving_container_predict_route="/predictions/deepspeed_chat_serving",
        serving_container_health_route="/ping",
        # Environment variables handed to the serving container.
        serving_container_environment_variables=dict(
            MODEL_ID=model_id,
            TASK=task,
            DEPLOY_SOURCE="notebook",
        ),
    )

    # One V100 on an n1-standard-8 host, deployed under the notebook's
    # configured service account.
    deploy_options = dict(
        endpoint=endpoint,
        machine_type="n1-standard-8",
        accelerator_type="NVIDIA_TESLA_V100",
        accelerator_count=1,
        deploy_request_timeout=1800,
        service_account=SERVICE_ACCOUNT,
    )
    model.deploy(**deploy_options)

    return model, endpoint


class Chatbot:
    """Multi-turn chat helper on top of a prediction endpoint.

    The object keeps the text of the conversation so far and prepends it to
    every new prompt, so the model sees the earlier turns.

    Attributes are plain instance state:
      * endpoint: object exposing `predict(instances=...)` whose result has a
        `predictions` list.
      * last_response: conversation text carried into the next prompt.
      * num_rounds: number of turns sent since the context was last cleared.
    """

    def __init__(self, endpoint):
        """Bind the helper to `endpoint` and start with an empty context."""
        self.endpoint = endpoint
        self.clear_context()

    def clear_context(self):
        """Forget the conversation so far."""
        self.last_response = ""
        self.num_rounds = 0

    def send_request(self, prompt):
        """Send `prompt` to the endpoint and return the first prediction."""
        instances = [{"prompt": prompt}]
        # Use the endpoint bound to this instance rather than whatever global
        # happens to be named `endpoint` in the notebook.
        return self.endpoint.predict(instances=instances).predictions[0]

    def talk(self, prompt, with_context=True):
        """Send one user turn and return the (possibly truncated) response.

        Args:
            prompt: The user's message, without the "Human: " prefix.
            with_context: Pass `False` to clear the stored conversation before
                sending this turn.

        Returns:
            The endpoint's response text, cut off just before any "Human: "
            marker beyond the turns actually sent.
        """
        if with_context is False:
            self.clear_context()

        prompt = self.last_response + f"Human: {prompt}\n Assistant: "
        response = self.send_request(prompt)
        self.num_rounds += 1

        # NOTE(review): this assumes the response echoes the prompt, so it
        # contains one "Human: " marker per turn sent so far - confirm against
        # the serving container. Any further marker is treated as the model
        # inventing the next question, and the text is truncated there.
        question_poses = [m.start() for m in re.finditer("Human: ", response)]
        if len(question_poses) > self.num_rounds:
            response = response[: question_poses[self.num_rounds]]

        self.last_response = response + "\n\n"
        return response

## Fine tune model

This section trains DeepSpeed-Chat with all three RLHF steps:

**Step-1: Supervised fine tuning**

This step fine tunes the actor model through supervised training. It is very similar to standard language model fine-tuning on causal language tasks. The main difference lies in the data: SFT uses high-quality query-answer pairs to fine tune the model for human-preferred generation. See [here](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/README.md) for the detailed settings of this step.

**Step-2: Reward model fine tuning**

This step is similar to Step-1 Supervised Fine-Tuning (SFT) finetuning. However, there are several key differences between RM and SFT finetuning, like training data difference, training objective difference, etc. See [here](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/README.md) for the detailed settings of this step.

**Step-3: Reinforcement Learning with Human Feedback**

This step uses RLHF to further fine tune the actor model. See [here](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/README.md) for the detailed settings of this step.

### Run three steps with default settings

This example runs the official [train.py](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/train.py) script to launch the job. It takes [facebook/opt-1.3b](https://huggingface.co/facebook/opt-1.3b) as the actor model and [facebook/opt-350m](https://huggingface.co/facebook/opt-350m) as the reward model. It completes the three RLHF steps on a single A100-40GB GPU in 7 hours. The training datasets are:
- Dahoas/rm-static
- Dahoas/full-hh-rlhf
- Dahoas/synthetic-instruct-gptj-pairwise
- yitingxie/rlhf-reward-datasets
- openai/webgpt_comparisons
- stanfordnlp/SHP

**NOTE:** The [official example](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/README.md#-one-single-script-completes-all-three-steps-of-rlhf-training-and-generate-your-first-chatgpt-model) runs the single GPU experiment on an A6000-48GB GPU. This notebook uses an A100-40GB GPU instead, which requires enabling `--gradient_checkpointing` to avoid CUDA OOM, so it takes longer to complete the three-step RLHF training.

In [None]:
# Hardware for the job: one A2 machine with a single A100 GPU.
machine_type = "a2-highgpu-1g"
gpu_type = "NVIDIA_TESLA_A100"
num_gpus = 1

job_name = create_job_name("deepspeed-chat")
# Output location under GCS_BUCKET, addressed through the /gcs/ path prefix
# (presumably the Cloud Storage mount inside the training container).
output_dir = f"/gcs/{GCS_BUCKET}/deepspeed-chat"

# The container is started with `python` as its command; the script name and
# its flags are supplied through `args` in `job.run` below.
job = aiplatform.CustomContainerTrainingJob(
    display_name=job_name, container_uri=TRAIN_DOCKER_URI, command=["python"]
)

# Launch train.py with the 1.3b actor model and the 350m reward model on a
# single node.
job.run(
    args=[
        "train.py",
        f"--output-dir={output_dir}",
        "--actor-model=facebook/opt-1.3b",
        "--reward-model=facebook/opt-350m",
        "--deployment-type=single_node",
    ],
    boot_disk_size_gb=600,
    replica_count=1,
    machine_type=machine_type,
    accelerator_type=gpu_type,
    accelerator_count=num_gpus,
)

### Run any steps with custom settings

If you want to launch the job with any one or multiple steps with custom arguments, use the `run.py` script we provide. The following example runs only step-1 and step-3 with custom arguments. See [step-1](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py), [step-2](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py), [step-3](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py) runner for supported arguments respectively.

In [None]:
# Hardware for the job: the GPU count must match the machine shape, and the
# A2 shape that carries 8 A100 GPUs is a2-highgpu-8g (a2-highgpu-1g has one).
machine_type = "a2-highgpu-8g"
gpu_type = "NVIDIA_TESLA_A100"
num_gpus = 8

job_name = create_job_name("deepspeed-chat")
# Output location under GCS_BUCKET, addressed through the /gcs/ path prefix.
output_dir = f"/gcs/{GCS_BUCKET}/deepspeed-chat"

# The container is started with `python` as its command; the script name and
# its arguments are supplied through `args` in `job.run` below.
job = aiplatform.CustomContainerTrainingJob(
    display_name=job_name, container_uri=TRAIN_DOCKER_URI, command=["python"]
)

# Run only step-1 and step-3 through run.py. Each "--stepN_args" marker is
# followed by that step's own arguments, one list element per argument.
job.run(
    args=[
        "run.py",
        "--step1_args",
        "data_path='Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets'",
        "data_split=2,4,4",
        "model_name_or_path=facebook/opt-1.3b",
        "per_device_train_batch_size=4",
        "per_device_eval_batch_size=4",
        "num_train_epochs=1",
        "zero_stage=2",
        "deepspeed",
        f"output_dir={output_dir}/step-1",
        "--step3_args",
        "data_path=Dahoas/rm-static",
        "data_split=2,4,4",
        # Step-3 starts from the actor produced by step-1 above. The trailing
        # comma matters: without it Python concatenates this string with the
        # next one into a single malformed argument.
        f"actor_model_name_or_path={output_dir}/step-1",
        "critic_model_name_or_path=facebook/opt-350m",
        "num_padding_at_beginning=1",
        "per_device_train_batch_size=4",
        "per_device_mini_train_batch_size=4",
        "generation_batch_numbers=1",
        "ppo_epochs=1",
        "num_train_epochs=1",
        "deepspeed",
        "actor_zero_stage=2",
        "critic_zero_stage=2",
        "enable_ema",
        f"output_dir={output_dir}/step-3",
    ],
    boot_disk_size_gb=600,
    replica_count=1,
    machine_type=machine_type,
    accelerator_type=gpu_type,
    accelerator_count=num_gpus,
)

## Upload and Deploy models

This section uploads the fine-tuned model to Model Registry and deploys it on the Endpoint with one V100 GPU.

The model deployment step will take ~20 minutes to complete.

NOTE: The model weights will be downloaded after the deployment succeeds. Thus an additional 5 minutes of waiting time is needed **after** the above model deployment step succeeds and before you run the next step below. Otherwise you might see a `ServiceUnavailable: 503 502:Bad Gateway` error when you send requests to the endpoint.

In [None]:
# Set the model_id to "facebook/opt-1.3b" to load the OSS pre-trained model.
model, endpoint = deploy_model(
    model_id=f"gs://{GCS_BUCKET}/deepspeed-chat/actor-models/1.3b", task="chatbot"
)

Once deployment succeeds, you can send requests to the endpoint with text prompts. The model will generate text like a chatbot.

Example:
```
Human: What is a car?
 Assistant:  A car is a vehicle that is used for transportation.  It can be a vehicle that is used for driving, or it can be a vehicle that is used for riding.  It can also be a vehicle that is used for carrying passengers.

Human: Are you a car?
 Assistant:  I am a human.
```

In [None]:
# Wrap the deployed endpoint in the conversation helper; the context starts empty.
chatbot = Chatbot(endpoint)

In [None]:
# First turn of the conversation.
print(chatbot.talk("What is a car?"))

In [None]:
# Follow-up turn: the previous exchange is prepended to this prompt.
print(chatbot.talk("Are you a car?"))

The Chatbot class implements the conversation context mechanism for you. It will remember the conversation context by default when you call the `talk()` method. If you want to start a new conversation, you can set the second argument of `talk()` to `False`, which tells it to clear the context.

In [None]:
# Passing False as `with_context` clears the stored conversation before this turn is sent.
print(chatbot.talk("What is your name?", False))

## Undeploy model and clean up resource

In [None]:
# Undeploy model and delete endpoint.
endpoint.delete(force=True)

# Delete models.
model.delete()