In [1]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serving a Stable Diffusion model on Vertex AI

## Install dependencies

In [2]:
%%writefile requirements.txt
# Pinned to the releases this notebook was last executed with, so that a
# fresh "Restart & Run All" installs the same TorchServe tooling.
torchserve==0.7.0
torch-model-archiver==0.7.0
torch-workflow-archiver==0.2.6
# The version used for the Vertex AI SDK was not recorded; pin it once known.
google-cloud-aiplatform

Overwriting requirements.txt


In [3]:
# Install the packages listed in requirements.txt; %pip (rather than !pip)
# targets the environment of the running kernel.
%pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting torchserve
  Downloading torchserve-0.7.0-py3-none-any.whl (19.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting torch-model-archiver
  Downloading torch_model_archiver-0.7.0-py3-none-any.whl (14 kB)
Collecting torch-workflow-archiver
  Downloading torch_workflow_archiver-0.2.6-py3-none-any.whl (12 kB)
Collecting future
  Downloading future-0.18.3.tar.gz (840 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.9/840.9 kB[0m [31m184.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting enum-compat
  Downloading enum_compat-0.0.3-py3-none-any.whl (1.3 kB)
Collecting packaging
  Downloading packaging-21.3-py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 kB[0m [31m140.0 MB/s[0m eta [36

In [4]:
# Restart the kernel after the install step so that the freshly installed
# packages are picked up. Skipped when the IS_TESTING environment variable
# is set to a non-empty value.
import os

running_under_test = bool(os.getenv("IS_TESTING"))

if not running_under_test:
    import IPython

    # do_shutdown(True) asks the kernel to shut down and restart.
    IPython.Application.instance().kernel.do_shutdown(True)

In [1]:
# -p makes the cell idempotent: no error if the directory already exists
# from a previous run.
!mkdir -p model_artifacts

## Create the custom TorchServe handler

In [2]:
%%writefile model_artifacts/handler.py

"""Customized handler for stable diffusion 2."""
import base64
import logging
from io import BytesIO

import torch
from diffusers import EulerDiscreteScheduler
from diffusers import StableDiffusionPipeline
from ts.torch_handler.base_handler import BaseHandler

# Module-level logger; preprocess() uses it to log the incoming requests.
logger = logging.getLogger(__name__)
# Identifier handed to from_pretrained() for both the scheduler and the pipeline.
# NOTE(review): presumably a Hugging Face Hub repo id, meaning the base model is
# fetched at load time rather than any weights uploaded next to the archive — confirm.
model_id = 'stabilityai/stable-diffusion-2'


class ModelHandler(BaseHandler):
  """TorchServe handler that serves a Stable Diffusion text-to-image pipeline.

  The pipeline is built once in `initialize`; each request is expected to be a
  mapping with a 'prompt' key, and each generated image is returned as a
  base64-encoded JPEG string.
  """

  def __init__(self):
    # Let the base handler set up its own bookkeeping attributes first.
    super().__init__()
    self.initialized = False
    self.map_location = None
    self.device = None
    self.use_gpu = True
    self.store_avg = True
    self.pipe = None

  def initialize(self, context):
    """Builds the diffusion pipeline and moves it to the selected device.

    Args:
      context: TorchServe context; `context.system_properties` is read for
        the 'gpu_id' assigned to this worker.
    """
    properties = context.system_properties
    gpu_id = properties.get('gpu_id')

    if torch.cuda.is_available():
      self.map_location = 'cuda'
      # 'gpu_id' may be missing; fall back to the default CUDA device instead
      # of building the invalid device string 'cuda:None'.
      self.device = torch.device(
          'cuda' if gpu_id is None else 'cuda:' + str(gpu_id))
      self.use_gpu = True
    else:
      self.map_location = 'cpu'
      self.device = torch.device('cpu')
      self.use_gpu = False

    # Half precision is only used on GPU; fall back to float32 on CPU.
    torch_dtype = torch.float16 if self.use_gpu else torch.float32

    # Use the Euler scheduler here instead
    scheduler = EulerDiscreteScheduler.from_pretrained(model_id,
                                                       subfolder='scheduler')
    pipe = StableDiffusionPipeline.from_pretrained(model_id,
                                                   scheduler=scheduler,
                                                   torch_dtype=torch_dtype)
    # Honour the device chosen above rather than hard-coding 'cuda'.
    pipe = pipe.to(self.device)
    # Uncomment the following line to reduce the GPU memory usage.
    # pipe.enable_attention_slicing()
    self.pipe = pipe

    self.initialized = True

  def preprocess(self, requests):
    """Nothing to do here: logs the requests and returns them unchanged."""
    logger.info('requests: %s', requests)
    return requests

  def inference(self, preprocessed_data, *args, **kwargs):
    """Runs the pipeline once per request.

    Args:
      preprocessed_data: iterable of mappings, each with a 'prompt' entry.

    Returns:
      A flat list with the images produced for all prompts, in request order.
    """
    images = []
    for request in preprocessed_data:
      prompt = request['prompt']
      images.extend(self.pipe(prompt).images)
    return images

  def postprocess(self, output_batch):
    """Converts the images to base64-encoded JPEG strings.

    Args:
      output_batch: iterable of image objects exposing `save(fp, format=...)`.

    Returns:
      A list of UTF-8 strings, one base64-encoded JPEG per input image.
    """
    postprocessed_data = []
    for image in output_batch:
      # The context manager releases the buffer even if encoding fails.
      with BytesIO() as buffer:
        image.save(buffer, format='JPEG')
        postprocessed_data.append(
            base64.b64encode(buffer.getvalue()).decode('utf-8'))
    return postprocessed_data


Writing model_artifacts/handler.py


## Create TorchServe model archive file

In [3]:
# Package the custom handler into model_artifacts/model.mar, overwriting any
# archive left over from a previous run.
!torch-model-archiver \
  --model-name model \
  --version 1.0 \
  --handler model_artifacts/handler.py \
  --export-path model_artifacts \
  --force

In [4]:
# List the directory to confirm that the archiver produced model.mar.
!ls model_artifacts

handler.py  model.mar


In [15]:
# Cloud Storage settings — change these to your own bucket and region.
BUCKET_NAME = "speech-erschmid"
# Root of the bucket, with a trailing slash.
GCS_PATH = "gs://" + BUCKET_NAME + "/"
# Location of the uploaded model artifacts inside the bucket.
FULL_GCS_PATH = GCS_PATH + "model_artifacts"
# Region passed to the Vertex AI SDK.
LOCATION = "us-west1"

In [6]:
# Recursively copy the local model_artifacts directory (handler.py and
# model.mar) to the bucket root given by GCS_PATH.
!gsutil cp -r model_artifacts $GCS_PATH

Copying file://model_artifacts/handler.py [Content-Type=text/x-python]...
Copying file://model_artifacts/model.mar [Content-Type=application/octet-stream]...
/ [2 files][  3.3 KiB/  3.3 KiB]                                                
Operation completed over 2 objects/3.3 KiB.                                      


**Note:** You also need to upload the model that you trained with Dreambooth to the same Google Cloud Storage location as the model artifacts.

In [16]:
# Recursively copy the local Dreambooth output directory to FULL_GCS_PATH.
# NOTE(review): the source is a hard-coded absolute path — change it to
# wherever your trained weights are stored.
!gsutil cp -r /home/jupyter/stable_diffusion_weights/output $FULL_GCS_PATH

Copying file:///home/jupyter/stable_diffusion_weights/output/model_index.json [Content-Type=application/json]...
Copying file:///home/jupyter/stable_diffusion_weights/output/model-dog.ckpt [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Copying file:///home/jupyter/stable_diffusion_weights/output/args.json [Content-Ty

In [23]:
# Google Cloud project that will own the model and endpoint.
PROJECT_ID = "video-erschmid"  # <---CHANGE THIS TO YOUR PROJECT
# Serving container image used when uploading the model.
CUSTOM_PREDICTOR_IMAGE_URI = (
    "us-docker.pkg.dev/vertex-ai/prediction/pytorch-gpu.1-12:latest"
)
# Prefix used to derive resource display names.
APP_NAME = "my-stable-diffusion"
print("CUSTOM_PREDICTOR_IMAGE_URI = " + CUSTOM_PREDICTOR_IMAGE_URI)

CUSTOM_PREDICTOR_IMAGE_URI = us-docker.pkg.dev/vertex-ai/prediction/pytorch-gpu.1-12:latest


## Deploy to Vertex AI endpoint

In [18]:
from google.cloud import aiplatform

# Initialise the Vertex AI SDK for this project and region. The staging
# bucket is passed as a gs:// URI, as the SDK expects, not a bare bucket name.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=f"gs://{BUCKET_NAME}",
)

In [19]:
# Naming metadata for the model registered in Vertex AI.
VERSION = 1
model_display_name = "stable_diffusion_2"
model_description = f"{model_display_name} container"

In [22]:
# Register the artifacts stored under FULL_GCS_PATH as a Vertex AI model that
# is served with the chosen container image.
model = aiplatform.Model.upload(
    artifact_uri=FULL_GCS_PATH,
    serving_container_image_uri=CUSTOM_PREDICTOR_IMAGE_URI,
    display_name=model_display_name,
    description=model_description,
)

# Block until the upload operation has finished.
model.wait()

# Show the identifiers of the newly created model.
for model_attribute in (model.display_name, model.resource_name):
    print(model_attribute)

Creating Model
Create Model backing LRO: projects/147301782967/locations/us-west1/models/7406503838745624576/operations/3441447205683068928
Model created. Resource name: projects/147301782967/locations/us-west1/models/7406503838745624576@1
To use this Model in another session:
model = aiplatform.Model('projects/147301782967/locations/us-west1/models/7406503838745624576@1')
stable_diffusion_2
projects/147301782967/locations/us-west1/models/7406503838745624576


In [24]:
# Create the endpoint the model will be deployed to; its display name is
# derived from APP_NAME.
endpoint_display_name = APP_NAME + "-endpoint"
endpoint = aiplatform.Endpoint.create(
    display_name=endpoint_display_name,
)

Creating Endpoint
Create Endpoint backing LRO: projects/147301782967/locations/us-west1/endpoints/7728229737125904384/operations/7159731648030834688
Endpoint created. Resource name: projects/147301782967/locations/us-west1/endpoints/7728229737125904384
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/147301782967/locations/us-west1/endpoints/7728229737125904384')


In [25]:
# Deployment settings.
traffic_percentage = 100
machine_type = "n1-standard-4"
accelerator_type = "NVIDIA_TESLA_T4"
accelerator_count = 1
deployed_model_display_name = model_display_name
min_replica_count = 1
max_replica_count = 1
sync = True

# Deploy the uploaded model to the endpoint. The replica counts are now
# forwarded explicitly; previously they were defined but never passed, so
# editing them had no effect on the deployment.
model.deploy(
    endpoint=endpoint,
    deployed_model_display_name=deployed_model_display_name,
    machine_type=machine_type,
    min_replica_count=min_replica_count,
    max_replica_count=max_replica_count,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    traffic_percentage=traffic_percentage,
    sync=sync,
)

Deploying model to Endpoint : projects/147301782967/locations/us-west1/endpoints/7728229737125904384
Deploy Endpoint model backing LRO: projects/147301782967/locations/us-west1/endpoints/7728229737125904384/operations/9088961138405670912


KeyboardInterrupt: 

## Getting a new image from the endpoint

In [26]:
# Send a single text prompt to the deployed endpoint; each instance is a
# dict with a "prompt" key.
prompt_text = "A dog with a baseball jersey."
instances = [{"prompt": prompt_text}]
response = endpoint.predict(instances=instances)

In [36]:
import base64
import json
from IPython import display

# The first prediction is a base64 string; decode it to raw image bytes.
image_bytes = base64.b64decode(response.predictions[0])

# Write the bytes to disk so the image can be rendered from the file.
with open('img5.jpg', 'wb') as image_file:
    image_file.write(image_bytes)

display.Image('img5.jpg')