<a href="https://colab.research.google.com/github/Lednik7/CLIP-ONNX/blob/dev/examples/dev/clip_onnx_benchmark_gpu_K80.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Restart colab session after installation
Reload the session if something doesn't work

In [2]:
%%capture
!pip install git+https://github.com/Lednik7/CLIP-ONNX.git@dev
!pip install git+https://github.com/openai/CLIP.git
!pip install onnxruntime-gpu

In [3]:
%%capture
!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true

In [1]:
!nvidia-smi

Tue May  3 07:20:58 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import onnxruntime
print(onnxruntime.get_device())

GPU


## GPU inference mode
Select a runtime GPU to continue:

Click Runtime -> Change Runtime Type -> switch "Harware accelerator" to be GPU. Save it, and you maybe connect to GPU

### Torch CLIP

In [3]:
import clip
from PIL import Image
import numpy as np

# onnx cannot work with cuda
model, preprocess = clip.load("ViT-B/32", device="cpu", jit=False)

# batch first
image = preprocess(Image.open("CLIP.png")).unsqueeze(0)  # [1, 3, 224, 224]
image_onnx = image.detach().cpu().numpy().astype(np.float32)

# batch first
text = clip.tokenize(["a diagram", "a dog", "a cat"]) # [3, 77]
text_onnx = text.detach().cpu().numpy().astype(np.int32)

### CLIP-ONNX

In [5]:
from clip_onnx import clip_onnx
from clip_onnx.utils import DEFAULT_EXPORT

DEFAULT_EXPORT["opset_version"] = 15

onnx_model = clip_onnx(model)
onnx_model.convert2onnx(image, text, verbose=True)
# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
onnx_model.start_sessions(providers=["CPUExecutionProvider"]) # GPU mode

[CLIP ONNX] Start convert visual model




[CLIP ONNX] Start check visual model
[CLIP ONNX] Start convert textual model


  "If indices include negative values, the exported graph will produce incorrect results.")


[CLIP ONNX] Start check textual model
[CLIP ONNX] Models converts successfully


In [6]:
onnx_model = clip_onnx(model)
onnx_model.load_onnx("/content/clip_visual.onnx",
                     "/content/clip_textual.onnx",
                     model.logit_scale.exp())
onnx_model.start_sessions(providers=["CUDAExecutionProvider"]) # GPU mode

In [7]:
onnx_model.visual_session.get_providers()

['CUDAExecutionProvider', 'CPUExecutionProvider']

## Benchmark

In [8]:
model, preprocess = clip.load("ViT-B/32", device="cuda", jit=False)

In [9]:
model.eval()
for x in model.parameters():
    x.requires_grad = False

In [10]:
import numpy, random, torch

In [11]:
def set_seed():
    torch.manual_seed(12)
    torch.cuda.manual_seed(12)
    np.random.seed(12)
    random.seed(12)

    torch.backends.cudnn.deterministic=True

In [12]:
import torch
import time

n = 5
clip_results = {"encode_image": [],
                "encode_text": []}
onnx_results = {"encode_image": [],
                "encode_text": []}
for batch in [2, 8, 16, 32, 64]:
    set_seed()
    t_mean = []
    for _ in range(n):
        image_input = torch.randint(1, 255, (batch, 3, 224, 224))
        image_input_onnx = image_input.detach().cpu().numpy().astype(np.float32)
        t = time.time()
        onnx_model.encode_image(image_input_onnx)
        t_mean.append(time.time() - t)
    print("onnx", batch, "encode_image", round(sum(t_mean) / n, 3))
    torch.cuda.empty_cache()
    onnx_results["encode_image"].append([batch, round(sum(t_mean) / n, 3)])

    set_seed()
    with torch.inference_mode():
        t_mean = []
        for _ in range(n):
            image_input = torch.randint(1, 255, (batch, 3, 224, 224)).cuda()
            t = time.time()
            model.encode_image(image_input)
            t_mean.append(time.time() - t)
    print("torch", batch, "encode_image", round(sum(t_mean) / n, 3))
    torch.cuda.empty_cache()
    clip_results["encode_image"].append([batch, round(sum(t_mean) / n, 3)])

    set_seed()
    t_mean = []
    for _ in range(n):
        text_input = torch.randint(320, 49407, (batch, 77))
        text_input_onnx = text_input.detach().cpu().numpy().astype(np.int32)
        t = time.time()
        onnx_model.encode_text(text_input_onnx)
        t_mean.append(time.time() - t)
    print("onnx", batch, "encode_text", round(sum(t_mean) / n, 3))
    torch.cuda.empty_cache()
    onnx_results["encode_text"].append([batch, round(sum(t_mean) / n, 3)])

    set_seed()
    with torch.inference_mode():
        t_mean = []
        for _ in range(n):
            text_input = torch.randint(320, 49407, (batch, 77)).cuda()
            t = time.time()
            model.encode_text(text_input)
            t_mean.append(time.time() - t)
    print("torch", batch, "encode_text", round(sum(t_mean) / n, 3))
    torch.cuda.empty_cache()
    clip_results["encode_text"].append([batch, round(sum(t_mean) / n, 3)])

    print("-" * 78)

onnx 2 encode_image 0.136
torch 2 encode_image 0.02
onnx 2 encode_text 0.021
torch 2 encode_text 0.035
------------------------------------------------------------------------------
onnx 8 encode_image 0.054
torch 8 encode_image 0.081
onnx 8 encode_text 0.04
torch 8 encode_text 0.098
------------------------------------------------------------------------------
onnx 16 encode_image 0.089
torch 16 encode_image 0.207
onnx 16 encode_text 0.071
torch 16 encode_text 0.196
------------------------------------------------------------------------------
onnx 32 encode_image 0.158
torch 32 encode_image 0.44
onnx 32 encode_text 0.134
torch 32 encode_text 0.374
------------------------------------------------------------------------------
onnx 64 encode_image 0.325
torch 64 encode_image 0.919
onnx 64 encode_text 0.258
torch 64 encode_text 0.719
------------------------------------------------------------------------------


In [13]:
import pandas as pd

In [14]:
pd.DataFrame({"backend": ["onnx", "torch"] * 5,
              "batch": [2, 2, 8, 8, 16, 16, 32, 32, 64, 64],
              "encode_image": [j[1] for i in zip(onnx_results["encode_image"],
                                              clip_results["encode_image"]) for j in i],
              "encode_text": [j[1] for i in zip(onnx_results["encode_text"],
                                              clip_results["encode_text"]) for j in i]})

Unnamed: 0,backend,batch,encode_image,encode_text
0,onnx,2,0.136,0.021
1,torch,2,0.02,0.035
2,onnx,8,0.054,0.04
3,torch,8,0.081,0.098
4,onnx,16,0.089,0.071
5,torch,16,0.207,0.196
6,onnx,32,0.158,0.134
7,torch,32,0.44,0.374
8,onnx,64,0.325,0.258
9,torch,64,0.919,0.719


In [15]:
onnx_df = pd.DataFrame({"ONNX": ["ViT-B/32"] * 5,
              "batch": [2, 8, 16, 32, 64],
              "encode_image": [i[1] for i in onnx_results["encode_image"]],
              "encode_text": [i[1] for i in onnx_results["encode_text"]]})
onnx_df["total"] = onnx_df["encode_image"] + onnx_df["encode_text"]

In [16]:
onnx_df

Unnamed: 0,ONNX,batch,encode_image,encode_text,total
0,ViT-B/32,2,0.136,0.021,0.157
1,ViT-B/32,8,0.054,0.04,0.094
2,ViT-B/32,16,0.089,0.071,0.16
3,ViT-B/32,32,0.158,0.134,0.292
4,ViT-B/32,64,0.325,0.258,0.583


In [17]:
print(onnx_df.to_markdown(index=False))

| ONNX     |   batch |   encode_image |   encode_text |   total |
|:---------|--------:|---------------:|--------------:|--------:|
| ViT-B/32 |       2 |          0.136 |         0.021 |   0.157 |
| ViT-B/32 |       8 |          0.054 |         0.04  |   0.094 |
| ViT-B/32 |      16 |          0.089 |         0.071 |   0.16  |
| ViT-B/32 |      32 |          0.158 |         0.134 |   0.292 |
| ViT-B/32 |      64 |          0.325 |         0.258 |   0.583 |


In [18]:
clip_df = pd.DataFrame({"TORCH": ["ViT-B/32"] * 5,
              "batch": [2, 8, 16, 32, 64],
              "encode_image": [i[1] for i in clip_results["encode_image"]],
              "encode_text": [i[1] for i in clip_results["encode_text"]]})
clip_df["total"] = clip_df["encode_image"] + clip_df["encode_text"]

In [19]:
print(clip_df.to_markdown(index=False))

| TORCH    |   batch |   encode_image |   encode_text |   total |
|:---------|--------:|---------------:|--------------:|--------:|
| ViT-B/32 |       2 |          0.02  |         0.035 |   0.055 |
| ViT-B/32 |       8 |          0.081 |         0.098 |   0.179 |
| ViT-B/32 |      16 |          0.207 |         0.196 |   0.403 |
| ViT-B/32 |      32 |          0.44  |         0.374 |   0.814 |
| ViT-B/32 |      64 |          0.919 |         0.719 |   1.638 |
