<a href="https://colab.research.google.com/github/Lednik7/CLIP-ONNX/blob/dev/examples/dev/clip_onnx_benchmark_cpu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Restart the Colab session after installation
Restart the session if something doesn't work.

In [1]:
%%capture
# Install the dev branch of CLIP-ONNX, OpenAI CLIP and the GPU build of ONNX Runtime.
# %%capture suppresses the pip output of this cell.
# NOTE(review): the GPU build is installed although the sessions in this notebook
# are started with CPUExecutionProvider — confirm whether plain `onnxruntime` was intended.
!pip install git+https://github.com/Lednik7/CLIP-ONNX.git@dev
!pip install git+https://github.com/openai/CLIP.git
!pip install onnxruntime-gpu

In [1]:
%%capture
# Download the CLIP diagram from the OpenAI repository and save it as CLIP.png
# (-c resumes a partial download, -O sets the output file name).
!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true

In [2]:
# Show the NVIDIA driver and GPU status of the current runtime.
!nvidia-smi

Tue May  3 06:56:57 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import onnxruntime

# Report the device name that the installed onnxruntime build returns.
ort_device = onnxruntime.get_device()
print(ort_device)

GPU


## GPU inference mode
Select a runtime GPU to continue:

Click Runtime -> Change runtime type -> set "Hardware accelerator" to GPU. Save it, and you may be connected to a GPU.

### Torch CLIP

In [4]:
import clip
from PIL import Image
import numpy as np

# Keep the torch model on CPU: per the original author's note, the ONNX
# conversion does not work with a CUDA model.
model, preprocess = clip.load("ViT-B/32", device="cpu", jit=False)

# Image input with the batch dimension first: [1, 3, 224, 224]
image = preprocess(Image.open("CLIP.png"))
image = image.unsqueeze(0)
# float32 NumPy copy of the image tensor for the ONNX model
image_onnx = image.detach().cpu().numpy().astype(np.float32)

# Text input with the batch dimension first: [3, 77]
prompts = ["a diagram", "a dog", "a cat"]
text = clip.tokenize(prompts)
# int32 NumPy copy of the token tensor for the ONNX model
text_onnx = text.detach().cpu().numpy().astype(np.int32)

### CLIP-ONNX

In [6]:
from clip_onnx import clip_onnx

# Wrap the torch model and convert it to ONNX, passing the sample image and
# text tensors; the log printed by this cell reports that the visual and the
# textual model are converted and checked.
onnx_model = clip_onnx(model)
onnx_model.convert2onnx(image, text, verbose=True)
# Provider names listed by the original author:
# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
onnx_model.start_sessions(providers=["CPUExecutionProvider"]) # CPU mode

[CLIP ONNX] Start convert visual model
[CLIP ONNX] Start check visual model
[CLIP ONNX] Start convert textual model


  "If indices include negative values, the exported graph will produce incorrect results.")


[CLIP ONNX] Start check textual model
[CLIP ONNX] Models converts successfully


In [7]:
# Create a new wrapper and load the visual/textual ONNX files from /content,
# passing the exponentiated logit scale of the torch model.
# presumably these are the files written by the conversion cell — TODO confirm the paths.
onnx_model = clip_onnx(model)
onnx_model.load_onnx("/content/clip_visual.onnx",
                     "/content/clip_textual.onnx",
                     model.logit_scale.exp())
onnx_model.start_sessions(providers=["CPUExecutionProvider"]) # CPU mode

In [8]:
# Check which execution providers the visual session reports.
onnx_model.visual_session.get_providers()

['CPUExecutionProvider']

## Benchmark

In [9]:
# Load the torch ViT-B/32 CLIP model on CPU again for the benchmark.
model, preprocess = clip.load("ViT-B/32", device="cpu", jit=False)

In [10]:
# Switch the torch model to evaluation mode and turn off gradient tracking
# for every parameter.
model.eval()
for param in model.parameters():
    param.requires_grad_(False)

In [11]:
import numpy, random, torch

In [12]:
def set_seed(seed: int = 12) -> None:
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value passed to every generator. Defaults to 12, the value that
            used to be hard-coded, so ``set_seed()`` behaves exactly as before.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # Request deterministic cuDNN algorithms.
    torch.backends.cudnn.deterministic = True

In [13]:
import torch
import time

n = 5  # number of timed calls averaged per measurement
clip_results = {"encode_image": [],
                "encode_text": []}
onnx_results = {"encode_image": [],
                "encode_text": []}


def _mean_seconds(make_input, encode, repeats):
    """Return the mean duration in seconds of ``encode(make_input())``.

    The random generators are re-seeded with ``set_seed()`` first, so every
    measurement draws the same sequence of inputs. Building the input is not
    part of the timed region. The mean over ``repeats`` calls is rounded to
    three decimals.

    NOTE: there is no warm-up call, so the first call is included in the mean.
    """
    set_seed()
    durations = []
    for _ in range(repeats):
        sample = make_input()
        # perf_counter is monotonic and high-resolution, unlike time.time().
        started = time.perf_counter()
        encode(sample)
        durations.append(time.perf_counter() - started)
    return round(sum(durations) / repeats, 3)


# (encoder method name, builder of a random torch input for a batch size,
#  dtype of the NumPy copy that is passed to the ONNX model)
_workloads = [
    ("encode_image", lambda size: torch.randint(1, 255, (size, 3, 224, 224)), np.float32),
    ("encode_text", lambda size: torch.randint(320, 49407, (size, 77)), np.int32),
]

for batch in [2, 8, 16, 32, 64]:
    for method, make_torch_input, onnx_dtype in _workloads:
        # ONNX: the random torch tensor is converted to a NumPy array before timing.
        mean = _mean_seconds(
            lambda: make_torch_input(batch).detach().cpu().numpy().astype(onnx_dtype),
            getattr(onnx_model, method),
            n,
        )
        print("onnx", batch, method, mean)
        torch.cuda.empty_cache()
        onnx_results[method].append([batch, mean])

        # Torch: the same inputs are fed to the torch model under inference mode.
        with torch.inference_mode():
            mean = _mean_seconds(
                lambda: make_torch_input(batch),
                getattr(model, method),
                n,
            )
        print("torch", batch, method, mean)
        torch.cuda.empty_cache()
        clip_results[method].append([batch, mean])

    print("-" * 78)

onnx 2 encode_image 0.234
torch 2 encode_image 0.343
onnx 2 encode_text 0.162
torch 2 encode_text 0.243
------------------------------------------------------------------------------
onnx 8 encode_image 0.923
torch 8 encode_image 1.093
onnx 8 encode_text 0.656
torch 8 encode_text 0.831
------------------------------------------------------------------------------
onnx 16 encode_image 2.079
torch 16 encode_image 1.952
onnx 16 encode_text 1.288
torch 16 encode_text 1.523
------------------------------------------------------------------------------
onnx 32 encode_image 3.937
torch 32 encode_image 4.079
onnx 32 encode_text 2.658
torch 32 encode_text 3.015
------------------------------------------------------------------------------
onnx 64 encode_image 7.944
torch 64 encode_image 8.07
onnx 64 encode_text 5.567
torch 64 encode_text 6.212
------------------------------------------------------------------------------


In [14]:
import pandas as pd

In [15]:
# Side-by-side table with one "onnx" row followed by one "torch" row per batch size.
# The batch sizes and the number of rows come from the recorded results
# ([batch, mean_seconds] pairs) instead of a hard-coded list, so the labels
# cannot drift away from what was actually benchmarked.
comparison_rows = []
for onnx_image, torch_image, onnx_text, torch_text in zip(
        onnx_results["encode_image"], clip_results["encode_image"],
        onnx_results["encode_text"], clip_results["encode_text"]):
    comparison_rows.append({"backend": "onnx",
                            "batch": onnx_image[0],
                            "encode_image": onnx_image[1],
                            "encode_text": onnx_text[1]})
    comparison_rows.append({"backend": "torch",
                            "batch": torch_image[0],
                            "encode_image": torch_image[1],
                            "encode_text": torch_text[1]})
pd.DataFrame(comparison_rows,
             columns=["backend", "batch", "encode_image", "encode_text"])

Unnamed: 0,backend,batch,encode_image,encode_text
0,onnx,2,0.234,0.162
1,torch,2,0.343,0.243
2,onnx,8,0.923,0.656
3,torch,8,1.093,0.831
4,onnx,16,2.079,1.288
5,torch,16,1.952,1.523
6,onnx,32,3.937,2.658
7,torch,32,4.079,3.015
8,onnx,64,7.944,5.567
9,torch,64,8.07,6.212


In [16]:
# ONNX timings, one row per benchmarked batch size. Batch sizes and row count are
# read from the recorded [batch, mean_seconds] pairs rather than hard-coded.
onnx_df = pd.DataFrame({
    "ONNX": ["ViT-B/32"] * len(onnx_results["encode_image"]),
    "batch": [row[0] for row in onnx_results["encode_image"]],
    "encode_image": [row[1] for row in onnx_results["encode_image"]],
    "encode_text": [row[1] for row in onnx_results["encode_text"]],
})
# Combined time of both encoders.
onnx_df["total"] = onnx_df["encode_image"] + onnx_df["encode_text"]

In [17]:
# Display the ONNX timing table.
onnx_df

Unnamed: 0,ONNX,batch,encode_image,encode_text,total
0,ViT-B/32,2,0.234,0.162,0.396
1,ViT-B/32,8,0.923,0.656,1.579
2,ViT-B/32,16,2.079,1.288,3.367
3,ViT-B/32,32,3.937,2.658,6.595
4,ViT-B/32,64,7.944,5.567,13.511


In [18]:
# Print the ONNX timing table as a Markdown table without the index column.
print(onnx_df.to_markdown(index=False))

| ONNX     |   batch |   encode_image |   encode_text |   total |
|:---------|--------:|---------------:|--------------:|--------:|
| ViT-B/32 |       2 |          0.234 |         0.162 |   0.396 |
| ViT-B/32 |       8 |          0.923 |         0.656 |   1.579 |
| ViT-B/32 |      16 |          2.079 |         1.288 |   3.367 |
| ViT-B/32 |      32 |          3.937 |         2.658 |   6.595 |
| ViT-B/32 |      64 |          7.944 |         5.567 |  13.511 |


In [19]:
# Torch timings, one row per benchmarked batch size. Batch sizes and row count are
# read from the recorded [batch, mean_seconds] pairs rather than hard-coded.
clip_df = pd.DataFrame({
    "TORCH": ["ViT-B/32"] * len(clip_results["encode_image"]),
    "batch": [row[0] for row in clip_results["encode_image"]],
    "encode_image": [row[1] for row in clip_results["encode_image"]],
    "encode_text": [row[1] for row in clip_results["encode_text"]],
})
# Combined time of both encoders.
clip_df["total"] = clip_df["encode_image"] + clip_df["encode_text"]

In [20]:
# Print the torch timing table as a Markdown table without the index column.
print(clip_df.to_markdown(index=False))

| TORCH    |   batch |   encode_image |   encode_text |   total |
|:---------|--------:|---------------:|--------------:|--------:|
| ViT-B/32 |       2 |          0.343 |         0.243 |   0.586 |
| ViT-B/32 |       8 |          1.093 |         0.831 |   1.924 |
| ViT-B/32 |      16 |          1.952 |         1.523 |   3.475 |
| ViT-B/32 |      32 |          4.079 |         3.015 |   7.094 |
| ViT-B/32 |      64 |          8.07  |         6.212 |  14.282 |
