## Restart colab session after installation
Reload the session if something doesn't work

In [1]:
%%capture
# Install the CLIP-ONNX wrapper and OpenAI CLIP straight from their GitHub repositories,
# plus the onnxruntime-gpu package. %%capture hides the installer output.
# NOTE(review): none of these installs is pinned to a version or commit; consider pinning
# them so the notebook stays reproducible.
!pip install git+https://github.com/Lednik7/CLIP-ONNX.git
!pip install git+https://github.com/openai/CLIP.git
!pip install onnxruntime-gpu

In [2]:
%%capture
# Download CLIP.png from the OpenAI CLIP repository into the working directory
# (-c resumes a partial download, -O sets the output file name).
!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true

In [3]:
# Report the GPU attached to this runtime, together with the driver and CUDA versions.
!nvidia-smi

Thu Jan  6 15:47:04 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8    28W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import onnxruntime
# Print the device reported by ONNX Runtime (the recorded run printed GPU).
print(onnxruntime.get_device())

GPU


## GPU inference mode
Select a runtime GPU to continue:

Click Runtime -> Change runtime type -> set "Hardware accelerator" to GPU, then save. The notebook should then be connected to a GPU runtime.

### Torch CLIP

In [1]:
import clip
from PIL import Image
import numpy as np

# Load the ViT-B/32 CLIP model and its image preprocessing callable on the CPU with jit=False.
# The original note here read "onnx cannot work with cuda"; presumably the ONNX export
# needs the PyTorch model on the CPU (TODO confirm against clip_onnx).
model, preprocess = clip.load("ViT-B/32", device="cpu", jit=False)

# Sample image input, batch dimension first.
image = preprocess(Image.open("CLIP.png")).unsqueeze(0) # [1, 3, 224, 224]
# float32 NumPy copy of the image tensor, presumably for feeding the ONNX session directly.
image_onnx = image.detach().cpu().numpy().astype(np.float32)

# Tokenized sample prompts, batch dimension first.
text = clip.tokenize(["a diagram", "a dog", "a cat"]) # [3, 77]
# int64 NumPy copy of the token ids, presumably for feeding the ONNX session directly.
text_onnx = text.detach().cpu().numpy().astype(np.int64)

### CLIP-ONNX

In [4]:
from clip_onnx import clip_onnx, attention
# Replace CLIP's ResidualAttentionBlock.attention with the implementation shipped by clip_onnx
# (presumably an export-friendly variant; TODO confirm against clip_onnx).
clip.model.ResidualAttentionBlock.attention = attention

# Wrap the PyTorch model and export it to ONNX, passing the sample image and text tensors.
onnx_model = clip_onnx(model)
onnx_model.convert2onnx(image, text, verbose=False)
# Provider names: ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
onnx_model.start_sessions(providers=["CPUExecutionProvider"]) # CPU provider is selected here; use "CUDAExecutionProvider" for GPU

  head_dim = q.shape[2] // num_heads
  "If indices include negative values, the exported graph will produce incorrect results.")


In [2]:
from clip_onnx import clip_onnx, attention
# Patch CLIP's ResidualAttentionBlock.attention with the clip_onnx implementation.
# NOTE(review): this repeats the import and patch already done in the export cell.
clip.model.ResidualAttentionBlock.attention = attention

In [3]:
# Wrap the PyTorch model, then load the visual and textual ONNX graphs from /content
# (presumably the files written by convert2onnx; TODO confirm) together with the model's
# exponentiated logit scale.
onnx_model = clip_onnx(model)
onnx_model.load_onnx("/content/clip_visual.onnx",
                     "/content/clip_textual.onnx",
                     model.logit_scale.exp())
onnx_model.start_sessions(providers=["CPUExecutionProvider"]) # CPU provider is selected here; use "CUDAExecutionProvider" for GPU

## Benchmark

In [4]:
# Keep the PyTorch model on the CPU in evaluation mode and freeze every parameter,
# so no gradients are tracked while benchmarking.
model.cpu().eval()
for weight in model.parameters():
    weight.requires_grad_(False)

In [7]:
import numpy, random, torch

In [9]:
def set_seed(seed: int = 12) -> None:
    """Seed every random number generator the benchmark draws from.

    Seeds PyTorch (CPU and CUDA), NumPy's global generator and Python's
    ``random`` module, then turns on cuDNN's deterministic mode.

    Args:
        seed: Integer seed shared by all generators. Defaults to 12, the value
            this notebook previously hard-coded, so ``set_seed()`` keeps
            producing the same random inputs as before.
    """
    torch.manual_seed(seed)
    # Seeds every CUDA device; the call is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)

    torch.backends.cudnn.deterministic = True

In [10]:
import torch
import time

n = 5  # timed repetitions per measurement
# Each list collects [batch, mean_seconds] entries, one per benchmarked batch size.
clip_results = {"encode_image": [],
                "encode_text": []}
onnx_results = {"encode_image": [],
                "encode_text": []}


def _mean_seconds(encode, inputs, repeats):
    """Return the mean wall-clock seconds of ``encode(inputs)`` over ``repeats`` calls.

    Each call is timed separately with ``time.perf_counter`` (a monotonic,
    high-resolution clock) and the mean is rounded to 3 decimals.
    """
    elapsed = []
    for _ in range(repeats):
        started = time.perf_counter()
        encode(inputs)
        elapsed.append(time.perf_counter() - started)
    return round(sum(elapsed) / repeats, 3)


for batch in [2, 8, 16, 32, 64]:
    # Re-seed so every batch size draws its random inputs from the same generator state.
    set_seed()
    image_input = torch.randint(1, 255, (batch, 3, 224, 224)).cpu()
    text_input = torch.randint(320, 49407, (batch, 77)).cpu()
    # NumPy copies of the same inputs for the ONNX sessions.
    image_input_onnx = image_input.detach().cpu().numpy().astype(np.float32)
    text_input_onnx = text_input.detach().cpu().numpy().astype(np.int64)

    # Same order as before: onnx image, torch image, onnx text, torch text.
    for method, onnx_input, torch_input in (
            ("encode_image", image_input_onnx, image_input),
            ("encode_text", text_input_onnx, text_input)):
        mean_time = _mean_seconds(getattr(onnx_model, method), onnx_input, n)
        print("onnx", batch, method, mean_time)
        torch.cuda.empty_cache()
        onnx_results[method].append([batch, mean_time])

        # Only the PyTorch calls run under inference_mode.
        with torch.inference_mode():
            mean_time = _mean_seconds(getattr(model, method), torch_input, n)
        print("torch", batch, method, mean_time)
        torch.cuda.empty_cache()
        clip_results[method].append([batch, mean_time])

    print("-" * 78)

onnx 2 encode_image 0.399
torch 2 encode_image 0.402
onnx 2 encode_text 0.219
torch 2 encode_text 0.269
------------------------------------------------------------------------------
onnx 8 encode_image 1.206
torch 8 encode_image 1.21
onnx 8 encode_text 0.791
torch 8 encode_text 0.849
------------------------------------------------------------------------------
onnx 16 encode_image 2.342
torch 16 encode_image 2.329
onnx 16 encode_text 1.542
torch 16 encode_text 1.678
------------------------------------------------------------------------------
onnx 32 encode_image 4.583
torch 32 encode_image 4.662
onnx 32 encode_text 3.199
torch 32 encode_text 3.492
------------------------------------------------------------------------------
onnx 64 encode_image 9.372
torch 64 encode_image 9.487
onnx 64 encode_text 6.885
torch 64 encode_text 7.049
------------------------------------------------------------------------------


In [11]:
import pandas as pd

In [12]:
# Side-by-side comparison table: for every benchmarked batch size, one "onnx" row followed by
# one "torch" row. Batch sizes are read from the recorded [batch, seconds] entries instead of
# being repeated here, so the table stays in sync with the benchmark loop.
comparison_rows = []
for onnx_image, onnx_text, torch_image, torch_text in zip(onnx_results["encode_image"],
                                                          onnx_results["encode_text"],
                                                          clip_results["encode_image"],
                                                          clip_results["encode_text"]):
    comparison_rows.append({"backend": "onnx", "batch": onnx_image[0],
                            "encode_image": onnx_image[1], "encode_text": onnx_text[1]})
    comparison_rows.append({"backend": "torch", "batch": torch_image[0],
                            "encode_image": torch_image[1], "encode_text": torch_text[1]})
# Passing columns= fixes the column order and keeps the header even when there are no rows.
pd.DataFrame(comparison_rows, columns=["backend", "batch", "encode_image", "encode_text"])

Unnamed: 0,backend,batch,encode_image,encode_text
0,onnx,2,0.399,0.219
1,torch,2,0.402,0.269
2,onnx,8,1.206,0.791
3,torch,8,1.21,0.849
4,onnx,16,2.342,1.542
5,torch,16,2.329,1.678
6,onnx,32,4.583,3.199
7,torch,32,4.662,3.492
8,onnx,64,9.372,6.885
9,torch,64,9.487,7.049


In [13]:
# ONNX timing table. Row count and batch sizes are taken from the recorded
# [batch, seconds] entries, so they cannot drift from the benchmark loop.
onnx_image_runs = onnx_results["encode_image"]
onnx_text_runs = onnx_results["encode_text"]
onnx_df = pd.DataFrame({"CPU": ["onnx"] * len(onnx_image_runs),
              "batch": [entry[0] for entry in onnx_image_runs],
              "encode_image": [entry[1] for entry in onnx_image_runs],
              "encode_text": [entry[1] for entry in onnx_text_runs]})
# "summary" is the combined mean time of one image call plus one text call.
onnx_df["summary"] = onnx_df["encode_image"] + onnx_df["encode_text"]

In [15]:
# Display the ONNX timing table (mean seconds per call for each batch size).
onnx_df

Unnamed: 0,CPU,batch,encode_image,encode_text,summary
0,onnx,2,0.399,0.219,0.618
1,onnx,8,1.206,0.791,1.997
2,onnx,16,2.342,1.542,3.884
3,onnx,32,4.583,3.199,7.782
4,onnx,64,9.372,6.885,16.257


In [26]:
# Emit the ONNX timing table as Markdown text, without the index column.
print(onnx_df.to_markdown(index=False))

| CPU   |   batch |   encode_image |   encode_text |   summary |
|:------|--------:|---------------:|--------------:|----------:|
| onnx  |       2 |          0.399 |         0.219 |     0.618 |
| onnx  |       8 |          1.206 |         0.791 |     1.997 |
| onnx  |      16 |          2.342 |         1.542 |     3.884 |
| onnx  |      32 |          4.583 |         3.199 |     7.782 |
| onnx  |      64 |          9.372 |         6.885 |    16.257 |


In [16]:
# PyTorch timing table. Row count and batch sizes are taken from the recorded
# [batch, seconds] entries, so they cannot drift from the benchmark loop.
clip_image_runs = clip_results["encode_image"]
clip_text_runs = clip_results["encode_text"]
clip_df = pd.DataFrame({"CPU": ["torch"] * len(clip_image_runs),
              "batch": [entry[0] for entry in clip_image_runs],
              "encode_image": [entry[1] for entry in clip_image_runs],
              "encode_text": [entry[1] for entry in clip_text_runs]})
# "summary" is the combined mean time of one image call plus one text call.
clip_df["summary"] = clip_df["encode_image"] + clip_df["encode_text"]

In [25]:
# Emit the PyTorch timing table as Markdown text, without the index column.
print(clip_df.to_markdown(index=False))

| CPU   |   batch |   encode_image |   encode_text |   summary |
|:------|--------:|---------------:|--------------:|----------:|
| torch |       2 |          0.402 |         0.269 |     0.671 |
| torch |       8 |          1.21  |         0.849 |     2.059 |
| torch |      16 |          2.329 |         1.678 |     4.007 |
| torch |      32 |          4.662 |         3.492 |     8.154 |
| torch |      64 |          9.487 |         7.049 |    16.536 |
