In [2]:
import torch
from PIL import Image
import open_clip

# Build the ViT-B-32 CLIP model from the 'laion2b_s34b_b79k' pretrained tag together
# with its image transform. The middle return value is not needed in this cell.
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-B-32',
    pretrained='laion2b_s34b_b79k',
)
# The model comes back in train mode; switch to eval so layers such as BatchNorm
# or stochastic depth (where a model has them) behave deterministically.
model.eval()
tokenizer = open_clip.get_tokenizer('ViT-B-32')

# One preprocessed image with a leading batch dimension, and three candidate captions.
image = preprocess(Image.open("docs/CLIP.png")).unsqueeze(0)
text = tokenizer(["a diagram", "a dog", "a cat"])

# Inference only, so gradient tracking is disabled.
# (Mixed precision can be added by also entering torch.cuda.amp.autocast().)
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # Scale each embedding to unit length, in place, along the feature dimension.
    image_features.div_(image_features.norm(dim=-1, keepdim=True))
    text_features.div_(text_features.norm(dim=-1, keepdim=True))

    # Scaled image-text similarities turned into a distribution over the captions.
    text_probs = torch.softmax(100.0 * image_features @ text_features.T, dim=-1)

print("Label probs:", text_probs)  # approximately [[1., 0., 0.]]

Label probs: tensor([[9.9950e-01, 4.1207e-04, 8.5316e-05]])


In [1]:
import open_clip
# Display every (architecture, pretrained tag) pair known to open_clip.
# The result is a long list of 2-tuples such as ('ViT-B-32', 'laion2b_s34b_b79k').
open_clip.list_pretrained()

  from .autonotebook import tqdm as notebook_tqdm


[('RN50', 'openai'),
 ('RN50', 'yfcc15m'),
 ('RN50', 'cc12m'),
 ('RN101', 'openai'),
 ('RN101', 'yfcc15m'),
 ('RN50x4', 'openai'),
 ('RN50x16', 'openai'),
 ('RN50x64', 'openai'),
 ('ViT-B-32', 'openai'),
 ('ViT-B-32', 'laion400m_e31'),
 ('ViT-B-32', 'laion400m_e32'),
 ('ViT-B-32', 'laion2b_e16'),
 ('ViT-B-32', 'laion2b_s34b_b79k'),
 ('ViT-B-32', 'datacomp_xl_s13b_b90k'),
 ('ViT-B-32', 'datacomp_m_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_clip_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_laion_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_image_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_text_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_basic_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_s128m_b4k'),
 ('ViT-B-32', 'datacomp_s_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_clip_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_laion_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_image_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_text_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_basic_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_s13m_b4k'),
 ('ViT-

In [1]:
import timm
import torch
# NOTE(review): the star import is kept because importing this module presumably
# registers the `*_tome` architectures with timm — confirm, then import explicitly.
from src.open_clip.tome import *

# Geometry implied by the model name (patch16, 224): a 14 x 14 grid of patches.
IMG_SIZE = 224
PATCH_SIZE = 16
NUM_PATCHES = (IMG_SIZE // PATCH_SIZE) ** 2  # 196; assumes no extra class token — confirm
bs = 2

model = timm.create_model('vit_base_patch16_siglip_224_tome', pretrained=False, r_total=12)

dummy_input = torch.randn(bs, 3, IMG_SIZE, IMG_SIZE)

# Only output shapes and the padding mask are inspected here, so run the forward
# passes without building an autograd graph.
with torch.no_grad():
    proj_feat = model(dummy_input)
    print(proj_feat.shape)
    feat_before_pooling, padding_mask = model.forward_features(dummy_input)
    print(feat_before_pooling.shape, padding_mask)

if padding_mask is not None:
    # Total patch tokens in the batch minus the mask entries equal to 0.
    # NOTE(review): presumably 0 marks a token that is still present — verify in tome.py.
    print("num removed token:", NUM_PATCHES * bs - (padding_mask==0).sum())

  from .autonotebook import tqdm as notebook_tqdm


set total avg remove token nums each layer as:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
merge mode:  batch_level
> /shared/nas/data/m1/wangz3/salesforce_intern/open_clip_merging/src/open_clip/tome.py(661)forward()
    659         if r > 0:
    660             import pdb; pdb.set_trace()
--> 661             x, padding_mask = self.merge_tokens(metric, r, x)
    662
    663         x = x + self.drop_path2(self.ls2(

In [18]:
# Scratch check of the expected remaining token count: 196 tokens for each of 2 samples,
# minus 12 tokens per sample (presumably r_total=12 from the ToMe model cell — confirm).
196 * 2 - 2 * 12


368

In [2]:
import timm
from src.open_clip.tome import *
# Instantiate timm's 'vit_large_patch14_clip_336' with default arguments
# (no `pretrained` flag is passed) so its layout can be inspected below.
model = timm.create_model('vit_large_patch14_clip_336')

In [3]:
# Summarise the architecture of `model`, one value per line.
print(
    model.patch_embed.img_size,      # input resolution expected by the patch embedding
    model.embed_dim,                 # embedding width
    len(model.blocks),               # number of transformer blocks
    model.blocks[0].attn.num_heads,  # attention heads in the first block
    sep="\n",
)

(336, 336)
1024
24
16


In [4]:
# Last expression of the cell: renders the model's repr, i.e. its nested module tree.
model

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=1024, out_features=3072, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1024, out_features=1024, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (act): GELU(approximate='none')
        (

In [1]:
import timm
from src.open_clip.tome import *
# Build the ToMe variant of the OpenAI CLIP ViT-L/14-336 tower with pretrained weights.
# NOTE(review): in the recorded run this call raises RuntimeError while loading the
# state_dict: "patch_embed.proj.bias" and the "attn_pool.*" keys are missing from the
# checkpoint, and "norm_pre.*" keys are unexpected. Presumably the model definition has
# to be aligned with the checkpoint layout — confirm in tome.py.
model = timm.create_model('openai_clip_vit_l14_336_tome', pretrained=True) # class token == True for pretrained

  from .autonotebook import tqdm as notebook_tqdm


set total avg remove token nums each layer as:  [8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]
merge mode:  batch_level


RuntimeError: Error(s) in loading state_dict for ToMEVisionTransformer:
	Missing key(s) in state_dict: "patch_embed.proj.bias", "attn_pool.latent", "attn_pool.q.weight", "attn_pool.q.bias", "attn_pool.kv.weight", "attn_pool.kv.bias", "attn_pool.proj.weight", "attn_pool.proj.bias", "attn_pool.norm.weight", "attn_pool.norm.bias", "attn_pool.mlp.fc1.weight", "attn_pool.mlp.fc1.bias", "attn_pool.mlp.fc2.weight", "attn_pool.mlp.fc2.bias". 
	Unexpected key(s) in state_dict: "norm_pre.bias", "norm_pre.weight". 

In [2]:
# Summarise the architecture of `model`, one value per line: input resolution,
# embedding width, number of transformer blocks, attention heads in the first block.
# NOTE(review): the model-creation cell recorded just before this one ended in a
# RuntimeError, so `model` here presumably comes from earlier kernel state — confirm.
print("\n".join(str(field) for field in (
    model.patch_embed.img_size,
    model.embed_dim,
    len(model.blocks),
    model.blocks[0].attn.num_heads,
)))

(336, 336)
1024
24
16


In [5]:
import torch
# Print the directory torch.hub currently uses as its cache location.
print(torch.hub.get_dir())

/home/wangz3/.cache/torch/hub
