In [1]:
import utilities
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import open_clip
import os
import torch.nn.utils.prune as prune

In [25]:
# Show the layout of one residual attention block of the vision transformer.
# NOTE: `model` is not assigned above this cell; it is bound in cells further
# down (e.g. `model = backbone.visual`), so one of those has to run first.
model.transformer.resblocks[2]

ResidualAttentionBlock(
  (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  (attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True)
  )
  (ls_1): Identity()
  (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  (mlp): Sequential(
    (c_fc): Linear(in_features=1280, out_features=5120, bias=True)
    (gelu): GELU(approximate='none')
    (c_proj): Linear(in_features=5120, out_features=1280, bias=True)
  )
  (ls_2): Identity()
)

In [None]:
# Load the slim visual-encoder weights.  map_location='cpu' keeps the load
# independent of the device the checkpoint was saved from.
checkpoint = torch.load(
    'my_experiments/ViT-H-14-laion2b_s32b_b79k-image_net-v2-product-10k-ArcFace(k=3)-All-Reduce-LR-1/model_epoch_6_mAP3_0.58_slim_f16.pt',
    map_location='cpu',
)
backbone, _, _ = open_clip.create_model_and_transforms('ViT-H-14', None)
model = backbone.visual
model.load_state_dict(checkpoint)

# One (module, parameter_name) pair per tensor that takes part in the global
# pruning: both LayerNorms, the attention output projection and the two MLP
# linears of every residual block.
parameters_to_prune = tuple(
    (module, 'weight')
    for block in model.transformer.resblocks
    for module in (block.ln_1, block.attn.out_proj, block.ln_2,
                   block.mlp.c_fc, block.mlp.c_proj)
)

# NOTE(review): an integer `amount` is an absolute number of weights, so this
# prunes exactly 10 weights across all listed tensors.  If a fraction was
# intended (e.g. 10 %), it has to be passed as a float such as 0.1.
prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=10,
)

# Count zero entries over all module buffers (the pruning masks are registered
# as buffers).  Tensor.sum() replaces the builtin sum(), which would iterate
# over every element in Python.
print((torch.nn.utils.parameters_to_vector(model.buffers()) == 0).sum())

In [2]:
class Head(nn.Module):
    """Embedding head placed on top of the image encoder.

    Reads the module-level ``CFG`` at construction time (``CFG.emb_size`` and,
    unless ``slim`` is set, ``CFG.n_classes``), so ``CFG`` must be defined
    before a ``Head`` is instantiated.

    Args:
        hidden_size: number of input features fed to the bias-free ``emb`` layer.
        slim: when truthy, no ``arc`` classification layer is created and
            ``forward`` returns only the normalized embeddings.
    """

    def __init__(self, hidden_size, slim):
        super().__init__()
        self.emb = nn.Linear(hidden_size, CFG.emb_size, bias=False)
        self.slim = slim
        if not self.slim:
            self.arc = utilities.ArcMarginProduct_subcenter(CFG.emb_size, CFG.n_classes)

        self.dropout = utilities.Multisample_Dropout()

    def forward(self, x):
        """Return ``F.normalize(embeddings)``, preceded by the ``arc`` output when not slim."""
        # The dropout helper receives both the features and the projection
        # layer; what it does with them is defined in `utilities`.
        projected = self.dropout(x, self.emb)
        if self.slim:
            return F.normalize(projected)
        logits = self.arc(projected)
        return logits, F.normalize(projected)
class Model(nn.Module):
    """Image encoder followed by an embedding ``Head``.

    Args:
        vit_backbone: object exposing the image tower as ``.visual``.
        head_size: input feature size handed to ``Head``.
        version: ``'v1'`` uses ``vit_backbone.visual`` as encoder,
            ``'v2'`` uses ``vit_backbone.visual.trunk``.
        slim: forwarded to ``Head``.

    Raises:
        ValueError: if ``version`` is neither ``'v1'`` nor ``'v2'``.
    """

    def __init__(self, vit_backbone, head_size, version='v1', slim=False):
        super().__init__()
        if version == 'v1':
            self.encoder = vit_backbone.visual
        elif version == 'v2':
            self.encoder = vit_backbone.visual.trunk
        else:
            # Previously an unknown version left `self.encoder` unset and the
            # error only surfaced later inside forward().
            raise ValueError(f"unknown version {version!r}; expected 'v1' or 'v2'")

        self.head = Head(head_size, slim)

    def forward(self, x):
        """Encode ``x`` and pass the result through the head."""
        x = self.encoder(x)
        return self.head(x)


In [None]:
# Slim checkpoints whose weights are combined further down in this cell.
path_list = [
    'my_experiments/ViT-H-14-laion2b_s32b_b79k-cut_out-product-10k/model_best_epoch_2_mAP3_0.55_slim.pt',
    'my_experiments/ViT-H-14-laion2b_s32b_b79k-happy_whale-product-10k/model_best_epoch_3_mAP3_0.54_slim.pt',
    'my_experiments/ViT-H-14-laion2b_s32b_b79k-None-product-10k/model_best_epoch_2_mAP3_0.53_slim.pt',
    'my_experiments/vit_h_224_products-10k/model_best_epoch_3_mAP3_0.53_slim.pt',
]

class CFG:
    # Settings read by this cell and by Head/Model when they are built.
    emb_size = 512            # output width of the head's `emb` layer
    model_name = 'ViT-H-14'   # architecture name handed to open_clip
    hidden_layer = 1024       # passed to Model as `head_size`
    version = 'v1'            # selects which part of the backbone Model uses
    n_classes = 9004          # read by the non-slim Head for its `arc` layer


backbone, _, _ = open_clip.create_model_and_transforms(CFG.model_name, None)

# Load models weights.
# Every Model built below wraps the *same* `backbone.visual` module, and
# state_dict() returns tensors that share storage with the live parameters.
# load_state_dict() copies into those parameters in place, so without the
# clone each earlier snapshot would be overwritten by the next checkpoint and
# the "average" of the encoder would just be the last checkpoint.
weight_list = []

for path in path_list:
    model = Model(backbone, CFG.hidden_layer, CFG.version, True)
    load_result = model.load_state_dict(torch.load(path, map_location='cpu'), strict=False)
    # strict=False is kept, but skipped keys are reported instead of being ignored silently.
    if load_result.missing_keys or load_result.unexpected_keys:
        print(f'{path}: missing={load_result.missing_keys} unexpected={load_result.unexpected_keys}')
    weight_list.append({k: v.detach().clone() for k, v in model.state_dict().items()})

# Average weights (element-wise mean over the checkpoints, key by key).
state_dict = {
    k: torch.stack([weights[k] for weights in weight_list]).mean(0)
    for k in weight_list[0]
}
model.load_state_dict(state_dict)

torch.save(model.state_dict(), f'my_experiments/{CFG.model_name}-soup.pt')

In [None]:
def slim_model(model_path, CFG):
    """Write a slim copy of a training checkpoint next to the original file.

    The checkpoint's ``'model_state_dict'`` entry is loaded into a full
    ``Model``; its ``head.emb`` and ``encoder`` are then attached to a ``Model``
    built with ``slim=True`` and that model's state dict is saved to
    ``<model_path without extension>_slim.pt``.

    Args:
        model_path: path of the checkpoint file to read.
        CFG: object providing ``model_name``, ``hidden_layer`` and ``version``.
            NOTE(review): if the ``Head`` in scope reads the module-level
            ``CFG`` (as the definition earlier in this notebook does), its
            sizes come from that global, not from this argument.

    Raises:
        KeyError: if the checkpoint has no ``'model_state_dict'`` entry.
    """
    name = os.path.splitext(model_path)[0]

    # Map to CPU so loading does not depend on the device the checkpoint was
    # saved from; the weights are copied into CPU modules right below anyway.
    checkpoint = torch.load(model_path, map_location='cpu')
    backbone, _, _ = open_clip.create_model_and_transforms(CFG.model_name, None)

    model = Model(backbone, CFG.hidden_layer, CFG.version)
    model.load_state_dict(checkpoint['model_state_dict'])

    model_slim = Model(backbone, CFG.hidden_layer, CFG.version, True)
    model_slim.head.emb = model.head.emb
    model_slim.encoder = model.encoder

    torch.save(model_slim.state_dict(), name + '_slim.pt')
    
class CFG:
    # Settings for the checkpoints slimmed in this cell.
    emb_size = 512            # output width of the head's `emb` layer
    model_name = 'ViT-H-14'   # architecture name handed to open_clip
    hidden_layer = 1024       # passed to Model as `head_size`
    version = 'v1'            # selects which part of the backbone Model uses
    n_classes = 9691          # read by the non-slim Head for its `arc` layer
    

# Slim the epoch 1, 2 and 3 checkpoints of this run, in that order.
for epoch in (1, 2, 3):
    slim_model(
        f'my_experiments/ViT-H-14-laion2b_s32b_b79k-image_net-product-10k-all/model_epoch_{epoch}_mAP5_0.55.pt',
        CFG(),
    )


In [2]:
# Build the model together with its two preprocessing pipelines and display
# them.  In the recorded output `t` starts with RandomResizedCrop, while `p`
# uses Resize + CenterCrop; both end with the same ToTensor/Normalize steps.
backbone, t, p = open_clip.create_model_and_transforms('ViT-H-14', None)
t, p

(Compose(
     RandomResizedCrop(size=(224, 224), scale=(0.9, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic), antialias=None)
     <function _convert_to_rgb at 0x7fb1993af9d0>
     ToTensor()
     Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
 ),
 Compose(
     Resize(size=224, interpolation=bicubic, max_size=None, antialias=None)
     CenterCrop(size=(224, 224))
     <function _convert_to_rgb at 0x7fb1993af9d0>
     ToTensor()
     Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
 ))

In [None]:

# Map to CPU so the load does not depend on the device the checkpoint was
# saved from.
checkpoint = torch.load(
    'my_experiments/ViT-H-14-laion2b_s32b_b79k-image_net-v2-product-10k/model_epoch_2_mAP3_0.57.pt',
    map_location='cpu',
)
backbone, _, _ = open_clip.create_model_and_transforms('ViT-H-14', None)

# NOTE(review): with the four-argument Model defined earlier in this notebook,
# 'v2' selects `backbone.visual.trunk` — confirm the 'ViT-H-14' image tower
# actually has a `trunk` attribute.
model = Model(backbone, 1024, 'v2')
model.load_state_dict(checkpoint['model_state_dict'])

# Re-use the loaded projection layer and encoder in a model built without the
# classification layer, so only those weights end up in the saved state dict.
model_slim = Model(backbone, 1024, 'v2', True)
model_slim.head.emb = model.head.emb
model_slim.encoder = model.encoder

# NOTE(review): the destination is a ViT-L-14 run directory and the file name
# carries a different epoch/mAP than the ViT-H-14 checkpoint loaded above —
# confirm this output path is intended.
torch.save(
    model_slim.state_dict(),
    'my_experiments/ViT-L-14-laion2b_s32b_b82k-image_net-v2-product-10k-ArcFace(k=3)-All/model_best_epoch_4_mAP3_0.55_slim.pt',
)

In [2]:
import utilities
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import open_clip
import os

class Head(nn.Module):
    """Placeholder head that owns no layers.

    ``hidden_size`` is accepted but not used.  ``arc`` starts out as ``None``,
    so ``forward`` raises ``TypeError`` unless a callable is assigned to
    ``arc`` first.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.arc = None

    def forward(self, x):
        """Return ``(self.arc(x), F.normalize(x))``."""
        logits = self.arc(x)
        return logits, F.normalize(x)
    
class Model(nn.Module):
    """Wraps ``vit_backbone.visual`` as ``encoder`` and adds a ``Head``.

    Args:
        vit_backbone: object exposing the image tower as ``.visual``.
        head_size: forwarded to ``Head``.
    """

    def __init__(self, vit_backbone, head_size):
        super().__init__()
        self.encoder = vit_backbone.visual
        self.head = Head(head_size)

    def forward(self, x):
        """Run the encoder, then the head, on ``x``."""
        return self.head(self.encoder(x))
    
# NOTE(review): absolute, user-specific path — consider a configurable base directory.
ckpt_path = '/home/cemmi/Documents/aicrowd/G-Universal-CLIP/my_experiments/ViT-H-14-laion2b_s32b_b79k-happy_whale-v2-product-10k-ArcFace(k=3)-All-Epoch(10)-Reduce_LR_0.1/model_epoch_2_mAP3_0.57.pt'

# Map to CPU so the load does not depend on the device the checkpoint was
# saved from.
checkpoint = torch.load(ckpt_path, map_location='cpu')
backbone, _, _ = open_clip.create_model_and_transforms('ViT-H-14', None)
model = Model(backbone, 1024)
# strict=False: the Head defined in this cell has no layers, so any head
# weights stored in the checkpoint have nowhere to go and are skipped.
model.load_state_dict(checkpoint['model_state_dict'], strict=False)

# Keep only the encoder and convert it to half precision in place.
model_slim = model.encoder
model_slim.half()

# Same destination as before (<checkpoint>_slim.pt), now derived from the
# input path instead of repeating the long literal.
torch.save(model_slim.state_dict(), os.path.splitext(ckpt_path)[0] + '_slim.pt')

In [1]:
class Head(nn.Module):
    """Head consisting only of an ``ArcMarginProduct_subcenter`` layer.

    ``hidden_size`` is accepted but not used; the layer is always built with
    the fixed arguments ``(768, 9691, 1)``.
    NOTE(review): presumably (input features, number of classes, sub-centers) —
    confirm against ``utilities.ArcMarginProduct_subcenter``.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.arc = utilities.ArcMarginProduct_subcenter(768, 9691, 1)

    def forward(self, x):
        """Return ``(self.arc(x), F.normalize(x))``."""
        logits = self.arc(x)
        return logits, F.normalize(x)
    
class Model(nn.Module):
    """Image tower (``vit_backbone.visual``) followed by a ``Head``.

    Args:
        vit_backbone: object exposing the image tower as ``.visual``.
        head_size: forwarded to ``Head``.
    """

    def __init__(self, vit_backbone, head_size):
        super().__init__()
        self.encoder = vit_backbone.visual
        self.head = Head(head_size)

    def forward(self, x):
        """Feed ``x`` through the encoder and hand the features to the head."""
        features = self.encoder(x)
        return self.head(features)
    
# NOTE(review): the recorded run of this cell stopped with
# "NameError: name 'nn' is not defined" at the class definitions above, so
# none of the statements below were executed in that run.
# NOTE(review): the checkpoint path names a ViT-H-14 run, but the backbone
# created below is 'ViT-L-14' and the state dict is loaded strictly — confirm
# which architecture is intended before re-running.
checkpoint = torch.load(f'/home/cemmi/Documents/aicrowd/G-Universal-CLIP/my_experiments/ViT-H-14-laion2b_s32b_b79k-happy_whale-v2-product-10k-ArcFace(k=3)-All-Epoch(10)-Reduce_LR_0.1/model_epoch_2_mAP3_0.57.pt')
backbone, _, _ = open_clip.create_model_and_transforms('ViT-L-14', None)
model = Model(backbone, 1024)
model.load_state_dict(checkpoint['model_state_dict'])

# Only the encoder's weights are written out; the head is dropped.
model_slim = model.encoder


torch.save(model_slim.state_dict(), 
           f'/home/cemmi/Documents/aicrowd/G-Universal-CLIP/my_experiments/ViT-H-14-laion2b_s32b_b79k-happy_whale-v2-product-10k-ArcFace(k=3)-All-Epoch(10)-Reduce_LR_0.1/model_epoch_2_mAP3_0.57_slim.pt')

NameError: name 'nn' is not defined

In [None]:
# Load a slim state dict straight into the image tower.  map_location='cpu'
# keeps the load independent of the device the file was saved from.
checkpoint = torch.load(
    'my_experiments/ViT-H-14-laion2b_s32b_b79k-image_net-v2-product-10k/model_best_epoch_2_mAP3_0.57_slim.pt',
    map_location='cpu',
)
backbone, _, _ = open_clip.create_model_and_transforms('ViT-H-14', None)
model = backbone.visual
model.load_state_dict(checkpoint)

In [1]:
import torch
import open_clip
import torch.nn as nn
import torch.nn.functional as F
import utilities

class Head(nn.Module):
    """Head consisting only of an ``ArcMarginProduct_subcenter`` layer.

    ``hidden_size`` is accepted but not used; the layer is always built with
    the fixed arguments ``(1024, 9004, 3)``.
    NOTE(review): presumably (input features, number of classes, sub-centers) —
    confirm against ``utilities.ArcMarginProduct_subcenter``.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.arc = utilities.ArcMarginProduct_subcenter(1024, 9004, 3)

    def forward(self, x):
        """Return ``(self.arc(x), F.normalize(x))``."""
        margin_logits = self.arc(x)
        return margin_logits, F.normalize(x)
    
class Model(nn.Module):
    """Image tower (``vit_backbone.visual``) with a ``Head`` on top.

    Args:
        vit_backbone: object exposing the image tower as ``.visual``.
        head_size: forwarded to ``Head``.
    """

    def __init__(self, vit_backbone, head_size):
        super().__init__()
        self.encoder = vit_backbone.visual
        self.head = Head(head_size)

    def forward(self, x):
        """Encode ``x`` and return whatever the head produces for it."""
        encoded = self.encoder(x)
        return self.head(encoded)
    

# Map to CPU so the load does not depend on the device the checkpoint was
# saved from.
checkpoint = torch.load(
    'my_experiments/ViT-H-14-laion2b_s32b_b79k-image_net-v2-product-10k/model_epoch_2_mAP3_0.57.pt',
    map_location='cpu',
)
backbone, _, _ = open_clip.create_model_and_transforms('ViT-H-14', None)
model = Model(backbone, 1024)
model.load_state_dict(checkpoint['model_state_dict'])

# Keep only the encoder and convert it to half precision in place.
model_slim = model.encoder
model_slim.half()

# Script the encoder in eval mode and apply TorchScript's inference optimizations.
frozen_mod = torch.jit.optimize_for_inference(torch.jit.script(model_slim.eval()))

In [2]:
# Write the scripted half-precision encoder (`frozen_mod`, built in the
# previous cell) to disk.
frozen_mod.save(f'my_experiments/ViT-H-14-laion2b_s32b_b79k-image_net-v2-product-10k/model_epoch_2_mAP3_0.57_slim_f16_jit.pt')

In [22]:
# `th` is only imported in a later cell; import it here so this cell also runs
# on a fresh kernel.
import torch as th

# Two random matrices with 512 columns used by the distance experiment below.
# They go to the GPU when one is available and stay on the CPU otherwise.
_device = 'cuda' if th.cuda.is_available() else 'cpu'
a = th.rand((10000, 512)).to(_device)
b = th.rand((9000, 512)).to(_device)


In [38]:
import torch as th


# Pairwise distances between every row of `b` and every row of `a`, then, for
# each row of `b`, the indexes of its 1000 closest rows of `a` (nearest first).
c = th.cdist(b, a)
i = c.argsort(dim=1)[:, :1000]

In [41]:
# Sanity check: index of the smallest distance in the first row of `c`; the
# next cell compares it with the first entry of the argsort result.
c[0].argmin()

tensor(3824, device='cuda:0')

In [42]:
# First-ranked index for the first row; the recorded value (3824) equals the
# argmin shown in the previous cell.
i[0][0]

tensor(3824, device='cuda:0')

In [None]:
    def predict_product_ranks(self):
        """
        Rank gallery images for every query image.

        This function should return a numpy array of shape `(num_queries, 1000)`.
        For each query image the model predicts a set of 1000 unique gallery
        indexes, in order of best match first.

        Both image sets are embedded with `self.model` on half-precision input,
        pairwise distances between query and gallery embeddings are computed,
        and the gallery indexes are sorted by increasing distance.

        Outputs:
            class_ranks - A 2D numpy array where the axes correspond to:
                          axis 0 - Batch size
                          axis 1 - An ordered rank list of matched image indexes, most confident prediction first
                            - maximum length of this should be 1000
                            - predictions above this limit will be dropped
                            - duplicates will be dropped such that the lowest index entry is preserved
        """

        def _make_loader(dataset):
            # Fixed order (no shuffling) so batch number i maps to rows
            # [i * batch_size, (i + 1) * batch_size) of the embedding buffer.
            return DataLoader(
                dataset, batch_size=self.batch_size,
                shuffle=False, pin_memory=True, num_workers=self.num_workers
            )

        def _fill_embeddings(loader, buffer):
            # Embed every batch and write it into its slice of `buffer`.
            step = self.batch_size
            for batch_idx, images in tqdm(enumerate(loader),
                                          total=len(loader)):
                images = images.to(self.device)
                outputs = self.model(images.half())
                outputs = th.squeeze(outputs.data)
                first_row = batch_idx * step
                buffer[first_row:(first_row + step), :] = outputs

        gallery_dataset = SubmissionDataset(
            root=self.dataset_path, annotation_file=self.gallery_csv_path,
            transforms=self.final_transform
        )
        gallery_loader = _make_loader(gallery_dataset)

        # Unlike the gallery set, the query set is created with with_bbox=True.
        query_dataset = SubmissionDataset(
            root=self.dataset_path, annotation_file=self.queries_csv_path,
            transforms=self.final_transform, with_bbox=True
        )
        query_loader = _make_loader(query_dataset)

        print('Calculating embeddings')
        gallery_embeddings = th.zeros(
            (len(gallery_dataset), self.embedding_shape),
            dtype=th.float16, device=self.device
        )
        query_embeddings = th.zeros(
            (len(query_dataset), self.embedding_shape),
            dtype=th.float16, device=self.device
        )
        started = time.time()
        with th.no_grad():
            _fill_embeddings(gallery_loader, gallery_embeddings)
            _fill_embeddings(query_loader, query_embeddings)
        finished = time.time()
        print(f'Total Time Passed to Calculate the Embeddings {finished - started}s')

        print('Calculating distances')
        started = time.time()
        dist = th.cdist(query_embeddings, gallery_embeddings)
        # Ascending sort: the closest gallery items come first; keep at most 1000.
        ranked = th.argsort(dist, dim=1)[:, :1000]
        ind = ranked.data.cpu().numpy()

        finished = time.time()
        print(f'Total Time Passed to Calculate Distance {finished - started}s')
        return ind

In [21]:
from ultralytics import YOLO
import torch as th


# Call the high-level YOLO wrapper directly on a random float tensor.
# In the recorded run (ultralytics 8.0.48) this call failed with
# "Exception: Unsupported type encountered!"; the next cell calls the
# underlying `.model` instead.
model = YOLO("yolov8n.pt")
image = th.rand((16,3,320,640))
results = model(image)

Ultralytics YOLOv8.0.48 🚀 Python-3.8.16 torch-1.13.1+cu117 CUDA:0 (NVIDIA GeForce RTX 3090, 24252MiB)
YOLOv8n summary (fused): 168 layers, 3151904 parameters, 0 gradients, 8.7 GFLOPs



Exception: Unsupported type encountered! See docs for supported types https://docs.ultralytics.com/predict

In [10]:
from ultralytics import YOLO
import torch as th


# Take the wrapper's `.model` attribute, move it to the GPU and run it on a
# random batch of 16 images (3 x 320 x 640).  The following cells unpack
# `results` into two parts and inspect their shapes.
model = YOLO("yolov8n.pt").model.to('cuda')
image = th.rand((16,3,320,640), device='cuda')
results = model(image)

In [14]:
# `results` unpacks into two parts; the first is a single tensor (recorded
# shape: [16, 84, 4200]).
# NOTE: this rebinds `a` and `b`, which earlier cells used for random matrices.
a, b = results
a.shape

torch.Size([16, 84, 4200])

In [17]:
# The second part unpacks into three tensors; the recorded shapes all have 144
# channels, with the spatial size halving from one to the next
# (40x80, 20x40, 10x20).
b1, b2, b3 = b
b1.shape, b2.shape, b3.shape

(torch.Size([16, 144, 40, 80]),
 torch.Size([16, 144, 20, 40]),
 torch.Size([16, 144, 10, 20]))

In [19]:
# Only `YOLO` is imported from the package in the cells above, so the module
# name itself has to be imported before its version can be read.
import ultralytics

ultralytics.__version__

'8.0.48'