In [11]:
import torch
from torchvision.models import resnet18
import torch_pruning as tp

model = resnet18(pretrained=True).eval()

# 1. build dependency graph for resnet18
DG = tp.DependencyGraph().build_dependency(model, example_inputs=torch.randn(1,3,224,224))

# 2. Specify the to-be-pruned channels. Here we prune those channels indexed by [2, 6, 9].
group = DG.get_pruning_group( model.conv1, tp.prune_conv_out_channels, idxs=[2, 6, 9] )

# 3. prune all grouped layers that are coupled with model.conv1 (included).
if DG.check_pruning_group(group): # avoid full pruning, i.e., channels=0.
    group.prune()

In [9]:
model.conv1

Conv2d(3, 61, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

In [10]:
from torch import nn

for group in DG.get_all_groups(ignored_layers=[model.conv1], root_module_types=[nn.Conv2d, nn.Linear]):
    # handle groups in sequential order
    idxs = [2,4,6] # your pruning indices
    group.prune(idxs=idxs)
    print(group)


--------------------------------
          Pruning Group
--------------------------------
[0] prune_out_channels on fc (Linear(in_features=512, out_features=997, bias=True)) => prune_out_channels on fc (Linear(in_features=512, out_features=997, bias=True)), #idxs=1000
--------------------------------


--------------------------------
          Pruning Group
--------------------------------
[0] prune_out_channels on layer4.0.downsample.0 (Conv2d(256, 509, kernel_size=(1, 1), stride=(2, 2), bias=False)) => prune_out_channels on layer4.0.downsample.0 (Conv2d(256, 509, kernel_size=(1, 1), stride=(2, 2), bias=False)), #idxs=512
[1] prune_out_channels on layer4.0.downsample.0 (Conv2d(256, 509, kernel_size=(1, 1), stride=(2, 2), bias=False)) => prune_out_channels on layer4.0.downsample.1 (BatchNorm2d(509, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)), #idxs=512
[2] prune_out_channels on layer4.0.downsample.1 (BatchNorm2d(509, eps=1e-05, momentum=0.1, affine=True, track_ru

# Quantization

In [1]:
import torch
import open_clip

# Load the trained model
checkpoint = torch.load(f'my_experiments/soup_slim_p10k_h_m_image_net_happy_whale_w_03.pt')
backbone, _, _ = open_clip.create_model_and_transforms('ViT-H-14', None)
model = backbone.visual   
model.load_state_dict(checkpoint)
model.eval()

VisionTransformer(
  (conv1): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), bias=False)
  (patch_dropout): Identity()
  (ln_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  (transformer): Transformer(
    (resblocks): ModuleList(
      (0): ResidualAttentionBlock(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=1280, out_features=5120, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=5120, out_features=1280, bias=True)
        )
        (ls_2): Identity()
      )
      (1): ResidualAttentionBlock(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadA

In [3]:
model(torch.randn(1,3,224,224).cuda()).shape

torch.Size([1, 1024])

In [7]:
model.proj.shape

torch.Size([1280, 1024])

In [2]:

from torchsummary import summary
model.cuda()
summary(model, (3, 224, 224))

Layer (type:depth-idx)                             Output Shape              Param #
├─Conv2d: 1-1                                      [-1, 1280, 16, 16]        752,640
├─Identity: 1-2                                    [-1, 257, 1280]           --
├─LayerNorm: 1-3                                   [-1, 257, 1280]           2,560
├─Transformer: 1-4                                 [-1, 2, 1280]             --
|    └─ModuleList: 2                               []                        --
|    |    └─ResidualAttentionBlock: 3-1            [-1, 2, 1280]             19,677,440
|    |    └─ResidualAttentionBlock: 3-2            [-1, 2, 1280]             19,677,440
|    |    └─ResidualAttentionBlock: 3-3            [-1, 2, 1280]             19,677,440
|    |    └─ResidualAttentionBlock: 3-4            [-1, 2, 1280]             19,677,440
|    |    └─ResidualAttentionBlock: 3-5            [-1, 2, 1280]             19,677,440
|    |    └─ResidualAttentionBlock: 3-6            [-1, 2, 1280]   

Layer (type:depth-idx)                             Output Shape              Param #
├─Conv2d: 1-1                                      [-1, 1280, 16, 16]        752,640
├─Identity: 1-2                                    [-1, 257, 1280]           --
├─LayerNorm: 1-3                                   [-1, 257, 1280]           2,560
├─Transformer: 1-4                                 [-1, 2, 1280]             --
|    └─ModuleList: 2                               []                        --
|    |    └─ResidualAttentionBlock: 3-1            [-1, 2, 1280]             19,677,440
|    |    └─ResidualAttentionBlock: 3-2            [-1, 2, 1280]             19,677,440
|    |    └─ResidualAttentionBlock: 3-3            [-1, 2, 1280]             19,677,440
|    |    └─ResidualAttentionBlock: 3-4            [-1, 2, 1280]             19,677,440
|    |    └─ResidualAttentionBlock: 3-5            [-1, 2, 1280]             19,677,440
|    |    └─ResidualAttentionBlock: 3-6            [-1, 2, 1280]   

In [6]:
# get product 10k
import os
import math


import numpy as np
 
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import open_clip

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import torchvision.transforms as T

from tqdm import tqdm
import pandas as pd
import cv2
from PIL import Image

import utilities

def read_img(img_path, is_gray=False):
    mode = cv2.IMREAD_COLOR if not is_gray else cv2.IMREAD_GRAYSCALE
    img = cv2.imread(img_path, mode)
    if not is_gray:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img

class ProductDataset(Dataset):
    def __init__(self, 
                 img_dir, 
                 annotations_file, 
                 transform=None, 
                 final_transform=None, 
                 headers=None,
                 test_mode=False):
        self.data = pd.read_csv(annotations_file)
        self.img_dir = img_dir
        self.transform = transform
        self.final_transform = final_transform
        self.headers = {"img_path": "img_path", "product_id": "product_id"}
        if headers:
            self.headers = headers
        self.test_mode = test_mode
            
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.data[self.headers["img_path"]][idx])
        
        img = read_img(img_path)
        if self.test_mode:
            x, y, w, h = self.data["bbox_x"][idx], self.data["bbox_y"][idx], \
                         self.data["bbox_w"][idx], self.data["bbox_h"][idx]
            img = img[y:y+h, x:x+w]
            
        
        if self.transform is not None:
            img = transform(image=img)["image"]
        
        if self.final_transform is not None:
            if isinstance(img, np.ndarray):
                img =  Image.fromarray(img)
            img = self.final_transform(img)
            
        product_id = self.data[self.headers["product_id"]][idx]
        return img, product_id
    
def get_final_transform():  
    final_transform = T.Compose([
            T.Resize(
                size=(224, 224), 
                interpolation=T.InterpolationMode.BICUBIC,
                antialias=True),
            T.ToTensor(), 
            T.Normalize(
                mean=(0.48145466, 0.4578275, 0.40821073), 
                std=(0.26862954, 0.26130258, 0.27577711)
            )
        ])
    return final_transform

final_transform = get_final_transform()
img_dir = "../development_test_data"
dataset_test = ProductDataset(img_dir, os.path.join(img_dir, "queries.csv"), None, final_transform, test_mode=True)
dataloader_test = DataLoader(dataset_test, batch_size=32, num_workers=4)

In [7]:
device = torch.device('cpu')
# Move the model to the GPU
model.to(device)

# Specify quantization configuration
model.qconfig = torch.quantization.get_default_qconfig('fbgemm')

# Prepare the model for static quantization
torch.quantization.prepare(model, inplace=True)

# Calibrate the prepared model to determine quantization parameters
with torch.no_grad():
    for imgs, p_id in tqdm(dataloader_test):
        model(imgs)
        break

# Convert to a quantized model
quantized_model = torch.quantization.convert(model)


  0%|                                                                                                                           | 0/61 [00:51<?, ?it/s]


In [9]:
quantized_model.cuda()
imgs.cuda()
quantized_model(imgs)

NotImplementedError: Could not run 'quantized::conv2d.new' with arguments from the 'CPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'quantized::conv2d.new' is only available for these backends: [QuantizedCPU, QuantizedCUDA, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradMPS, AutogradXPU, AutogradHPU, AutogradLazy, Tracer, AutocastCPU, AutocastCUDA, FuncTorchBatched, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PythonDispatcher].

QuantizedCPU: registered at ../aten/src/ATen/native/quantized/cpu/qconv.cpp:1449 [kernel]
QuantizedCUDA: registered at ../aten/src/ATen/native/quantized/cudnn/Conv.cpp:418 [kernel]
BackendSelect: fallthrough registered at ../aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:140 [backend fallback]
FuncTorchDynamicLayerBackMode: registered at ../aten/src/ATen/functorch/DynamicLayer.cpp:488 [backend fallback]
Functionalize: registered at ../aten/src/ATen/FunctionalizeFallbackKernel.cpp:291 [backend fallback]
Named: registered at ../aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
Conjugate: registered at ../aten/src/ATen/ConjugateFallback.cpp:18 [backend fallback]
Negative: registered at ../aten/src/ATen/native/NegateFallback.cpp:18 [backend fallback]
ZeroTensor: registered at ../aten/src/ATen/ZeroTensorFallback.cpp:86 [backend fallback]
ADInplaceOrView: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:64 [backend fallback]
AutogradOther: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:35 [backend fallback]
AutogradCPU: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:39 [backend fallback]
AutogradCUDA: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:47 [backend fallback]
AutogradXLA: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:51 [backend fallback]
AutogradMPS: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:59 [backend fallback]
AutogradXPU: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:43 [backend fallback]
AutogradHPU: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:68 [backend fallback]
AutogradLazy: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:55 [backend fallback]
Tracer: registered at ../torch/csrc/autograd/TraceTypeManual.cpp:296 [backend fallback]
AutocastCPU: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:482 [backend fallback]
AutocastCUDA: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:324 [backend fallback]
FuncTorchBatched: registered at ../aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:743 [backend fallback]
FuncTorchVmapMode: fallthrough registered at ../aten/src/ATen/functorch/VmapModeRegistrations.cpp:28 [backend fallback]
Batched: registered at ../aten/src/ATen/BatchingRegistrations.cpp:1064 [backend fallback]
VmapMode: fallthrough registered at ../aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]
FuncTorchGradWrapper: registered at ../aten/src/ATen/functorch/TensorWrapper.cpp:189 [backend fallback]
PythonTLSSnapshot: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:148 [backend fallback]
FuncTorchDynamicLayerFrontMode: registered at ../aten/src/ATen/functorch/DynamicLayer.cpp:484 [backend fallback]
PythonDispatcher: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:144 [backend fallback]


# ONNX Runtime

In [2]:
import torch.onnx
import torch
import open_clip

# Load your pre-trained ViT-H model
checkpoint = torch.load(f'my_experiments/soup_slim_p10k_h_m_image_net_happy_whale_w_03.pt')
backbone, _, _ = open_clip.create_model_and_transforms('ViT-H-14', None)
model = backbone.visual   
model.load_state_dict(checkpoint)
model.eval()
model.half()

# Set the device to run on
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Create some dummy input data in the shape of your model's input
dummy_input = torch.randn(1, 3, 224, 224, device=device).half()

# Define the dynamic axes for the input and output tensors
input_names = ["input"]
output_names = ["output"]
dynamic_axes = {
    "input": {0: "batch_size"},
    "output": {0: "batch_size"}
}

# Export the model to ONNX format with dynamic axes
torch.onnx.export(
    model,
    dummy_input,
    "my_experiments/vit-h.onnx",
    input_names=input_names,
    output_names=output_names,
    dynamic_axes=dynamic_axes
)

In [1]:
import onnxruntime as rt
import numpy as np

# Load the ONNX model
sess = rt.InferenceSession("my_experiments/vit-h-quantize.onnx", providers=["CUDAExecutionProvider"])

input_data = np.random.random((10, 3, 224, 224)).astype(np.float16)

output = sess.run(None, {"input": input_data})

2023-04-13 22:52:39.919030688 [W:onnxruntime:Default, onnxruntime_pybind_state.cc:541 CreateExecutionProviderInstance] Failed to create CUDAExecutionProvider. Please reference https://onnxruntime.ai/docs/reference/execution-providers/CUDA-ExecutionProvider.html#requirements to ensure all dependencies are met.


In [4]:
import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType
quantized_model = quantize_dynamic("my_experiments/vit-h.onnx", "my_experiments/vit-h-quantize.onnx")



Ignore MatMul due to non constant B: /[/transformer/resblocks.0/attn/MatMul]
Ignore MatMul due to non constant B: /[/transformer/resblocks.0/attn/MatMul_1]
Ignore MatMul due to non constant B: /[/transformer/resblocks.0/attn/MatMul_2]
Ignore MatMul due to non constant B: /[/transformer/resblocks.0/attn/Gemm_MatMul]
Ignore MatMul due to non constant B: /[/transformer/resblocks.0/mlp/c_fc/MatMul]
Ignore MatMul due to non constant B: /[/transformer/resblocks.0/mlp/c_proj/MatMul]
Ignore MatMul due to non constant B: /[/transformer/resblocks.1/attn/MatMul]
Ignore MatMul due to non constant B: /[/transformer/resblocks.1/attn/MatMul_1]
Ignore MatMul due to non constant B: /[/transformer/resblocks.1/attn/MatMul_2]
Ignore MatMul due to non constant B: /[/transformer/resblocks.1/attn/Gemm_MatMul]
Ignore MatMul due to non constant B: /[/transformer/resblocks.1/mlp/c_fc/MatMul]
Ignore MatMul due to non constant B: /[/transformer/resblocks.1/mlp/c_proj/MatMul]
Ignore MatMul due to non constant B: /

## Prunning

In [10]:
import torch
import open_clip
import torch.nn.utils.prune as prune
import tqdm

def prune_model_v1(model):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Conv2d):
            print(f'prunning {name}')
            prune.l1_unstructured(module, name='weight', amount=0.1)
            prune.remove(module, 'weight')
        elif isinstance(module, torch.nn.Linear):
            print(f'prunning {name}')
            prune.l1_unstructured(module, name='weight', amount=0.1)
            prune.remove(module, 'weight')
            
    return model

def prune_model_v2(model):
    parameters_to_prune = ()
    for m in model.transformer.resblocks:
        parameters_to_prune = parameters_to_prune + ((m.ln_1, 'weight'),)
        parameters_to_prune = parameters_to_prune + ((m.attn.out_proj, 'weight'),)
        parameters_to_prune = parameters_to_prune + ((m.ln_2, 'weight'),)
    
    prune.global_unstructured(
        parameters_to_prune,
        pruning_method=prune.L1Unstructured,
        amount=0.1,
    )
    
    for module, _ in parameters_to_prune:
        prune.remove(module, 'weight')
        
    return model
    
checkpoint = torch.load(f'my_experiments/soup_slim_p10k_h_m_image_net_happy_whale_w_03.pt')
backbone, _, _ = open_clip.create_model_and_transforms('ViT-H-14', None)
model = backbone.visual   
model.load_state_dict(checkpoint)
model = prune_model_v1(model)
model.half()
torch.save(model.state_dict(), f'my_experiments/prunning_test.pt')

prunning conv1
prunning transformer.resblocks.0.attn.out_proj
prunning transformer.resblocks.0.mlp.c_fc
prunning transformer.resblocks.0.mlp.c_proj
prunning transformer.resblocks.1.attn.out_proj
prunning transformer.resblocks.1.mlp.c_fc
prunning transformer.resblocks.1.mlp.c_proj
prunning transformer.resblocks.2.attn.out_proj
prunning transformer.resblocks.2.mlp.c_fc
prunning transformer.resblocks.2.mlp.c_proj
prunning transformer.resblocks.3.attn.out_proj
prunning transformer.resblocks.3.mlp.c_fc
prunning transformer.resblocks.3.mlp.c_proj
prunning transformer.resblocks.4.attn.out_proj
prunning transformer.resblocks.4.mlp.c_fc
prunning transformer.resblocks.4.mlp.c_proj
prunning transformer.resblocks.5.attn.out_proj
prunning transformer.resblocks.5.mlp.c_fc
prunning transformer.resblocks.5.mlp.c_proj
prunning transformer.resblocks.6.attn.out_proj
prunning transformer.resblocks.6.mlp.c_fc
prunning transformer.resblocks.6.mlp.c_proj
prunning transformer.resblocks.7.attn.out_proj
prunnin

In [2]:
class Head(nn.Module):
    def __init__(self, hidden_size, slim):
        super(Head, self).__init__()
        self.emb = nn.Linear(hidden_size, CFG.emb_size, bias=False)
        self.slim = slim
        if not slim:
            self.arc = utilities.ArcMarginProduct_subcenter(CFG.emb_size, CFG.n_classes)

        self.dropout = utilities.Multisample_Dropout()

    def forward(self, x):
        embeddings = self.dropout(x, self.emb)
        if not self.slim:
            output = self.arc(embeddings)
            return output, F.normalize(embeddings)
        return F.normalize(embeddings)
class Model(nn.Module):
    def __init__(self, vit_backbone, head_size, version='v1', slim=False):
        super(Model, self).__init__()
        if version == 'v1': 
            self.encoder = vit_backbone.visual
        elif version == 'v2':
            self.encoder = vit_backbone.visual.trunk
            
        self.head = Head(head_size, slim)

    def forward(self, x):
        x = self.encoder(x)
        return self.head(x)


In [None]:
path_list =  [
  f'my_experiments/ViT-H-14-laion2b_s32b_b79k-cut_out-product-10k/model_best_epoch_2_mAP3_0.55_slim.pt',
  f'my_experiments/ViT-H-14-laion2b_s32b_b79k-happy_whale-product-10k/model_best_epoch_3_mAP3_0.54_slim.pt',
  f'my_experiments/ViT-H-14-laion2b_s32b_b79k-None-product-10k/model_best_epoch_2_mAP3_0.53_slim.pt',
  f'my_experiments/vit_h_224_products-10k/model_best_epoch_3_mAP3_0.53_slim.pt'
]

class CFG:
    emb_size=512
    model_name = 'ViT-H-14'
    hidden_layer = 1024
    version = 'v1'
    n_classes=9004


backbone, _, _ = open_clip.create_model_and_transforms(CFG.model_name, None)
# Load models weights
weight_list = []

for path in path_list:
    model = Model(backbone, CFG.hidden_layer, CFG.version, True)
    model.load_state_dict(torch.load(path), strict=False)
    weight_list.append(model.state_dict())

# Average weights
state_dict = dict((k, torch.stack([v[k] for v in weight_list]).mean(0)) for k in weight_list[0])
model.load_state_dict(state_dict)

torch.save(model.state_dict(), f'my_experiments/{CFG.model_name}-soup.pt')

In [None]:
def slim_model(model_path, CFG):
    name = os.path.splitext(model_path)[0]
    
    checkpoint = torch.load(model_path)
    backbone, _, _ = open_clip.create_model_and_transforms(CFG.model_name, None)

    model = Model(backbone, CFG.hidden_layer, CFG.version)
    model.load_state_dict(checkpoint['model_state_dict'])

    model_slim = Model(backbone, CFG.hidden_layer, CFG.version, True)
    model_slim.head.emb = model.head.emb
    model_slim.encoder = model.encoder

    torch.save(model_slim.state_dict(), 
               name + '_slim.pt')
    
class CFG:
    emb_size=512
    model_name = 'ViT-H-14'
    hidden_layer = 1024
    version = 'v1'
    n_classes=9691
    

slim_model(f'my_experiments/ViT-H-14-laion2b_s32b_b79k-image_net-product-10k-all/model_epoch_1_mAP5_0.55.pt', CFG())
slim_model(f'my_experiments/ViT-H-14-laion2b_s32b_b79k-image_net-product-10k-all/model_epoch_2_mAP5_0.55.pt', CFG())
slim_model(f'my_experiments/ViT-H-14-laion2b_s32b_b79k-image_net-product-10k-all/model_epoch_3_mAP5_0.55.pt', CFG())


In [2]:
backbone, t, p = open_clip.create_model_and_transforms('ViT-H-14', None)
t, p

(Compose(
     RandomResizedCrop(size=(224, 224), scale=(0.9, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic), antialias=None)
     <function _convert_to_rgb at 0x7fb1993af9d0>
     ToTensor()
     Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
 ),
 Compose(
     Resize(size=224, interpolation=bicubic, max_size=None, antialias=None)
     CenterCrop(size=(224, 224))
     <function _convert_to_rgb at 0x7fb1993af9d0>
     ToTensor()
     Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
 ))

In [None]:

checkpoint = torch.load(f'my_experiments/ViT-H-14-laion2b_s32b_b79k-image_net-v2-product-10k/model_epoch_2_mAP3_0.57.pt')
backbone, _, _ = open_clip.create_model_and_transforms('ViT-H-14', None)

model = Model(backbone, 1024, 'v2')
model.load_state_dict(checkpoint['model_state_dict'])

model_slim = Model(backbone, 1024, 'v2', True)
model_slim.head.emb = model.head.emb
model_slim.encoder = model.encoder


torch.save(model_slim.state_dict(), 
           f'my_experiments/ViT-L-14-laion2b_s32b_b82k-image_net-v2-product-10k-ArcFace(k=3)-All/model_best_epoch_4_mAP3_0.55_slim.pt')

In [2]:
import utilities
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import open_clip
import os

class Head(nn.Module):
    def __init__(self, hidden_size):
        super(Head, self).__init__()
        self.arc = None

    def forward(self, x):
        embeddings = x
        output = self.arc(embeddings)
        return output, F.normalize(embeddings)
    
class Model(nn.Module):
    def __init__(self, vit_backbone, head_size):
        super(Model, self).__init__()
        self.encoder = vit_backbone.visual   
        self.head = Head(head_size)

    def forward(self, x):
        x = self.encoder(x)
        return self.head(x)
    
checkpoint = torch.load(f'/home/cemmi/Documents/aicrowd/G-Universal-CLIP/my_experiments/ViT-H-14-laion2b_s32b_b79k-happy_whale-v2-p10k-h&m-amazon-2-Arcface(k=3)-All-Epoch(10)-Reduce_LR_0.1/model_epoch_2_mAP3_0.55.pt')
backbone, _, _ = open_clip.create_model_and_transforms('ViT-H-14', None)
model = Model(backbone, 1024)
model.load_state_dict(checkpoint['model_state_dict'], strict=False)

model_slim = model.encoder
model_slim.half()

torch.save(model_slim.state_dict(), 
           f'/home/cemmi/Documents/aicrowd/G-Universal-CLIP/my_experiments/ViT-H-14-laion2b_s32b_b79k-happy_whale-v2-p10k-h&m-amazon-2-Arcface(k=3)-All-Epoch(10)-Reduce_LR_0.1/model_epoch_2_mAP3_0.55_slim.pt')

In [1]:
class Head(nn.Module):
    def __init__(self, hidden_size):
        super(Head, self).__init__()
        self.arc = utilities.ArcMarginProduct_subcenter(768, 9691, 1)

    def forward(self, x):
        embeddings = x
        output = self.arc(embeddings)
        return output, F.normalize(embeddings)
    
class Model(nn.Module):
    def __init__(self, vit_backbone, head_size):
        super(Model, self).__init__()
        self.encoder = vit_backbone.visual   
        self.head = Head(head_size)

    def forward(self, x):
        x = self.encoder(x)
        return self.head(x)
    
checkpoint = torch.load(f'/home/cemmi/Documents/aicrowd/G-Universal-CLIP/my_experiments/ViT-H-14-laion2b_s32b_b79k-happy_whale-v2-product-10k-ArcFace(k=3)-All-Epoch(10)-Reduce_LR_0.1/model_epoch_2_mAP3_0.57.pt')
backbone, _, _ = open_clip.create_model_and_transforms('ViT-L-14', None)
model = Model(backbone, 1024)
model.load_state_dict(checkpoint['model_state_dict'])

model_slim = model.encoder


torch.save(model_slim.state_dict(), 
           f'/home/cemmi/Documents/aicrowd/G-Universal-CLIP/my_experiments/ViT-H-14-laion2b_s32b_b79k-happy_whale-v2-product-10k-ArcFace(k=3)-All-Epoch(10)-Reduce_LR_0.1/model_epoch_2_mAP3_0.57_slim.pt')

NameError: name 'nn' is not defined

In [None]:
checkpoint = torch.load(f'my_experiments/ViT-H-14-laion2b_s32b_b79k-image_net-v2-product-10k/model_best_epoch_2_mAP3_0.57_slim.pt')
backbone, _, _ = open_clip.create_model_and_transforms('ViT-H-14', None)
model = backbone.visual
model.load_state_dict(checkpoint)

In [1]:
import torch
import open_clip
import torch.nn as nn
import torch.nn.functional as F
import utilities

class Head(nn.Module):
    def __init__(self, hidden_size):
        super(Head, self).__init__()
        self.arc = utilities.ArcMarginProduct_subcenter(1024, 9004, 3)

    def forward(self, x):
        embeddings = x
        output = self.arc(embeddings)
        return output, F.normalize(embeddings)
    
class Model(nn.Module):
    def __init__(self, vit_backbone, head_size):
        super(Model, self).__init__()
        self.encoder = vit_backbone.visual   
        self.head = Head(head_size)

    def forward(self, x):
        x = self.encoder(x)
        return self.head(x)
    

checkpoint = torch.load(f'my_experiments/ViT-H-14-laion2b_s32b_b79k-image_net-v2-product-10k/model_epoch_2_mAP3_0.57.pt')
backbone, _, _ = open_clip.create_model_and_transforms('ViT-H-14', None)
model = Model(backbone, 1024)
model.load_state_dict(checkpoint['model_state_dict'])

model_slim = model.encoder
model_slim.half()

frozen_mod = torch.jit.optimize_for_inference(torch.jit.script(model_slim.eval()))

In [2]:
frozen_mod.save(f'my_experiments/ViT-H-14-laion2b_s32b_b79k-image_net-v2-product-10k/model_epoch_2_mAP3_0.57_slim_f16_jit.pt')

In [22]:
a = th.rand((10000, 512)).cuda()
b = th.rand((9000, 512)).cuda()


In [38]:
import torch as th


c = th.cdist(b, a)
i = th.argsort(c, dim=1)[:, :1000]

In [41]:
c[0].argmin()

tensor(3824, device='cuda:0')

In [42]:
i[0][0]

tensor(3824, device='cuda:0')

In [None]:
    def predict_product_ranks(self):
        """
        This function should return a numpy array of shape `(num_queries, 1000)`. 
        For ach query image your model will need to predict 
        a set of 1000 unique gallery indexes, in order of best match first.

        Outputs:
            class_ranks - A 2D numpy array where the axes correspond to:
                          axis 0 - Batch size
                          axis 1 - An ordered rank list of matched image indexes, most confident prediction first
                            - maximum length of this should be 1000
                            - predictions above this limit will be dropped
                            - duplicates will be dropped such that the lowest index entry is preserved
        """

        gallery_dataset = SubmissionDataset(
            root=self.dataset_path, annotation_file=self.gallery_csv_path,
            transforms=self.final_transform
        )

        gallery_loader = DataLoader(
            gallery_dataset, batch_size=self.batch_size,
            shuffle=False, pin_memory=True, num_workers=self.num_workers
        )

        query_dataset = SubmissionDataset(
            root=self.dataset_path, annotation_file=self.queries_csv_path,
            transforms=self.final_transform, with_bbox=True
        )

        query_loader = DataLoader(
            query_dataset, batch_size=self.batch_size,
            shuffle=False, pin_memory=True, num_workers=self.num_workers
        )

        print('Calculating embeddings')
        gallery_embeddings = th.zeros((len(gallery_dataset), self.embedding_shape), dtype=th.float16, device=self.device)
        query_embeddings = th.zeros((len(query_dataset), self.embedding_shape), dtype=th.float16, device=self.device)
        s = time.time()
        with th.no_grad():
            for i, images in tqdm(enumerate(gallery_loader),
                                total=len(gallery_loader)):
                images = images.to(self.device)
                outputs = self.model(images.half())
                outputs = th.squeeze(outputs.data)
                gallery_embeddings[
                    i*self.batch_size:(i*self.batch_size + self.batch_size), :
                ] = outputs
            
            for i, images in tqdm(enumerate(query_loader),
                                total=len(query_loader)):
                images = images.to(self.device)
                outputs = self.model(images.half())
                outputs = th.squeeze(outputs.data)
                query_embeddings[
                    i*self.batch_size:(i*self.batch_size + self.batch_size), :
                ] = outputs
        e = time.time()
        print(f'Total Time Passed to Calculate the Embeddings {e - s}s')

        print('Calculating distances')
        s = time.time()        
        dist = th.cdist(query_embeddings, gallery_embeddings)
        ind = th.argsort(dist, dim=1)[:, :1000]
        ind = ind.data.cpu().numpy()
        
        e = time.time()
        print(f'Total Time Passed to Calculate Distance {e - s}s')
        return ind

In [21]:
from ultralytics import YOLO
import torch as th


model = YOLO("yolov8n.pt")
image = th.rand((16,3,320,640))
results = model(image)

Ultralytics YOLOv8.0.48 🚀 Python-3.8.16 torch-1.13.1+cu117 CUDA:0 (NVIDIA GeForce RTX 3090, 24252MiB)
YOLOv8n summary (fused): 168 layers, 3151904 parameters, 0 gradients, 8.7 GFLOPs



Exception: Unsupported type encountered! See docs for supported types https://docs.ultralytics.com/predict

In [10]:
from ultralytics import YOLO
import torch as th


model = YOLO("yolov8n.pt").model.to('cuda')
image = th.rand((16,3,320,640), device='cuda')
results = model(image)

In [14]:
a, b = results
a.shape

torch.Size([16, 84, 4200])

In [17]:
b1, b2, b3 = b
b1.shape, b2.shape, b3.shape

(torch.Size([16, 144, 40, 80]),
 torch.Size([16, 144, 20, 40]),
 torch.Size([16, 144, 10, 20]))

In [19]:
ultralytics.__version__

'8.0.48'