In [1]:
import os
# Select the GPU for this session: number devices in PCI bus order and expose
# only device index 1. This is set ahead of the torch import in the next cell.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="1",
)

In [5]:
import torch
from collections import OrderedDict
import time

from torchvision.models import resnet50, resnet34, resnet18
from mobilenetv2 import MobileNetV2

import sys
sys.path.append('../')

from flopco import FlopCo

In [6]:
# Display whether PyTorch's cuDNN backend is enabled for this session.
torch.backends.cudnn.enabled

True

In [7]:
# Reference networks (randomly initialised torchvision ResNets), built in the
# order 50 -> 18 -> 34.
mresnet50, mresnet18, mresnet34 = resnet50(), resnet18(), resnet34()

# MobileNetV2 variants to compare against. `mb_profiles` and `mb_pathes` are
# paired by position (they are zipped together when the checkpoints are loaded),
# so keep both lists the same length and in the same order.
mb_profiles = ['normal', '0.7flops']
mb_pathes = [
    'mobilenetv2_imagenet_71.814.pth.tar',
    'mobilenetv2_imagenet_0.7amc_70.854.pth.tar',
    # Checkpoints currently left out of the comparison:
    # 'mobilenet_imagenet_0.5flops_70.5.pth.tar',
    # 'mobilenet_imagenet_0.5time_70.2.pth.tar',
]

# Directory holding the checkpoint files listed in `mb_pathes`.
path_to_mbnet = "/gpfs/gpfs0/y.gusak/pretrained/amc-compressed-models"

# Input shape handed to FlopCo: (batch, channels, height, width).
img_size = (1, 3, 224, 224)

In [8]:
# Profile the three ResNets once, up front (50, then 34, then 18); their
# `total_flops` values are the numerators of the ratios printed by the loop below.
mresnet50_flops, mresnet34_flops, mresnet18_flops = [
    FlopCo(model=net, img_size=img_size)
    for net in (mresnet50, mresnet34, mresnet18)
]


# For every MobileNetV2 checkpoint (visited in reverse list order): build the
# network for its profile, load the stored weights, count FLOPs, and print how
# many times more FLOPs ResNet-50 / ResNet-34 / ResNet-18 need.
# After the loop `mbnet` stays bound to the last model built.
for mb_profile, mbpath in zip(mb_profiles[::-1], mb_pathes[::-1]):

    mbnet = MobileNetV2(profile=mb_profile)

    # NOTE(review): torch.load relies on pickle; only load trusted checkpoint files.
    checkpoint = torch.load('{}/{}'.format(path_to_mbnet, mbpath),
                            map_location='cpu')

    # Remove a leading 'module.' from each parameter name. The previous
    # `k.strip('.module')` stripped any of the characters ". m o d u l e" from
    # BOTH ends of the key, so e.g. a key ending in 'num_batches_tracked'
    # would lose its trailing 'ed', and a key whose first component starts
    # with one of those letters would be mangled.
    prefix = 'module.'
    pretrained_dict = {
        (k[len(prefix):] if k.startswith(prefix) else k): v
        for k, v in checkpoint['state_dict'].items()
    }

    # Overlay the checkpoint on the model's own state dict so entries the
    # checkpoint does not contain keep their freshly initialised values.
    model_dict = mbnet.state_dict()
    model_dict.update(pretrained_dict)
    mbnet.load_state_dict(model_dict)

    mbnet_flops = FlopCo(model=mbnet, img_size=img_size)
    print(mbpath, mresnet50_flops.total_flops/mbnet_flops.total_flops,
          mresnet34_flops.total_flops/mbnet_flops.total_flops,
          mresnet18_flops.total_flops/mbnet_flops.total_flops)

mobilenetv2_imagenet_0.7amc_70.854.pth.tar 18.65939792416775 16.650757185855568 8.253485538552349
mobilenetv2_imagenet_71.814.pth.tar 13.10637595631399 11.69550510268425 5.797254812716178


In [9]:
# GPU benchmark, batch size 1.
# Move every model to the GPU and switch it to evaluation mode: without eval()
# the timed forward passes run in training mode (BatchNorm uses batch statistics
# and updates its running stats on every call), which is not inference.
# `mbnet` is the MobileNetV2 instance left bound by the FLOPs loop above.
mresnet50.to('cuda').eval()
mresnet34.to('cuda').eval()
mresnet18.to('cuda').eval()

mbnet.to('cuda').eval()

# Random single-image input, allocated directly on the GPU.
x = torch.randn(1, 3, 224, 224, device='cuda')

In [10]:
%timeit y = mresnet50(x)

9.53 ms ± 428 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
%timeit y = mresnet34(x)

7.39 ms ± 464 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
%timeit y = mresnet18(x)

4.23 ms ± 23.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
%timeit y = mbnet(x)

8.52 ms ± 751 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [24]:
# GPU benchmark, batch size 32.
# Move every model to the GPU and switch it to evaluation mode: without eval()
# the timed forward passes run in training mode (BatchNorm uses batch statistics
# and updates its running stats on every call), which is not inference.
# `mbnet` is the MobileNetV2 instance left bound by the FLOPs loop above.
mresnet50.to('cuda').eval()
mresnet34.to('cuda').eval()
mresnet18.to('cuda').eval()

mbnet.to('cuda').eval()

# Random batch of 32 images, allocated directly on the GPU.
x = torch.randn(32, 3, 224, 224, device='cuda')

In [25]:
%timeit y = mresnet50(x)

41.1 ms ± 1.39 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [26]:
%timeit y = mresnet34(x)

19.5 ms ± 421 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
%timeit y = mresnet18(x)

11.4 ms ± 1.14 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [28]:
%timeit y = mbnet(x)

11.7 ms ± 1.53 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
# CPU benchmark, batch size 1.
# Move every model back to the CPU and switch it to evaluation mode: without
# eval() the timed forward passes run in training mode (BatchNorm uses batch
# statistics and updates its running stats on every call), which is not inference.
# `mbnet` is the MobileNetV2 instance left bound by the FLOPs loop above.
mresnet50.to('cpu').eval()
mresnet34.to('cpu').eval()
mresnet18.to('cpu').eval()

mbnet.to('cpu').eval()

# Random single-image input on the CPU.
x = torch.randn(1, 3, 224, 224)

In [15]:
%timeit y = mresnet50(x)

9.85 s ± 123 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%timeit y = mresnet34(x)

6.7 s ± 128 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
%timeit y = mresnet18(x)

3.69 s ± 151 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
%timeit y = mbnet(x)

9.48 s ± 425 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
