In [1]:
from copy import deepcopy

import torch
import tensorly as tl
from torchvision.models import resnet18
from tensorly.decomposition import tucker, parafac
from flopco import FlopCo
import warnings
import gc

tl.set_backend("pytorch")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
in_channels = 64
out_channels = 128
kernel_size = (3, 3)
tensor_size = 7
# rank_CPD = 9
# rank_TKD = (256, 101, 9)
number_of_images = 128

## Original conv

In [3]:
full_conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size, dtype=torch.float32)

In [4]:
random_tensor = torch.rand(number_of_images, in_channels, tensor_size, tensor_size, dtype=torch.float32)

In [5]:
# %%timeit -r 10 -n 10000
# full_conv(random_tensor)

## CPD-only conv

In [6]:
def SVD_conv(conv_layer: torch.nn.Conv2d, rank_CPD: int = None) -> (torch.nn.Sequential, float):
    out_channels = conv_layer.out_channels
    in_channels = conv_layer.in_channels
    stride = conv_layer.stride
    matrix = conv_layer.weight.squeeze().squeeze()
    if rank_CPD is None:
        rank_CPD = min(matrix.shape)
        
    core, factors = parafac(matrix, rank_CPD, init="random")
    norm = tl.norm(matrix - tl.cp_to_tensor((core, factors))) / tl.norm(matrix)
    print(f"SVD ({in_channels}, {out_channels}, (1, 1)): {norm}")

    factor_CPD_input = factors[1].permute([1, 0]).unsqueeze(2).unsqueeze(3)
    factor_CPD_output = factors[0].unsqueeze(2).unsqueeze(3)
    

    conv1 = torch.nn.Conv2d(in_channels, rank_CPD, 1, stride=stride, dtype=torch.float32)
    conv2 = torch.nn.Conv2d(rank_CPD, out_channels, 1, dtype=torch.float32)
    conv1.weight = torch.nn.parameter.Parameter(factor_CPD_input)
    conv2.weight = torch.nn.parameter.Parameter(factor_CPD_output)
    return torch.nn.Sequential(conv1, conv2), norm

In [7]:
def CPD_conv(conv_layer: torch.nn.Conv2d, rank_CPD: int = None) -> (torch.nn.Sequential, float):
    if conv_layer.kernel_size == (1, 1):
        return SVD_conv(conv_layer, rank_CPD)
    # Params of source conv_layer
    out_channels = conv_layer.out_channels
    in_channels = conv_layer.in_channels
    kernel_size_x = conv_layer.kernel_size[0]
    kernel_size_y = conv_layer.kernel_size[1]
    stride = conv_layer.stride
    padding = conv_layer.padding
    dilation = conv_layer.dilation
    conv_weight = conv_layer.weight.reshape(out_channels, in_channels, kernel_size_x * kernel_size_y)
    
    if rank_CPD is None:
        rank_CPD = sorted(conv_weight.size())[0]
    # elif rank_CPD > sorted(conv_weight.size())[0]:
    #     rank_CPD = sorted(conv_weight.size())[0]
    #     warnings.warn(
    #         f"""
    #         rank_CPD > min(f{conv_weight.size()})
    #         rank_CPD is bigger than the smallest size of tensor dimension
    #         rank_CPD is set to min(f{conv_weight.size()})
    #         """
    #     )

    core_CPD, factors_CPD = parafac(conv_weight, rank_CPD, init="random", svd="randomized_svd")
    norm = tl.norm(conv_weight - tl.cp_to_tensor((core_CPD, factors_CPD))) / tl.norm(conv_weight)
    print(f"CPD ({in_channels}, {out_channels}, ({kernel_size_x}, {kernel_size_y})): {norm}")

    factor_CPD_input = factors_CPD[1].permute([1, 0]).unsqueeze(2).unsqueeze(3)
    factor_CPD_hidden = factors_CPD[2].permute([1, 0]).unsqueeze(1).reshape(rank_CPD, 1, kernel_size_x, kernel_size_y)
    factor_CPD_output = factors_CPD[0].unsqueeze(2).unsqueeze(3)

    conv1_CPD = torch.nn.Conv2d(in_channels, rank_CPD, 1, dtype=torch.float32)
    conv2_CPD = torch.nn.Conv2d(rank_CPD, rank_CPD, (kernel_size_x, kernel_size_y), groups=rank_CPD, stride=stride, padding=padding, dilation=dilation, dtype=torch.float32)
    conv3_CPD = torch.nn.Conv2d(rank_CPD, out_channels, 1, dtype=torch.float32)
    conv1_CPD.weight = torch.nn.parameter.Parameter(factor_CPD_input)
    conv2_CPD.weight = torch.nn.parameter.Parameter(factor_CPD_hidden)
    conv3_CPD.weight = torch.nn.parameter.Parameter(factor_CPD_output)
    
    return torch.nn.Sequential(conv1_CPD, conv2_CPD, conv3_CPD), norm

In [8]:
conv_CPD = CPD_conv(full_conv, rank_CPD=64)[0]

CPD (64, 128, (3, 3)): 0.820536732673645


In [9]:
# %%timeit -r 10 -n 10000
# conv_CPD(random_tensor)

In [10]:
del conv_CPD
torch.cuda.empty_cache()
gc.collect()

64

## TKD-only conv

In [11]:
def TKD_conv(conv_layer: torch.nn.Conv2d, rank_TKD: tuple[int, int, int]=None) -> (torch.nn.Sequential, float):
    if conv_layer.kernel_size == (1, 1):
        return SVD_conv(conv_layer, min(rank_TKD))
    # Params of source conv_layer
    out_channels = conv_layer.out_channels
    in_channels = conv_layer.in_channels
    kernel_size_x = conv_layer.kernel_size[0]
    kernel_size_y = conv_layer.kernel_size[1]
    stride = conv_layer.stride
    padding = conv_layer.padding
    dilation = conv_layer.dilation
    conv_weight = conv_layer.weight.reshape(out_channels, in_channels, kernel_size_x * kernel_size_y)
    
    if rank_TKD is None:
        rank_TKD = (out_channels, in_channels, kernel_size_x * kernel_size_y)
    else:
        if rank_TKD[0] > out_channels:
            rank_TKD = (out_channels, rank_TKD[1], rank_TKD[2])
            warnings.warn("rank_TKD[0] is bigger then out_channels")
        if rank_TKD[1] > in_channels:
            rank_TKD = (rank_TKD[0], in_channels, rank_TKD[2])
            warnings.warn("rank_TKD[1] is bigger then in_channels")
        if rank_TKD[2] > kernel_size_x * kernel_size_y:
            rank_TKD = (rank_TKD[0], rank_TKD[1], kernel_size_x * kernel_size_y)
            warnings.warn("rank_TKD[2] is bigger then kernel_size_x * kernel_size_y")
    
    core_TKD, factors_TKD = tucker(conv_weight, rank_TKD)
    norm = tl.norm(conv_weight - tl.tucker_to_tensor((core_TKD, factors_TKD))) / tl.norm(conv_weight)
    print(f"TKD ({in_channels}, {out_channels}, ({kernel_size_x}, {kernel_size_y})): {norm}")

    factor_TKD_input = factors_TKD[1].permute([1, 0]).unsqueeze(2).unsqueeze(3)
    factor_TKD_hidden = torch.tensordot(factors_TKD[2], core_TKD, dims=([1], [2])).permute([1, 2, 0]).reshape(rank_TKD[0], rank_TKD[1], kernel_size_x, kernel_size_y)
    factor_TKD_output = factors_TKD[0].unsqueeze(2).unsqueeze(3)

    conv1_TKD = torch.nn.Conv2d(in_channels, rank_TKD[1], 1, dtype=torch.float32)
    conv2_TKD = torch.nn.Conv2d(rank_TKD[1], rank_TKD[0], (kernel_size_x, kernel_size_y), stride=stride, padding=padding, dilation=dilation, dtype=torch.float32)
    conv3_TKD = torch.nn.Conv2d(rank_TKD[0], out_channels, 1, dtype=torch.float32)
    conv1_TKD.weight = torch.nn.parameter.Parameter(factor_TKD_input)
    conv2_TKD.weight = torch.nn.parameter.Parameter(factor_TKD_hidden)
    conv3_TKD.weight = torch.nn.parameter.Parameter(factor_TKD_output)
    
    return torch.nn.Sequential(conv1_TKD, conv2_TKD, conv3_TKD), norm

In [12]:
conv_TKD = TKD_conv(full_conv)[0]

TKD (64, 128, (3, 3)): 1.2182272257632576e-06


In [13]:
# %%timeit -r 10 -n 10000
# conv_TKD(random_tensor)

In [14]:
del conv_TKD
torch.cuda.empty_cache()
gc.collect()

0

## TKD-CPD conv

In [15]:
def TKDCPD_conv(conv_layer: torch.nn.Conv2d, rank_TKD:tuple[int, int, int] = None, rank_CPD: int = None) -> (torch.nn.Sequential, float):
    if conv_layer.kernel_size == (1, 1):
        return SVD_conv(conv_layer, rank_CPD)
    # Params of source conv_layer
    out_channels = conv_layer.out_channels
    in_channels = conv_layer.in_channels
    kernel_size_x = conv_layer.kernel_size[0]
    kernel_size_y = conv_layer.kernel_size[1]
    stride = conv_layer.stride
    padding = conv_layer.padding
    dilation = conv_layer.dilation
    conv_weight = conv_layer.weight.reshape(out_channels, in_channels, kernel_size_x * kernel_size_y)

    if rank_TKD is None:
        rank_TKD = (out_channels, in_channels, kernel_size_x * kernel_size_y)
    else:
        if rank_TKD[0] > out_channels:
            rank_TKD = (out_channels, rank_TKD[1], rank_TKD[2])
            warnings.warn(f"rank_TKD[0] is bigger then out_channels\n\nrank_TKD[0]={rank_TKD[0]}\nout_channels={out_channels}")
        if rank_TKD[1] > in_channels:
            rank_TKD = (rank_TKD[0], in_channels, rank_TKD[2])
            warnings.warn(f"rank_TKD[1] is bigger then in_channels\n\nrank_TKD[1]={rank_TKD[1]}\nin_channels={in_channels}")
        if rank_TKD[2] > kernel_size_x * kernel_size_y:
            rank_TKD = (rank_TKD[0], rank_TKD[1], kernel_size_x * kernel_size_y)
            warnings.warn(f"rank_TKD[2] is bigger then kernel_size_x * kernel_size_y\nrank_TKD[2]={rank_TKD[2]}\nkernel_size_x * kernel_size_y={kernel_size_x * kernel_size_y}")
    
    core_TKD, factors_TKD = tucker(conv_weight, rank_TKD)
    norm = tl.norm(conv_weight - tl.tucker_to_tensor((core_TKD, factors_TKD))) / tl.norm(conv_weight)
    print(f"TKDCPD ({in_channels}, {out_channels}, ({kernel_size_x}, {kernel_size_y})): {norm}")

    factor_TKD_input = factors_TKD[1].permute([1, 0]).unsqueeze(2).unsqueeze(3)
    factor_TKD_hidden = torch.tensordot(factors_TKD[2], core_TKD, dims=([1], [2])).permute([1, 2, 0]).reshape(rank_TKD[0], rank_TKD[1], kernel_size_x, kernel_size_y)
    factor_TKD_output = factors_TKD[0].unsqueeze(2).unsqueeze(3)

    conv2_TKD = torch.nn.Conv2d(rank_TKD[1], rank_TKD[0], (kernel_size_x, kernel_size_y), stride=stride, padding=padding, dilation=dilation, dtype=torch.float32)
    conv2_TKD.weight = torch.nn.parameter.Parameter(factor_TKD_hidden)
    conv2_TKD = CPD_conv(conv2_TKD, rank_CPD=rank_CPD)
    norm = conv2_TKD[1]
    conv2_TKD = conv2_TKD[0]

    conv1_TKD = torch.nn.Conv2d(in_channels, rank_TKD[1], 1, dtype=torch.float32)
    conv3_TKD = torch.nn.Conv2d(rank_TKD[0], out_channels, 1, dtype=torch.float32)
    conv1_TKD.weight = torch.nn.parameter.Parameter(factor_TKD_input)
    conv3_TKD.weight = torch.nn.parameter.Parameter(factor_TKD_output)

    return torch.nn.Sequential(conv1_TKD, conv2_TKD, conv3_TKD), norm

In [16]:
conv_TKDCPD = TKDCPD_conv(full_conv)

TKDCPD (64, 128, (3, 3)): 1.2182272257632576e-06
CPD (64, 128, (3, 3)): 0.9747201800346375


In [17]:
# %%timeit -r 10 -n 10000
# conv_TKDCPD(random_tensor)

In [18]:
del conv_TKDCPD
torch.cuda.empty_cache()
gc.collect()

0

## STATS

In [19]:
# stats_conv = FlopCo(full_conv, img_size=[number_of_images, in_channels, tensor_size, tensor_size], device=device)
# stats_CPD = FlopCo(conv_CPD, img_size=[number_of_images, in_channels, tensor_size, tensor_size], device=device)
# stats_TKD = FlopCo(conv_TKD, img_size=[number_of_images, in_channels, tensor_size, tensor_size], device=device)
# stats_TKDCPD = FlopCo(conv_TKDCPD, img_size=[number_of_images, in_channels, tensor_size, tensor_size], device=device)

In [20]:
# print("Normal conv:")
# print(stats_conv)
# print("\nCPD:")
# print(stats_CPD)
# print("\nTKD:")
# print(stats_TKD)
# print("\nTKDCPD:")
# print(stats_TKDCPD)

In [21]:
# print("FLOPS")
# print(f"{stats_conv.total_flops/10e6}e6; {stats_CPD.total_flops/10e6}e6; {stats_TKD.total_flops/10e6}e6; {stats_TKDCPD.total_flops/10e6}e6")
# print("\nRelative Flops")
# print(f"{stats_conv.relative_flops}\n{stats_CPD.relative_flops}\n{stats_TKD.relative_flops}\n{stats_TKDCPD.relative_flops}")
# print("\nParameters")
# print(f"{stats_conv.total_params}; {stats_CPD.total_params}; {stats_TKD.total_params}; {stats_TKDCPD.total_params}")
# print("\nRelative parameters")
# print(f"{stats_conv.relative_params}\n{stats_CPD.relative_params}\n{stats_TKD.relative_params}\n{stats_TKDCPD.relative_params}")

## Convert RESNET

In [22]:
def replace_conv_layers(module, conv_func):
    for name, child in module.named_children():
        if isinstance(child, torch.nn.Conv2d):
            weight = child.weight.size()
            TKD, _ = conv_func(child, [max(weight[0], 1), max(weight[1], 1), weight[2] * weight[3]])
            setattr(module, name, TKD)
        else:
            replace_conv_layers(child, conv_func)

In [23]:
import os
import json

def save_results(conv_func_name, rank, stats, norms):
    # Define the directory structure
    dir_path = f"./{conv_func_name}/{rank}/"

    # Create the directory if it doesn't exist
    os.makedirs(dir_path, exist_ok=True)

    # Save FLOPS and parameter counts
    stats_data = {
        "total_flops": stats.total_flops,
        "total_params": stats.total_params,
        "relative_flops": stats.relative_flops,
        "relative_params": stats.relative_params
    }
    with open(f"{dir_path}flops_params.json", "w") as f:
        json.dump(stats_data, f, indent=4)

    # Save norms of each layer
    with open(f"{dir_path}layer_norms.txt", "w") as f:
        for layer_name, norm_value in norms.items():
            f.write(f"{layer_name}: {norm_value}\n")
    print(f"Results saved in {dir_path}")

# Define the range of ranks you want to iterate over
rank_range = range(41, 42)

# Initialize your ResNet model
resnet_model = resnet18(weights='DEFAULT')

# Loop over the specified ranks and compress the ResNet with each rank
for rank in rank_range:
    print(f"Compressing with rank_CPD = {rank}")

    # Clone the model to avoid overwriting the original
    model_copy = resnet18(weights='DEFAULT')
    
    # Compress the model, storing norms of each layer
    replace_conv_layers(model_copy, TKD_conv)

    # Capture statistics with FlopCo
    stats_CPD = FlopCo(model_copy, device="cpu")

    # Save the results
    # save_results("CPD_conv", rank, stats_CPD, 0)

    # Clear cache
    del model_copy
    torch.cuda.empty_cache()
    gc.collect()


Compressing with rank_CPD = 41
TKD (3, 64, (7, 7)): 9.743340569912107e-07
TKD (64, 64, (3, 3)): 1.1181640502400114e-06
TKD (64, 64, (3, 3)): 1.1904247685379232e-06
TKD (64, 64, (3, 3)): 1.0448459306644509e-06
TKD (64, 64, (3, 3)): 1.1266018873357098e-06
TKD (64, 128, (3, 3)): 1.2282486068215803e-06
TKD (128, 128, (3, 3)): 1.2825962585338857e-06
SVD (64, 128, (1, 1)): 0.9541822075843811
TKD (128, 128, (3, 3)): 1.3969146266390453e-06
TKD (128, 128, (3, 3)): 1.3236360700830119e-06
TKD (128, 256, (3, 3)): 1.5402448525492218e-06
TKD (256, 256, (3, 3)): 1.656385620663059e-06
SVD (128, 256, (1, 1)): 0.9790937304496765
TKD (256, 256, (3, 3)): 1.7054817362804897e-06
TKD (256, 256, (3, 3)): 1.6958357491603238e-06
TKD (256, 512, (3, 3)): 1.793696469576389e-06
TKD (512, 512, (3, 3)): 1.972041218323284e-06
SVD (256, 512, (1, 1)): 0.9850214719772339
TKD (512, 512, (3, 3)): 1.971262690858566e-06
TKD (512, 512, (3, 3)): 1.927569201143342e-06


## Test compressed

In [24]:
compressed_resnet18 = resnet18(weights='DEFAULT')
replace_conv_layers(compressed_resnet18, TKDCPD_conv)

TKDCPD (3, 64, (7, 7)): 9.743340569912107e-07
CPD (3, 64, (7, 7)): 0.8474209904670715
TKDCPD (64, 64, (3, 3)): 1.1181640502400114e-06
CPD (64, 64, (3, 3)): 0.7477900385856628
TKDCPD (64, 64, (3, 3)): 1.1904247685379232e-06
CPD (64, 64, (3, 3)): 0.8478550910949707
TKDCPD (64, 64, (3, 3)): 1.0448459306644509e-06
CPD (64, 64, (3, 3)): 0.8203519582748413
TKDCPD (64, 64, (3, 3)): 1.1266018873357098e-06
CPD (64, 64, (3, 3)): 0.8956947922706604
TKDCPD (64, 128, (3, 3)): 1.2282486068215803e-06
CPD (64, 128, (3, 3)): 0.9195433855056763
TKDCPD (128, 128, (3, 3)): 1.2825962585338857e-06
CPD (128, 128, (3, 3)): 0.894120454788208
SVD (64, 128, (1, 1)): 0.0019077861215919256
TKDCPD (128, 128, (3, 3)): 1.3969146266390453e-06
CPD (128, 128, (3, 3)): 0.932137131690979
TKDCPD (128, 128, (3, 3)): 1.3236360700830119e-06
CPD (128, 128, (3, 3)): 0.9149534702301025
TKDCPD (128, 256, (3, 3)): 1.5402448525492218e-06
CPD (128, 256, (3, 3)): 0.9266303181648254
TKDCPD (256, 256, (3, 3)): 1.656385620663059e-06
CPD

In [25]:
test_tensor = torch.rand(128, 3, 224, 224).to(device)

In [26]:
uncompressed_resnet18 = resnet18(weights='DEFAULT').to(device).eval()
compressed_resnet18 = compressed_resnet18.to(device).eval()

In [27]:
# %%timeit -r 10 -n 100
# with torch.amp.autocast('cuda'):
#     uncompressed_resnet18(test_tensor)

In [28]:
# %%timeit -r 10 -n 100
# with torch.amp.autocast('cuda'):
#     compressed_resnet18(test_tensor)

In [29]:
stats_uncompressed = FlopCo(uncompressed_resnet18, device=device)
stats_compressed = FlopCo(compressed_resnet18, device=device)

In [30]:
print("FLOPS")
print(f"{stats_uncompressed.total_flops/10e6}e6; {stats_compressed.total_flops/10e6}e6")
print("\nParameters")
print(f"{stats_uncompressed.total_params}; {stats_compressed.total_params}")

FLOPS
362.81472e6; 106.1259415e6

Parameters
11679912; 3381958


In [31]:
compressed_resnet18

ResNet(
  (conv1): Sequential(
    (0): Conv2d(3, 3, kernel_size=(1, 1), stride=(1, 1))
    (1): Sequential(
      (0): Conv2d(3, 3, kernel_size=(1, 1), stride=(1, 1))
      (1): Conv2d(3, 3, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), groups=3)
      (2): Conv2d(3, 64, kernel_size=(1, 1), stride=(1, 1))
    )
    (2): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
  )
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (0): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
        (1): Sequential(
          (0): Conv2d(64, 9, kernel_size=(1, 1), stride=(1, 1))
          (1): Conv2d(9, 9, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=9)
          (2): Conv2d(9, 64, kernel_size=(1, 1), stride=(1, 1))
        )
        (2): Conv2d(64, 6

In [32]:
dict(sorted(stats_uncompressed.relative_params.items(), key=lambda item: item[1], reverse=True))

{'layer4.0.conv2': 0.20199604243593616,
 'layer4.1.conv1': 0.20199604243593616,
 'layer4.1.conv2': 0.20199604243593616,
 'layer4.0.conv1': 0.10099802121796808,
 'layer3.0.conv2': 0.05049901060898404,
 'layer3.1.conv1': 0.05049901060898404,
 'layer3.1.conv2': 0.05049901060898404,
 'fc': 0.04392156379260392,
 'layer3.0.conv1': 0.02524950530449202,
 'layer2.0.conv2': 0.01262475265224601,
 'layer2.1.conv1': 0.01262475265224601,
 'layer2.1.conv2': 0.01262475265224601,
 'layer4.0.downsample.0': 0.011222002357552009,
 'layer2.0.conv1': 0.006312376326123005,
 'layer1.0.conv1': 0.0031561881630615025,
 'layer1.0.conv2': 0.0031561881630615025,
 'layer1.1.conv1': 0.0031561881630615025,
 'layer1.1.conv2': 0.0031561881630615025,
 'layer3.0.downsample.0': 0.002805500589388002,
 'conv1': 0.0008054855207813209,
 'layer2.0.downsample.0': 0.0007013751473470006}

In [33]:
dict(sorted(stats_compressed.relative_params.items(), key=lambda item: item[1], reverse=True))

{'fc': 0.1516872770152675,
 'layer4.0.conv1.2': 0.07766388583181695,
 'layer4.0.conv2.0': 0.07766388583181695,
 'layer4.0.conv2.2': 0.07766388583181695,
 'layer4.1.conv1.0': 0.07766388583181695,
 'layer4.1.conv1.2': 0.07766388583181695,
 'layer4.1.conv2.0': 0.07766388583181695,
 'layer4.1.conv2.2': 0.07766388583181695,
 'layer4.0.downsample.0.1': 0.03890763871106619,
 'layer3.0.conv1.2': 0.019453819355533097,
 'layer3.0.conv2.0': 0.019453819355533097,
 'layer3.0.conv2.2': 0.019453819355533097,
 'layer3.1.conv1.0': 0.019453819355533097,
 'layer3.1.conv1.2': 0.019453819355533097,
 'layer3.1.conv2.0': 0.019453819355533097,
 'layer3.1.conv2.2': 0.019453819355533097,
 'layer4.0.conv1.0': 0.019453819355533097,
 'layer4.0.downsample.0.0': 0.019453819355533097,
 'layer3.0.downsample.0.1': 0.009764757575345406,
 'layer2.0.conv1.2': 0.004882378787672703,
 'layer2.0.conv2.0': 0.004882378787672703,
 'layer2.0.conv2.2': 0.004882378787672703,
 'layer2.1.conv1.0': 0.004882378787672703,
 'layer2.1.con

# Evaluate

In [34]:
test = torch.randn([128, 3, 224, 224]).to(device)

In [35]:
a = torch.argmax(uncompressed_resnet18(test).cpu().detach(), dim=1)

In [36]:
b = torch.argmax(compressed_resnet18(test).cpu().detach(), dim=1)

In [37]:
torch.sum(a == b)

tensor(0)

## Try to finetune model

In [38]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

torch.cuda.empty_cache()

# Instantiate models
reference_model = uncompressed_resnet18.eval()
trainable_model = compressed_resnet18.train()

# Loss function: CrossEntropyLoss
criterion = nn.CrossEntropyLoss()

# Optimizer for the trainable model
optimizer = optim.Adam(trainable_model.parameters(), lr=0.0001)

# Training loop without a dataset
epochs = 1
batch_size = 256
input_shape = (3, 224, 224)  # Image-like input (channels, height, width)
batches = 100

for epoch in range(epochs):
    running_loss = 0.0
    # Use tqdm to track progress
    with tqdm(total=batches, desc=f'Epoch {epoch + 1}/{epochs}', unit='batch') as pbar:
        for _ in range(batches):  # Simulate 100 batches per epoch
            # Generate random input data (like random images)
            inputs = torch.randn(batch_size, *input_shape, device=device)

            # Get the reference model's output (detach to avoid backprop on reference model)
            with torch.amp.autocast('cuda'):
                with torch.no_grad():
                    reference_outputs = reference_model(inputs)

                # Forward pass of the trainable model
                model_outputs = trainable_model(inputs)

            # Compute the loss between the two model outputs
            # We need to find the class with maximum probability in the reference outputs for CrossEntropy
            # Assuming reference outputs are logits (not probabilities)
            _, target_indices = torch.max(reference_outputs, dim=1)  # Get class indices for the targets

            # Compute the Cross-Entropy loss
            loss = criterion(model_outputs, target_indices)

            # Backpropagation and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Update the progress bar
            pbar.set_postfix(loss=loss.item())
            pbar.update(1)  # Increment the progress bar by 1

    # Print the average loss for the epoch
    avg_loss = running_loss / batches
    print(f"Epoch [{epoch + 1}/{epochs}], Average Loss: {avg_loss:.4f}")

print("Training complete!")

Epoch 1/1: 100%|██████████| 100/100 [00:37<00:00,  2.67batch/s, loss=0.0553]

Epoch [1/1], Average Loss: 1.2854
Training complete!





# Evaluate

In [39]:
test = torch.randn([128, 3, 224, 224]).to(device)

In [40]:
a = torch.argmax(reference_model(test).cpu().detach(), dim=1)

In [41]:
b = torch.argmax(trainable_model(test).cpu().detach(), dim=1)

In [42]:
torch.sum(a.eq(b))

tensor(128)

# Test with model_compressor

In [1]:
from src.model_compressor import model_compressor
import torch
from torchvision.models import resnet18
from copy import deepcopy
import gc
from torch import nn, optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
random_tensor = torch.randn([64, 3, 224, 224])

In [2]:
resnet_original = resnet18(weights='DEFAULT')

In [3]:
resnet_compressed = deepcopy(resnet_original)

In [4]:
model_compressor.compress_model(resnet_compressed, conv_compression_method="TKDCPD", rank_cpd=2, finetune=True, epochs=1, batch_size=64, number_of_iterations=100, data_size=[3, 224, 224], finetune_device=device, loss_function=nn.CrossEntropyLoss, optimizer=optim.Adam, lr=0.0001, task="classification")

differential_evolution step 1: f(x)= 0.10055490507503358
differential_evolution step 2: f(x)= 0.10055490507503358
differential_evolution step 3: f(x)= 0.10055490507503358
differential_evolution step 4: f(x)= 0.10055490507503358
Polishing solution with 'L-BFGS-B'
2.2516214330007642
[np.int64(33), np.int64(3), np.int64(49)]
differential_evolution step 1: f(x)= 0.2150981566052378
differential_evolution step 2: f(x)= 0.2112575950891102
differential_evolution step 3: f(x)= 0.2112575950891102
differential_evolution step 4: f(x)= 0.2112575950891102
Polishing solution with 'L-BFGS-B'
2.3581607340001938
[np.int64(52), np.int64(40), np.int64(9)]
differential_evolution step 1: f(x)= 0.34126967271641434
differential_evolution step 2: f(x)= 0.32210522404514313
differential_evolution step 3: f(x)= 0.32210522404514313
differential_evolution step 4: f(x)= 0.32210522404514313
Polishing solution with 'L-BFGS-B'
2.4207167599997774
[np.int64(46), np.int64(45), np.int64(9)]
differential_evolution step 1: f

In [9]:
resnet_original.cpu()
resnet_compressed.cpu()
resnet_original.eval()
resnet_compressed.eval()

ResNet(
  (conv1): Conv2d(
    3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
    (conv1): Sequential(
      (0): Conv2d(3, 3, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (1): Sequential(
        (0): Conv2d(3, 2, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): Conv2d(2, 2, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), groups=2, bias=False)
        (2): Conv2d(2, 33, kernel_size=(1, 1), stride=(1, 1), bias=False)
      )
      (2): Conv2d(33, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    )
  )
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(
        64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
        (conv1): Sequential(
          (0): Conv2d(64, 40, kernel_size=(1, 1), stride=(1, 1)

In [6]:
result_original = resnet_original(random_tensor)
gc.collect()
torch.cuda.empty_cache()
result_compressed = resnet_compressed(random_tensor)

In [7]:
torch.mean(torch.abs(result_original - result_compressed))

tensor(1.9369, grad_fn=<MeanBackward0>)

In [8]:
torch.argmax(result_original, dim=1).isclose(torch.argmax(result_compressed, dim=1))

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True])