In [1]:
import torch
import tensorly as tl
from torchvision.models import resnet18
from tensorly.decomposition import tucker, parafac
from flopco import FlopCo
import warnings
import gc

# Run TensorLy on the PyTorch backend so the decompositions below operate on torch tensors.
tl.set_backend("pytorch")
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Shape of the stand-alone convolution used in the single-layer experiments.
in_channels = 64
out_channels = 128
kernel_size = (3, 3)
# Height and width of the random input tensor.
tensor_size = 7
# rank_CPD = 9
# rank_TKD = (256, 101, 9)
# Size of the leading (batch) dimension of the random input tensor.
number_of_images = 128

## Original conv

In [3]:
# Uncompressed baseline layer that the decomposition cells below take as input.
full_conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size, dtype=torch.float32)

In [4]:
# Random batch shaped (number_of_images, in_channels, tensor_size, tensor_size) for the timing cells.
random_tensor = torch.rand(number_of_images, in_channels, tensor_size, tensor_size, dtype=torch.float32)

In [5]:
# %%timeit -r 10 -n 10000
# full_conv(random_tensor)

## CPD-only conv

In [6]:
def SVD_conv(conv_layer: torch.nn.Conv2d, rank_CPD: int = None) -> (torch.nn.Sequential, float):
    """Replace a 1x1 convolution by two stacked 1x1 convolutions of rank ``rank_CPD``.

    The ``(out_channels, in_channels)`` weight matrix is factorized with
    ``parafac`` (random initialisation). The first new layer maps the input
    channels to ``rank_CPD`` channels and carries the stride and padding of the
    source layer; the second maps them to ``out_channels`` and carries the
    source layer's bias, if it has one.

    Args:
        conv_layer: Convolution with a ``(1, 1)`` kernel.
        rank_CPD: Number of intermediate channels. Defaults to
            ``min(out_channels, in_channels)``.

    Returns:
        ``(layers, norm)``: the replacement ``torch.nn.Sequential`` and the
        relative reconstruction error ``||W - W_hat|| / ||W||`` as returned by
        ``tl.norm`` (detached from the autograd graph).

    Raises:
        ValueError: If ``conv_layer`` does not have a ``(1, 1)`` kernel.
    """
    if tuple(conv_layer.kernel_size) != (1, 1):
        raise ValueError(
            f"SVD_conv only supports (1, 1) kernels, got {tuple(conv_layer.kernel_size)}"
        )

    out_channels = conv_layer.out_channels
    in_channels = conv_layer.in_channels
    # reshape instead of squeeze(): squeeze would also drop a channel axis of size 1.
    # detach() keeps the decomposition (and the returned norm) out of the autograd graph.
    matrix = conv_layer.weight.detach().reshape(out_channels, in_channels)
    if rank_CPD is None:
        rank_CPD = min(matrix.shape)

    weights, factors = parafac(matrix, rank_CPD, init="random")
    norm = tl.norm(matrix - tl.cp_to_tensor((weights, factors))) / tl.norm(matrix)
    print(f"SVD ({in_channels}, {out_channels}, (1, 1)): {norm}")

    # Fold the per-component weights into the output factor so the layers
    # reproduce exactly the tensor whose error is reported above.
    output_factor = factors[0] if weights is None else factors[0] * weights
    factor_CPD_input = factors[1].permute([1, 0]).unsqueeze(2).unsqueeze(3).contiguous()
    factor_CPD_output = output_factor.unsqueeze(2).unsqueeze(3).contiguous()

    # The inner layer is bias-free: a bias there would alter the function being
    # approximated. Only the last layer may carry the source layer's bias.
    conv1 = torch.nn.Conv2d(
        in_channels, rank_CPD, 1,
        stride=conv_layer.stride, padding=conv_layer.padding,
        bias=False, dtype=torch.float32,
    )
    conv2 = torch.nn.Conv2d(
        rank_CPD, out_channels, 1,
        bias=conv_layer.bias is not None, dtype=torch.float32,
    )
    conv1.weight = torch.nn.parameter.Parameter(factor_CPD_input)
    conv2.weight = torch.nn.parameter.Parameter(factor_CPD_output)
    if conv_layer.bias is not None:
        conv2.bias = torch.nn.parameter.Parameter(conv_layer.bias.detach().clone())
    return torch.nn.Sequential(conv1, conv2), norm

In [7]:
def CPD_conv(conv_layer: torch.nn.Conv2d, rank_CPD: int = None) -> (torch.nn.Sequential, float):
    """Replace a convolution by a rank-``rank_CPD`` CP-decomposed stack of three layers.

    The weight is viewed as an ``(out_channels, in_channels, kx * ky)`` tensor
    and decomposed with ``parafac``. The result is
    ``1x1 conv -> depthwise (kx, ky) conv -> 1x1 conv``; the depthwise layer
    carries the stride, padding and dilation of the source layer, and the last
    layer carries its bias, if it has one. ``(1, 1)`` kernels are delegated to
    ``SVD_conv``.

    Args:
        conv_layer: Convolution to decompose.
        rank_CPD: CP rank. Defaults to the smallest dimension of the reshaped
            weight tensor.

    Returns:
        ``(layers, norm)``: the replacement ``torch.nn.Sequential`` and the
        relative reconstruction error ``||W - W_hat|| / ||W||`` as returned by
        ``tl.norm`` (detached from the autograd graph).
    """
    if conv_layer.kernel_size == (1, 1):
        return SVD_conv(conv_layer, rank_CPD)

    # Params of source conv_layer
    out_channels = conv_layer.out_channels
    in_channels = conv_layer.in_channels
    kernel_size_x, kernel_size_y = conv_layer.kernel_size
    stride = conv_layer.stride
    padding = conv_layer.padding
    dilation = conv_layer.dilation
    # detach() keeps the decomposition (and the returned norm) out of the autograd graph.
    conv_weight = conv_layer.weight.detach().reshape(
        out_channels, in_channels, kernel_size_x * kernel_size_y
    )

    if rank_CPD is None:
        rank_CPD = min(conv_weight.shape)

    weights_CPD, factors_CPD = parafac(conv_weight, rank_CPD, init="random", svd="randomized_svd")
    norm = tl.norm(conv_weight - tl.cp_to_tensor((weights_CPD, factors_CPD))) / tl.norm(conv_weight)
    print(f"CPD ({in_channels}, {out_channels}, ({kernel_size_x}, {kernel_size_y})): {norm}")

    # Fold the per-component weights into the output factor so the layers
    # reproduce exactly the tensor whose error is reported above.
    output_factor = factors_CPD[0] if weights_CPD is None else factors_CPD[0] * weights_CPD
    factor_CPD_input = factors_CPD[1].permute([1, 0]).unsqueeze(2).unsqueeze(3).contiguous()
    # One (kx, ky) filter per CP component -> weight of a depthwise convolution.
    factor_CPD_hidden = factors_CPD[2].permute([1, 0]).reshape(rank_CPD, 1, kernel_size_x, kernel_size_y)
    factor_CPD_output = output_factor.unsqueeze(2).unsqueeze(3).contiguous()

    # Inner layers are bias-free: a bias there would alter the function being
    # approximated. Only the last layer may carry the source layer's bias.
    conv1_CPD = torch.nn.Conv2d(in_channels, rank_CPD, 1, bias=False, dtype=torch.float32)
    conv2_CPD = torch.nn.Conv2d(
        rank_CPD, rank_CPD, (kernel_size_x, kernel_size_y), groups=rank_CPD,
        stride=stride, padding=padding, dilation=dilation,
        bias=False, dtype=torch.float32,
    )
    conv3_CPD = torch.nn.Conv2d(
        rank_CPD, out_channels, 1, bias=conv_layer.bias is not None, dtype=torch.float32
    )
    conv1_CPD.weight = torch.nn.parameter.Parameter(factor_CPD_input)
    conv2_CPD.weight = torch.nn.parameter.Parameter(factor_CPD_hidden)
    conv3_CPD.weight = torch.nn.parameter.Parameter(factor_CPD_output)
    if conv_layer.bias is not None:
        conv3_CPD.bias = torch.nn.parameter.Parameter(conv_layer.bias.detach().clone())

    return torch.nn.Sequential(conv1_CPD, conv2_CPD, conv3_CPD), norm

In [8]:
# Keep only the decomposed layers ([0]); the reconstruction error is printed by CPD_conv.
conv_CPD = CPD_conv(full_conv, rank_CPD=64)[0]

CPD (64, 128, (3, 3)): 0.8220816850662231


In [9]:
# %%timeit -r 10 -n 10000
# conv_CPD(random_tensor)

In [10]:
# Drop the CPD layers and release cached memory before the next experiment.
del conv_CPD
torch.cuda.empty_cache()
# Last expression of the cell, so its return value is displayed as the cell output.
gc.collect()

74

## TKD-only conv

In [11]:
def TKD_conv(conv_layer: torch.nn.Conv2d, rank_TKD: tuple[int, int, int]=None) -> (torch.nn.Sequential, float):
    """Replace a convolution by a Tucker-decomposed stack of three layers.

    The weight is viewed as an ``(out_channels, in_channels, kx * ky)`` tensor
    and decomposed with ``tucker``. The result is
    ``1x1 conv -> (kx, ky) conv -> 1x1 conv``; the middle layer carries the
    stride, padding and dilation of the source layer, and the last layer
    carries its bias, if it has one. ``(1, 1)`` kernels are delegated to
    ``SVD_conv``.

    Args:
        conv_layer: Convolution to decompose.
        rank_TKD: Tucker ranks ``(out, in, spatial)``. Each entry is clipped
            (with a warning) to the size of its mode. Defaults to the full
            size of every mode.

    Returns:
        ``(layers, norm)``: the replacement ``torch.nn.Sequential`` and the
        relative reconstruction error ``||W - W_hat|| / ||W||`` as returned by
        ``tl.norm`` (detached from the autograd graph).
    """
    if conv_layer.kernel_size == (1, 1):
        # rank_TKD defaults to None; min(None) would raise a TypeError, so let
        # SVD_conv pick its own default rank in that case.
        return SVD_conv(conv_layer, None if rank_TKD is None else min(rank_TKD))

    # Params of source conv_layer
    out_channels = conv_layer.out_channels
    in_channels = conv_layer.in_channels
    kernel_size_x, kernel_size_y = conv_layer.kernel_size
    kernel_area = kernel_size_x * kernel_size_y
    stride = conv_layer.stride
    padding = conv_layer.padding
    dilation = conv_layer.dilation
    # detach() keeps the decomposition (and the returned norm) out of the autograd graph.
    conv_weight = conv_layer.weight.detach().reshape(out_channels, in_channels, kernel_area)

    if rank_TKD is None:
        rank_TKD = (out_channels, in_channels, kernel_area)
    else:
        if rank_TKD[0] > out_channels:
            warnings.warn("rank_TKD[0] is bigger then out_channels")
            rank_TKD = (out_channels, rank_TKD[1], rank_TKD[2])
        if rank_TKD[1] > in_channels:
            warnings.warn("rank_TKD[1] is bigger then in_channels")
            rank_TKD = (rank_TKD[0], in_channels, rank_TKD[2])
        if rank_TKD[2] > kernel_area:
            warnings.warn("rank_TKD[2] is bigger then kernel_size_x * kernel_size_y")
            rank_TKD = (rank_TKD[0], rank_TKD[1], kernel_area)

    core_TKD, factors_TKD = tucker(conv_weight, rank_TKD)
    norm = tl.norm(conv_weight - tl.tucker_to_tensor((core_TKD, factors_TKD))) / tl.norm(conv_weight)
    print(f"TKD ({in_channels}, {out_channels}, ({kernel_size_x}, {kernel_size_y})): {norm}")

    factor_TKD_input = factors_TKD[1].permute([1, 0]).unsqueeze(2).unsqueeze(3).contiguous()
    # Contract the spatial factor with the core: (kx*ky, r2) x (r0, r1, r2) -> (kx*ky, r0, r1),
    # then move the spatial axis last and unfold it into (kx, ky).
    factor_TKD_hidden = (
        torch.tensordot(factors_TKD[2], core_TKD, dims=([1], [2]))
        .permute([1, 2, 0])
        .reshape(rank_TKD[0], rank_TKD[1], kernel_size_x, kernel_size_y)
    )
    factor_TKD_output = factors_TKD[0].unsqueeze(2).unsqueeze(3).contiguous()

    # Inner layers are bias-free: a bias there would alter the function being
    # approximated. Only the last layer may carry the source layer's bias.
    conv1_TKD = torch.nn.Conv2d(in_channels, rank_TKD[1], 1, bias=False, dtype=torch.float32)
    conv2_TKD = torch.nn.Conv2d(
        rank_TKD[1], rank_TKD[0], (kernel_size_x, kernel_size_y),
        stride=stride, padding=padding, dilation=dilation,
        bias=False, dtype=torch.float32,
    )
    conv3_TKD = torch.nn.Conv2d(
        rank_TKD[0], out_channels, 1, bias=conv_layer.bias is not None, dtype=torch.float32
    )
    conv1_TKD.weight = torch.nn.parameter.Parameter(factor_TKD_input)
    conv2_TKD.weight = torch.nn.parameter.Parameter(factor_TKD_hidden)
    conv3_TKD.weight = torch.nn.parameter.Parameter(factor_TKD_output)
    if conv_layer.bias is not None:
        conv3_TKD.bias = torch.nn.parameter.Parameter(conv_layer.bias.detach().clone())

    return torch.nn.Sequential(conv1_TKD, conv2_TKD, conv3_TKD), norm

In [12]:
# Full-rank Tucker decomposition (rank_TKD left at its default); keep only the layers ([0]).
conv_TKD = TKD_conv(full_conv)[0]

TKD (64, 128, (3, 3)): 1.3146224091542535e-06


In [13]:
# %%timeit -r 10 -n 10000
# conv_TKD(random_tensor)

In [14]:
# Drop the Tucker layers and release cached memory before the next experiment.
del conv_TKD
torch.cuda.empty_cache()
# Last expression of the cell, so its return value is displayed as the cell output.
gc.collect()

0

## TKD-CPD conv

In [15]:
def TKDCPD_conv(conv_layer: torch.nn.Conv2d, rank_TKD:tuple[int, int, int] = None, rank_CPD: int = None) -> (torch.nn.Sequential, float):
    """Tucker-decompose a convolution, then CP-decompose the resulting core convolution.

    The weight is viewed as an ``(out_channels, in_channels, kx * ky)`` tensor
    and decomposed with ``tucker`` into ``1x1 conv -> (kx, ky) conv -> 1x1 conv``.
    The middle convolution is then itself replaced by ``CPD_conv``. ``(1, 1)``
    kernels are delegated to ``SVD_conv``.

    Args:
        conv_layer: Convolution to decompose.
        rank_TKD: Tucker ranks ``(out, in, spatial)``. Each entry is clipped
            (with a warning) to the size of its mode. Defaults to the full
            size of every mode.
        rank_CPD: Rank forwarded to ``CPD_conv`` for the core convolution.

    Returns:
        ``(layers, norm)``: ``layers`` is
        ``Sequential(conv1, <CPD_conv result>, conv3)``. ``norm`` is the value
        returned by ``CPD_conv`` for the core convolution; the Tucker
        reconstruction error is only printed, not returned.
    """
    if conv_layer.kernel_size == (1, 1):
        return SVD_conv(conv_layer, rank_CPD)

    # Params of source conv_layer
    out_channels = conv_layer.out_channels
    in_channels = conv_layer.in_channels
    kernel_size_x, kernel_size_y = conv_layer.kernel_size
    kernel_area = kernel_size_x * kernel_size_y
    stride = conv_layer.stride
    padding = conv_layer.padding
    dilation = conv_layer.dilation
    # detach() keeps the decomposition out of the autograd graph.
    conv_weight = conv_layer.weight.detach().reshape(out_channels, in_channels, kernel_area)

    if rank_TKD is None:
        rank_TKD = (out_channels, in_channels, kernel_area)
    else:
        # Warn BEFORE clipping so the message reports the rank that was requested.
        if rank_TKD[0] > out_channels:
            warnings.warn(f"rank_TKD[0] is bigger then out_channels\n\nrank_TKD[0]={rank_TKD[0]}\nout_channels={out_channels}")
            rank_TKD = (out_channels, rank_TKD[1], rank_TKD[2])
        if rank_TKD[1] > in_channels:
            warnings.warn(f"rank_TKD[1] is bigger then in_channels\n\nrank_TKD[1]={rank_TKD[1]}\nin_channels={in_channels}")
            rank_TKD = (rank_TKD[0], in_channels, rank_TKD[2])
        if rank_TKD[2] > kernel_area:
            warnings.warn(f"rank_TKD[2] is bigger then kernel_size_x * kernel_size_y\nrank_TKD[2]={rank_TKD[2]}\nkernel_size_x * kernel_size_y={kernel_area}")
            rank_TKD = (rank_TKD[0], rank_TKD[1], kernel_area)

    core_TKD, factors_TKD = tucker(conv_weight, rank_TKD)
    tucker_norm = tl.norm(conv_weight - tl.tucker_to_tensor((core_TKD, factors_TKD))) / tl.norm(conv_weight)
    print(f"TKDCPD ({in_channels}, {out_channels}, ({kernel_size_x}, {kernel_size_y})): {tucker_norm}")

    factor_TKD_input = factors_TKD[1].permute([1, 0]).unsqueeze(2).unsqueeze(3).contiguous()
    # Contract the spatial factor with the core: (kx*ky, r2) x (r0, r1, r2) -> (kx*ky, r0, r1),
    # then move the spatial axis last and unfold it into (kx, ky).
    factor_TKD_hidden = (
        torch.tensordot(factors_TKD[2], core_TKD, dims=([1], [2]))
        .permute([1, 2, 0])
        .reshape(rank_TKD[0], rank_TKD[1], kernel_size_x, kernel_size_y)
    )
    factor_TKD_output = factors_TKD[0].unsqueeze(2).unsqueeze(3).contiguous()

    # Temporary core convolution handed to CPD_conv. It is bias-free so that no
    # randomly initialised bias can leak into the decomposed layers.
    core_conv = torch.nn.Conv2d(
        rank_TKD[1], rank_TKD[0], (kernel_size_x, kernel_size_y),
        stride=stride, padding=padding, dilation=dilation,
        bias=False, dtype=torch.float32,
    )
    core_conv.weight = torch.nn.parameter.Parameter(factor_TKD_hidden)
    conv2_TKD, norm = CPD_conv(core_conv, rank_CPD=rank_CPD)

    # Outer 1x1 layers: the first is bias-free, the last carries the source bias (if any).
    conv1_TKD = torch.nn.Conv2d(in_channels, rank_TKD[1], 1, bias=False, dtype=torch.float32)
    conv3_TKD = torch.nn.Conv2d(
        rank_TKD[0], out_channels, 1, bias=conv_layer.bias is not None, dtype=torch.float32
    )
    conv1_TKD.weight = torch.nn.parameter.Parameter(factor_TKD_input)
    conv3_TKD.weight = torch.nn.parameter.Parameter(factor_TKD_output)
    if conv_layer.bias is not None:
        conv3_TKD.bias = torch.nn.parameter.Parameter(conv_layer.bias.detach().clone())

    return torch.nn.Sequential(conv1_TKD, conv2_TKD, conv3_TKD), norm

In [16]:
# Keep only the decomposed layers ([0]), like the CPD and TKD cells: TKDCPD_conv
# returns a (layers, norm) tuple, which cannot be called on a tensor or passed to FlopCo.
conv_TKDCPD = TKDCPD_conv(full_conv)[0]

TKDCPD (64, 128, (3, 3)): 1.3146224091542535e-06
CPD (64, 128, (3, 3)): 0.9747400283813477


In [17]:
# %%timeit -r 10 -n 10000
# conv_TKDCPD(random_tensor)

In [18]:
# Drop the TKD-CPD result and release cached memory before the next experiment.
del conv_TKDCPD
torch.cuda.empty_cache()
# Last expression of the cell, so its return value is displayed as the cell output.
gc.collect()

0

## STATS

In [19]:
# stats_conv = FlopCo(full_conv, img_size=[number_of_images, in_channels, tensor_size, tensor_size], device=device)
# stats_CPD = FlopCo(conv_CPD, img_size=[number_of_images, in_channels, tensor_size, tensor_size], device=device)
# stats_TKD = FlopCo(conv_TKD, img_size=[number_of_images, in_channels, tensor_size, tensor_size], device=device)
# stats_TKDCPD = FlopCo(conv_TKDCPD, img_size=[number_of_images, in_channels, tensor_size, tensor_size], device=device)

In [20]:
# print("Normal conv:")
# print(stats_conv)
# print("\nCPD:")
# print(stats_CPD)
# print("\nTKD:")
# print(stats_TKD)
# print("\nTKDCPD:")
# print(stats_TKDCPD)

In [21]:
# print("FLOPS")
# print(f"{stats_conv.total_flops/10e6}e6; {stats_CPD.total_flops/10e6}e6; {stats_TKD.total_flops/10e6}e6; {stats_TKDCPD.total_flops/10e6}e6")
# print("\nRelative Flops")
# print(f"{stats_conv.relative_flops}\n{stats_CPD.relative_flops}\n{stats_TKD.relative_flops}\n{stats_TKDCPD.relative_flops}")
# print("\nParameters")
# print(f"{stats_conv.total_params}; {stats_CPD.total_params}; {stats_TKD.total_params}; {stats_TKDCPD.total_params}")
# print("\nRelative parameters")
# print(f"{stats_conv.relative_params}\n{stats_CPD.relative_params}\n{stats_TKD.relative_params}\n{stats_TKDCPD.relative_params}")

## Convert RESNET

In [22]:
def compress_resnet(resnet, conv_func, rank: int = None):
    """Replace every ``torch.nn.Conv2d`` inside ``resnet`` by ``conv_func``'s output, in place.

    Args:
        resnet: Module whose convolution sub-modules are replaced. It is
            modified in place and also returned.
        conv_func: Callable invoked as ``conv_func(conv, rank_CPD=rank)`` that
            returns ``(replacement_module, norm)``.
        rank: Value forwarded to ``conv_func`` as ``rank_CPD``.

    Returns:
        ``(resnet, layer_norms)`` where ``layer_norms`` maps the qualified name
        of each replaced convolution (e.g. ``"layer1.0.conv1"``) to the norm
        returned by ``conv_func``.
    """
    # Snapshot the convolutions first: the loop below swaps modules, and the
    # module tree should not change while named_modules() is still iterating it.
    conv_layers = [
        (name, module)
        for name, module in resnet.named_modules()
        if isinstance(module, torch.nn.Conv2d)
    ]

    layer_norms = {}
    for name, module in conv_layers:
        parent_name, _, attr_name = name.rpartition(".")
        print(parent_name, end=": ")
        # Access the parent module (the model itself for top-level layers)
        parent_module = resnet.get_submodule(parent_name) if parent_name else resnet

        # The decomposition is pure post-processing of the weights; no autograd graph is needed.
        with torch.no_grad():
            new_layer, norm = conv_func(module, rank_CPD=rank)

        # Replace the old layer with the new one
        setattr(parent_module, attr_name, new_layer)
        # Key by the qualified name, so top-level layers do not get a leading ".".
        layer_norms[name] = norm

        del new_layer
        gc.collect()
        torch.cuda.empty_cache()
    return resnet, layer_norms

In [23]:
import os
import json

def save_results(conv_func_name, rank, stats, norms):
    """Write the statistics and per-layer norms of one run to ``./<conv_func_name>/<rank>/``.

    Two files are produced in that directory (created when missing):
    ``flops_params.json`` with the ``total_flops``, ``total_params``,
    ``relative_flops`` and ``relative_params`` attributes of ``stats``, and
    ``layer_norms.txt`` with one ``name: value`` line per entry of ``norms``.
    """
    out_dir = f"./{conv_func_name}/{rank}/"
    os.makedirs(out_dir, exist_ok=True)

    # FLOPS and parameter counts, read off the stats object by attribute name.
    exported_fields = ("total_flops", "total_params", "relative_flops", "relative_params")
    summary = {field: getattr(stats, field) for field in exported_fields}
    with open(f"{out_dir}flops_params.json", "w") as json_file:
        json.dump(summary, json_file, indent=4)

    # One line per layer, in the iteration order of the dictionary.
    with open(f"{out_dir}layer_norms.txt", "w") as norms_file:
        norms_file.writelines(
            f"{layer_name}: {norm_value}\n" for layer_name, norm_value in norms.items()
        )

    print(f"Results saved in {out_dir}")

# Define the range of ranks you want to iterate over
rank_range = range(41, 42)

# Pretrained reference network; the loop below never modifies it
resnet_model = resnet18(weights='DEFAULT')

# Loop over the specified ranks and compress the ResNet with each rank
for rank in rank_range:
    print(f"Compressing with rank_CPD = {rank}")

    # compress_resnet replaces layers in place, so start from a freshly loaded model every time
    model_copy = resnet18(weights='DEFAULT')

    # Compress the model, storing norms of each layer
    compressed_model, layer_norms = compress_resnet(model_copy, CPD_conv, rank)
    compressed_model = compressed_model.cpu()

    # Capture statistics with FlopCo
    stats_CPD = FlopCo(compressed_model, device="cpu")

    # Save the results
    save_results("CPD_conv", rank, stats_CPD, layer_norms)

    # Clear cache. model_copy and compressed_model are the same object, so both
    # names have to go before the memory can actually be reclaimed.
    del model_copy, compressed_model
    torch.cuda.empty_cache()
    gc.collect()


Compressing with rank_CPD = 41
: CPD (3, 64, (7, 7)): 0.09475382417440414
layer1.0: CPD (64, 64, (3, 3)): 0.48146572709083557
layer1.0: CPD (64, 64, (3, 3)): 0.5971982479095459
layer1.1: CPD (64, 64, (3, 3)): 0.5880162715911865
layer1.1: CPD (64, 64, (3, 3)): 0.6535841226577759
layer2.0: CPD (64, 128, (3, 3)): 0.710805356502533
layer2.0: CPD (128, 128, (3, 3)): 0.7135886549949646
layer2.0.downsample: SVD (64, 128, (1, 1)): 0.1941254884004593
layer2.1: CPD (128, 128, (3, 3)): 0.7751623392105103
layer2.1: CPD (128, 128, (3, 3)): 0.7718322277069092
layer3.0: CPD (128, 256, (3, 3)): 0.7454242706298828
layer3.0: CPD (256, 256, (3, 3)): 0.8372140526771545
layer3.0.downsample: SVD (128, 256, (1, 1)): 0.5445394515991211
layer3.1: CPD (256, 256, (3, 3)): 0.8586596846580505
layer3.1: CPD (256, 256, (3, 3)): 0.8351826667785645
layer4.0: CPD (256, 512, (3, 3)): 0.8577007055282593
layer4.0: CPD (512, 512, (3, 3)): 0.9186574816703796
layer4.0.downsample: SVD (256, 512, (1, 1)): 0.6602019667625427
la

## Test compressed

In [None]:
# compress_resnet needs a model instance; `resnet18` alone is the constructor function
# and has no named_modules(). Instantiate the pretrained network first.
compressed_resnet18 = compress_resnet(resnet18(weights='DEFAULT'), CPD_conv)[0]

In [None]:
# Random batch of shape (128, 3, 224, 224), placed on the selected device for the timing cells.
test_tensor = torch.rand(128, 3, 224, 224).to(device)

In [None]:
# Pretrained baseline and compressed network, both on the same device and in eval mode.
uncompressed_resnet18 = resnet18(weights='DEFAULT').to(device).eval()
compressed_resnet18 = compressed_resnet18.to(device).eval()

In [None]:
%%timeit -r 10 -n 100
with torch.cuda.amp.autocast():
    uncompressed_resnet18(test_tensor)

In [None]:
%%timeit -r 10 -n 100
with torch.cuda.amp.autocast():
    compressed_resnet18(test_tensor)

In [None]:
# Collect FlopCo statistics for both networks; the cells below read their
# total_flops, total_params and relative_params attributes.
stats_uncompressed = FlopCo(uncompressed_resnet18, device=device)
stats_compressed = FlopCo(compressed_resnet18, device=device)

In [None]:
print("FLOPS")
# Divide by 1e6 to match the printed "e6" suffix (10e6 is 1e7, which made every value 10x too small).
print(f"{stats_uncompressed.total_flops/1e6}e6; {stats_compressed.total_flops/1e6}e6")
print("\nParameters")
print(f"{stats_uncompressed.total_params}; {stats_compressed.total_params}")

In [None]:
# Entries of relative_params for the baseline, ordered by value, largest first.
dict(sorted(stats_uncompressed.relative_params.items(), key=lambda item: item[1], reverse=True))

In [None]:
# Entries of relative_params for the compressed network, ordered by value, largest first.
dict(sorted(stats_compressed.relative_params.items(), key=lambda item: item[1], reverse=True))

## Try to finetune model

In [None]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

torch.cuda.empty_cache()

# Instantiate models
# The reference (teacher) must stay in eval mode: in train mode its BatchNorm
# layers would use batch statistics of the random inputs and overwrite their
# running statistics, even under torch.no_grad().
reference_model = uncompressed_resnet18.eval()
# NOTE(review): the trainable model keeps whatever mode it was left in (eval in
# the cell above, i.e. BatchNorm statistics frozen) - confirm this is intended.
trainable_model = compressed_resnet18

# Loss function: CrossEntropyLoss
criterion = nn.CrossEntropyLoss()

# Optimizer for the trainable model
optimizer = optim.Adam(trainable_model.parameters(), lr=0.0001)

# Loss scaling to go with autocast on CUDA (guards against gradient underflow
# in reduced precision); a pass-through when running on the CPU.
scaler = torch.cuda.amp.GradScaler(enabled=device.type == "cuda")

# Training loop without a dataset
epochs = 90
batch_size = 256
input_shape = (3, 224, 224)  # Image-like input (channels, height, width)
batches = 100

for epoch in range(epochs):
    running_loss = 0.0
    # Use tqdm to track progress
    with tqdm(total=batches, desc=f'Epoch {epoch + 1}/{epochs}', unit='batch') as pbar:
        for _ in range(batches):  # Simulate `batches` batches per epoch
            # Generate random input data (like random images)
            inputs = torch.randn(batch_size, *input_shape, device=device)

            with torch.cuda.amp.autocast():
                # Reference outputs are only used as labels, so no gradients are needed
                with torch.no_grad():
                    reference_outputs = reference_model(inputs)
                    # Treat the reference outputs as logits and use the top class as the target
                    target_indices = reference_outputs.argmax(dim=1)

                # Forward pass of the trainable model
                model_outputs = trainable_model(inputs)

                # Cross-entropy between the trainable model's outputs and the
                # reference predictions, computed inside the autocast region
                loss = criterion(model_outputs, target_indices)

            # Backpropagation and optimization
            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Read the loss once per step; every .item() call forces a device synchronisation
            batch_loss = loss.item()
            running_loss += batch_loss

            # Update the progress bar
            pbar.set_postfix(loss=batch_loss)
            pbar.update(1)  # Increment the progress bar by 1

    # Print the average loss for the epoch
    avg_loss = running_loss / batches
    print(f"Epoch [{epoch + 1}/{epochs}], Average Loss: {avg_loss:.4f}")

print("Training complete!")