# Experimenting with low-rank model compression

In [1]:
import os
import torch
import torchvision
from torchvision.transforms import v2
from torchvision import transforms

## Neuron clustering using cosine similarity

In [11]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np

def compute_centroids(weights, assignment):
    # we are going to mean the neurons into the first index in the weights occuring in the assingment
    first_indices = []
    for i in range(int(assignment.max()) + 1):
        indices = (assignment == i).nonzero()

        first_index = indices[0]
        
        try:
            first_indices.append(first_index.item())
            weights[first_index, :] = weights[indices].mean(0)
        except:
            first_indices.append(first_index[0].item())
            weights[first_index[0], :] = weights[indices[0]].mean(0)
    first_indices.sort()

    return weights[first_indices]


def reduce_neurons(weight, bias=None, clusters=None, threshold=0.1):
    # function that does the neuron clustering - returns new weights and biases of reduced neurons layer
    if bias is None:
        bias = torch.zeros((weight.shape[0]))

    
    weight = torch.concat((weight, bias.unsqueeze(-1)), 1)

    normed = torch.nn.functional.normalize(weight)

    D = (1.0 - (normed @ normed.T)).relu()

    C = AgglomerativeClustering(clusters, affinity='precomputed', linkage='complete', compute_full_tree=True, distance_threshold=threshold)
    assignment = C.fit_predict(D)

    centroids = compute_centroids(weight, assignment)

    bias, centroids = centroids[:, -1].squeeze(), centroids[:, :-1]

    return centroids, bias, assignment


def reduce_columns(weight, assignment):
    # function that compensates for neurons that were clustered in the previous layer by aggregating the input features
    # we are going to sum the columns into the first index in the weights occuring in the assignment
    first_indices = []
    for i in range(int(assignment.max())+1):
        indices = (assignment == i).nonzero()

        first_index = indices[0]

        try:
            first_indices.append(first_index.item())
            weight[:, first_index] = weight[:, indices].sum(1)
        except:
            first_indices.append(first_index[0].item())
            weight[:, first_index[0]] = weight[:, indices[0]].sum(1)

    first_indices.sort()

    return weight[:, first_indices]


### boring evaluation code...

In [13]:
from tqdm import tqdm
import time
from copy import deepcopy as copy

def train(model, optimizer, loader):
    model.train()
    loss = torch.nn.CrossEntropyLoss()

    for i, (X, y) in tqdm(enumerate(loader)):
        out = model(X.to(0))
        optimizer.zero_grad()
        l = loss(out, y.to(0))
        l.backward()
        optimizer.step()
        

def accuracy(output, target, topk=(1,)):
    output = output.to(torch.device('cpu'))
    target = target.to(torch.device('cpu'))
    maxk = max(topk)
    batch_size = target.shape[0]

    _, idx = output.sort(dim=1, descending=True)
    pred = idx.narrow(1, 0, maxk).t()
    correct = pred.eq(target.reshape(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(dim=0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


def epoch_accuracy(loader_s, student):
    student.eval()

    out_epoch_s = [accuracy(student(L.to(0)), y)[0].detach().cpu().item() for L, y in loader_s]

    student.train()

    return sum(out_epoch_s) / len(out_epoch_s)

def test(network, test_loader):
    network.eval().to(0)
    test_loss = 0
    correct = 0
    test_losses=[]
    with torch.no_grad():
        for data, target in test_loader:
            output = network(data.to(0))
            test_loss += torch.nn.CrossEntropyLoss()(output, target.to(0)).item()
            pred = output.data.max(1, keepdim=True)[1].cpu()
            correct += pred.eq(target.data.view_as(pred)).sum()
        test_loss /= len(test_loader.dataset)
        test_losses.append(test_loss)
        print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


def latency(f, x, trials = 100):
    total = 0.0
    for trial in range(trials):
        start = time.perf_counter()
        f(x)
        total += time.perf_counter() - start
    return total / trials


### boring training code with data and network definition...

In [5]:
batch_size_train = 128

train_loader = torch.utils.data.DataLoader(
  torchvision.datasets.CIFAR100('./cifar100/', train=True, download=True,
                             transform=torchvision.transforms.Compose([
                               torchvision.transforms.ToTensor(),
                               torchvision.transforms.Normalize(
                                 (0.1307,), (0.3081,))
                             ])),
  batch_size=batch_size_train, shuffle=True)

test_loader = torch.utils.data.DataLoader(
  torchvision.datasets.CIFAR100('./cifar100/', train=False, download=True,
                             transform=torchvision.transforms.Compose([
                               torchvision.transforms.ToTensor(),
                             ])),
  batch_size=1024, shuffle=True)

class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = torch.nn.Linear(3072, 2048)
        self.fc2 = torch.nn.Linear(2048, 1024)
        self.fc3 = torch.nn.Linear(1024, 512)
        self.fc4 = torch.nn.Linear(512, 100)

    def forward(self, x):
        x = x
        x = torch.nn.Flatten()(x)
        return torch.nn.functional.relu(self.fc4(torch.nn.functional.relu(self.fc3(torch.nn.functional.relu(self.fc2(torch.nn.functional.relu(self.fc1(x))))))))

Files already downloaded and verified
Files already downloaded and verified


## SVD low-rank compression layer

In [6]:
import numpy as np

class LowRankLinear(torch.nn.Module):
    # takes in a linear layer and decomposes it into two low-rank linear layers
    def __init__(self, fc, rank):
        super(LowRankLinear, self).__init__()

        self.fc1 = torch.nn.Linear(fc.weight.shape[1], rank, bias = False)
        self.fc2 = torch.nn.Linear(rank, fc.weight.shape[0])
        
        weight1 = fc.weight

        self.fc2.bias = fc.bias

        W1 = weight1.cpu().detach().clone().numpy()

        U1, E1, V1 = np.linalg.svd(W1, False)

        rd1 = np.zeros((len(E1), len(E1)))

        for i, v in enumerate(E1):
            rd1[i, i] = v


        if fc.weight.shape[1] > fc.weight.shape[0]:
            # if the input dom of the fc is bigger than the output dim
            self.fc1.weight = torch.nn.parameter.Parameter(torch.tensor(rd1[:rank, :rank] @ V1[:rank, :]).to(fc.weight.device).type(fc.weight.dtype))
            self.fc2.weight = torch.nn.parameter.Parameter(torch.tensor(U1[:, :rank]).to(fc.weight.device).type(fc.weight.dtype))
        else:
            self.fc1.weight = torch.nn.parameter.Parameter(torch.tensor(V1[:rank, :]).to(fc.weight.device).type(fc.weight.dtype))
            self.fc2.weight = torch.nn.parameter.Parameter(torch.tensor(U1[:, :rank] @ rd1[:rank, :rank]).to(fc.weight.device).type(fc.weight.dtype))


    def forward(self, x):
        return self.fc2(self.fc1(x))

### boring model training...

In [7]:
import gc

network = Net().to(0)
optimizer = torch.optim.SGD(network.parameters(), lr=1e-2,
                      momentum=0.5)

for epoch in range(5):
    train(network.to(0), optimizer, train_loader)
    gc.collect()
    test(network, test_loader)

391it [00:17, 22.19it/s]



Test set: Avg. loss: 0.0045, Accuracy: 478/10000 (5%)



391it [00:11, 33.35it/s]



Test set: Avg. loss: 0.0045, Accuracy: 765/10000 (8%)



391it [00:11, 33.14it/s]



Test set: Avg. loss: 0.0044, Accuracy: 1011/10000 (10%)



391it [00:11, 33.55it/s]



Test set: Avg. loss: 0.0043, Accuracy: 1082/10000 (11%)



391it [00:11, 33.45it/s]



Test set: Avg. loss: 0.0043, Accuracy: 1122/10000 (11%)



## Evaluating the compressed models' latency (cpu) and accuracy
latency doesn't change so much on the gpu as it is not FLOPS-bound with such a small model

In [None]:
network.cpu()
compressed_net = copy(network)

def lowranklatency(module, rank):
    module = copy(module)
    for name, mod in module.named_modules():
        if isinstance(mod, torch.nn.Linear):
            module.add_module(name, LowRankLinear(mod, rank))

    print(latency(module.eval(), torch.ones(64, 3, 32, 32)))
    test(module, test_loader)

def clusteredneuronlatency(module, threshold):
    module = copy(module)
    for (name, mod), (name_next, mod_next) in zip(module.named_modules(), list(iter(module.named_modules()))[1:]):
        if isinstance(mod, torch.nn.Linear):
            weights, bias, assignment = reduce_neurons(mod.weight.detach().clone(), mod.bias.detach().clone(), threshold=threshold)
            cols = reduce_columns(mod_next.weight.detach().clone(), assignment)

            mod = torch.nn.Linear(weights.shape[1], weights.shape[0])
            mod.weight = torch.nn.Parameter(weights.detach().clone())
            mod.bias = torch.nn.Parameter(bias.detach().clone())

            mod_next = torch.nn.Linear(cols.shape[1], cols.shape[0])
            mod_next.weight = torch.nn.Parameter(cols.detach().clone())
            mod_next.bias = mod_next.bias

            module.add_module(name, mod)
            module.add_module(name_next, mod_next)

    print(latency(module.eval(), torch.ones(64, 3, 32, 32)))
    test(module, test_loader)


def clusteredlowrank(module, rank, threshold):
    module = copy(module)

    for (name, mod), (name_next, mod_next) in zip(module.named_modules(), list(iter(module.named_modules()))[1:]):
        if isinstance(mod, torch.nn.Linear):
            weights, bias, assignment = reduce_neurons(mod.weight.detach().clone(), mod.bias.detach().clone(), threshold=threshold)
            cols = reduce_columns(mod_next.weight.detach().clone(), assignment)

            mod = torch.nn.Linear(weights.shape[1], weights.shape[0])
            mod.weight = torch.nn.Parameter(weights.detach().clone())
            mod.bias = torch.nn.Parameter(bias.detach().clone())

            mod_next = torch.nn.Linear(cols.shape[1], cols.shape[0])
            mod_next.weight = torch.nn.Parameter(cols.detach().clone())
            mod_next.bias = mod_next.bias

            module.add_module(name, mod)
            module.add_module(name_next, mod_next)

    for name, mod in module.named_modules():
        if isinstance(mod, torch.nn.Linear):
            module.add_module(name, LowRankLinear(mod, rank))

    print(latency(module.eval(), torch.ones(64, 3, 32, 32)))
    test(module, test_loader)

for p in range(2, 8):
    lowranklatency(compressed_net, 2**p)

for d in range(0, 20, 1):
    clusteredneuronlatency(compressed_net, d / 20)

clusteredlowrank(compressed_net, 2, 1.0)

0.0005990189999931772

Test set: Avg. loss: 0.0046, Accuracy: 319/10000 (3%)

0.0006293620000008104

Test set: Avg. loss: 0.0045, Accuracy: 594/10000 (6%)

0.000699129000005314

Test set: Avg. loss: 0.0045, Accuracy: 631/10000 (6%)

0.0007874779999883686

Test set: Avg. loss: 0.0045, Accuracy: 694/10000 (7%)

0.0008414289999984703

Test set: Avg. loss: 0.0045, Accuracy: 815/10000 (8%)

0.001249190999996017

Test set: Avg. loss: 0.0044, Accuracy: 1028/10000 (10%)

0.005720809999996845

Test set: Avg. loss: 0.0043, Accuracy: 1016/10000 (10%)

0.005700563999998849

Test set: Avg. loss: 0.0043, Accuracy: 1033/10000 (10%)

0.005686595999995916

Test set: Avg. loss: 0.0043, Accuracy: 1040/10000 (10%)

0.005887784000002512

Test set: Avg. loss: 0.0043, Accuracy: 1037/10000 (10%)

0.005946083000003455

Test set: Avg. loss: 0.0043, Accuracy: 1049/10000 (10%)

0.005767096000006404

Test set: Avg. loss: 0.0043, Accuracy: 1046/10000 (10%)

0.006664970000003905

Test set: Avg. loss: 0.0043, Accurac