In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from torch.autograd import Variable

In [2]:
# Load the MNIST training and test splits from ./data; every image is converted to a tensor.
# Only the training split is asked to download the files.
to_tensor = transforms.ToTensor()
train = dsets.MNIST(
    root='./data',
    train=True,
    transform=to_tensor,
    download=True,
)
test = dsets.MNIST(
    root='./data',
    train=False,
    transform=to_tensor,
)

In [3]:
# Training schedule: target number of SGD iterations and the mini-batch size.
n = 5000
nsamples = 90

# Number of whole epochs that gives roughly `n` iterations at `nsamples` images per batch.
iters_per_epoch = len(train) / nsamples
epochs = int(n / iters_per_epoch)

# Batch iterators: the test split is read in order, the training split is shuffled.
val_test = torch.utils.data.DataLoader(
    dataset=test,
    batch_size=nsamples,
    shuffle=False,
)
val_train = torch.utils.data.DataLoader(
    dataset=train,
    batch_size=nsamples,
    shuffle=True,
)

**A default model class in which the weight matrix connecting the first and second hidden layers is not compressed**

In [4]:
#constructing model class 1
#in which the weight matrix relating hidden layers are not compressed

class MLPerceptronNNModel(nn.Module):
    """Baseline multilayer perceptron with two hidden layers of equal width.

    The layer stack is Linear -> ReLU -> Linear -> ReLU -> Linear and none of
    the weight matrices is compressed.

    Args:
        input: number of input features.
        hidden: width of both hidden layers.
        output: number of output units.
    """

    def __init__(self, input, hidden, output):
        super().__init__()
        # The linear layers are created in forward order, so they occupy
        # positions 0, 2 and 4 of the sequential container.
        first = nn.Linear(input, hidden)
        second = nn.Linear(hidden, hidden)
        last = nn.Linear(hidden, output)
        self.model = nn.Sequential(first, nn.ReLU(), second, nn.ReLU(), last)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the final linear layer's output."""
        stack = self.model
        return stack(x)

**A model class in which the weight matrix connecting the first and second hidden layers is compressed by low-rank factorization using a truncated SVD**

In [5]:



class CompressedMLPerceptronNNModel(nn.Module):
    """MLP whose hidden-to-hidden weight matrix starts as a truncated-SVD approximation.

    A ``hidden x hidden`` matrix ``fc2_weight`` is Xavier-initialised and its
    best rank-``rank`` approximation ``U_r diag(S_r) V_r^T`` is stored as the
    trainable parameter ``fc2_compressed``.

    Notes:
        * ``fc2_compressed`` is stored as a dense ``hidden x hidden`` matrix,
          so the rank constraint is imposed only at construction time.
        * ``fc2_weight`` stays registered as a parameter but ``forward`` never
          reads it.
        * The second layer computes ``out @ fc2_compressed`` and has no bias.

    Args:
        input: number of input features.
        hidden: width of both hidden layers.
        output: number of output units.
        rank: number of singular triplets kept in the approximation.
    """

    def __init__(self, input, hidden, output, rank):
        super(CompressedMLPerceptronNNModel, self).__init__()
        self.rank = rank

        # Layer 1: input -> hidden, followed by ReLU.
        self.fc1 = nn.Linear(input, hidden)
        self.relu1 = nn.ReLU()

        # Full (uncompressed) hidden -> hidden weight matrix.
        self.fc2_weight = nn.Parameter(torch.empty(hidden, hidden))
        nn.init.xavier_uniform_(self.fc2_weight)

        # Truncated SVD of the full matrix. torch.svd returns (U, S, V) with
        # W = U diag(S) V^T, i.e. it returns V itself and NOT V^T, so the
        # leading right singular vectors are the first `rank` COLUMNS of V and
        # must be transposed when the product is formed.
        with torch.no_grad():
            u, s, v = torch.svd(self.fc2_weight)
            low_rank = (u[:, :rank] * s[:rank]) @ v[:, :rank].t()

        # Trainable rank-`rank` approximation of fc2_weight.
        self.fc2_compressed = nn.Parameter(low_rank)

        # Non-linearity after the compressed layer.
        self.relu2 = nn.ReLU()

        # Output layer: hidden -> output.
        self.fc3 = nn.Linear(hidden, output)

    def forward(self, x):
        """Return the output-layer activations for a batch ``x`` of shape (batch, input)."""
        # Layer 1 (linear + ReLU).
        out = self.relu1(self.fc1(x))

        # Layer 2: multiply by the low-rank matrix (no bias), then ReLU.
        out = torch.matmul(out, self.fc2_compressed)
        out = self.relu2(out)

        # Output layer.
        out = self.fc3(out)
        return out


**A model class in which the weight matrix connecting the first and second hidden layers is compressed using the randomized SVD introduced in "Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions" by Halko et al.**

In [6]:

import numpy as np
from numpy import random, linalg

#RANDOMIZED SVD FOR COMPRESSING A NN HIDDEN-LAYER WEIGHT MATRIX
#inputs: matrix K, desired rank of the approximation, oversampling parameter for the Gaussian sampling, no: of power iterations
#return: if K ~ U*S*Vt, return U, S, Vt
def randsvd(K, rank, param_oversampling=None, n_power_iter=None,  return_range=False):
    """Approximate the leading ``rank`` singular triplets of ``K`` with a randomized SVD.

    An orthonormal basis ``Q`` for an approximate range of ``K`` is obtained
    from ``find_range``; the small matrix ``B = Q^T K`` is then decomposed
    exactly and its left singular vectors are mapped back through ``Q``.

    Args:
        K: 2-D NumPy array of shape (m, n).
        rank: number of singular triplets to return; must be >= 1.
        param_oversampling: extra random samples drawn on top of ``rank``.
            ``None`` (the default) keeps the historical choice of
            ``2 * rank`` samples in total.
        n_power_iter: number of power iterations forwarded to ``find_range``;
            ``None`` or 0 disables them.
        return_range: when true, ``Q`` is returned as a fourth value.

    Returns:
        ``(U, S, Vt)`` with ``K ~ U @ diag(S) @ Vt``, where ``U`` has ``k``
        columns, ``S`` has ``k`` entries and ``Vt`` has ``k`` rows, with
        ``k <= rank`` (smaller only when fewer samples than ``rank`` are
        available). ``(U, S, Vt, Q)`` when ``return_range`` is true.

    Raises:
        ValueError: if ``rank < 1`` or ``param_oversampling`` is negative.
    """
    m, n = K.shape
    if rank < 1:
        raise ValueError("rank must be a positive integer, got {}".format(rank))

    if param_oversampling is None:
        n_samples = 2 * rank
    elif param_oversampling < 0:
        raise ValueError(
            "param_oversampling must be non-negative, got {}".format(param_oversampling)
        )
    else:
        n_samples = rank + param_oversampling

    # More samples than min(m, n) cannot enlarge the captured subspace.
    n_samples = min(n_samples, m, n)

    # Part 1: randomized range finder.
    Q = find_range(K, n_samples, n_power_iter)

    # Part 2: exact SVD of the projected matrix. The reduced form avoids
    # building an n x n Vt and keeps the shapes of U, S and Vt consistent.
    B = np.matmul(Q.T, K)
    U_tilde, S, Vt = np.linalg.svd(B, full_matrices=False)
    U = np.matmul(Q, U_tilde)

    # Truncate to the requested rank.
    U, S, Vt = U[:, :rank], S[:rank], Vt[:rank, :]

    if return_range:
        return U, S, Vt, Q
    return U, S, Vt

#RANDOMIZED RANGE FINDER FOR RSVD: COMPUTES AN ORTHONORMAL MATRIX APPROXIMATING THE RANGE OF K
#inputs: matrix K, no: of random Gaussian samples, no: of power iterations
def find_range(K, n_samples, n_iters=None):
    """Return a matrix with orthonormal columns approximating the range of ``K``.

    ``K`` is multiplied by a standard-normal random matrix with ``n_samples``
    columns. The product is orthonormalised directly, or refined with
    ``n_iters`` rounds of subspace iteration when ``n_iters`` is truthy.
    """
    _, n_cols = K.shape
    sketch = np.matmul(K, np.random.randn(n_cols, n_samples))

    if not n_iters:
        return OB(sketch)
    return sub_iter(K, sketch, n_iters)

#RANDOMIZED SUBSPACE ITERATION: REFINES AN APPROXIMATE RANGE OF A USING POWER ITERATIONS
#input: A (m*n), an initial approximation Y0 to the range of A, no: of iterations
#return: approximate range of A from power iterations
def sub_iter(A, Y0, n_iters):
    """Refine the basis obtained from ``Y0`` with ``n_iters`` rounds of subspace iteration.

    Each round multiplies the current basis by ``A.T`` and then by ``A``,
    re-orthonormalising after both products.
    """
    basis = OB(Y0)
    A_t = A.T
    for _ in range(n_iters):
        co_basis = OB(np.matmul(A_t, basis))
        basis = OB(np.matmul(A, co_basis))
    return basis


#FUNCTION TO COMPUTE AN ORTHONORMAL BASIS FOR A MATRIX
#input: a matrix K (m*n)
#output: orthonormal basis for the column space of K
def OB(K):
    """Return the Q factor of the QR decomposition of ``K``; the R factor is discarded."""
    return np.linalg.qr(K)[0]



class CompressedMLPerceptronNNModel2(nn.Module):
    """MLP whose hidden-to-hidden weight matrix starts as a randomized-SVD approximation.

    A ``hidden x hidden`` matrix ``fc2_weight`` is Xavier-initialised and a
    rank-``rank`` approximation computed by ``randsvd`` is stored as the
    trainable parameter ``fc2_compressed``.

    Notes:
        * ``fc2_compressed`` is stored as a dense ``hidden x hidden`` matrix,
          so the rank constraint is imposed only at construction time.
        * ``fc2_weight`` stays registered as a parameter but ``forward`` never
          reads it.
        * The second layer computes ``out @ fc2_compressed`` and has no bias.

    Args:
        input: number of input features.
        hidden: width of both hidden layers.
        output: number of output units.
        rank: target rank passed to ``randsvd``.
    """

    def __init__(self, input, hidden, output, rank):
        super(CompressedMLPerceptronNNModel2, self).__init__()
        self.rank = rank

        # Layer 1: input -> hidden, followed by ReLU.
        self.fc1 = nn.Linear(input, hidden)
        self.relu1 = nn.ReLU()

        # Layer 2: full hidden -> hidden matrix and its low-rank approximation.
        self.fc2_weight = nn.Parameter(torch.Tensor(hidden, hidden))
        nn.init.xavier_uniform_(self.fc2_weight)
        self.fc2_compressed = self.compute_compressed_weight(self.fc2_weight, rank)

        # Non-linearity after the compressed layer.
        self.relu2 = nn.ReLU()

        # Output layer: hidden -> output.
        self.fc3 = nn.Linear(hidden, output)

    def forward(self, x):
        """Return the output-layer activations for a batch ``x`` of shape (batch, input)."""
        # Layer 1 (linear + ReLU).
        out = self.relu1(self.fc1(x))

        # Layer 2: multiply by the low-rank matrix (no bias), then ReLU.
        out = torch.matmul(out, self.fc2_compressed)
        out = self.relu2(out)

        # Output layer.
        out = self.fc3(out)
        return out

    def compute_compressed_weight(self, weight, rank):
        """Return a rank-``rank`` approximation of ``weight`` as a new ``nn.Parameter``.

        The factorization runs in NumPy via ``randsvd``; the result is placed
        on the same device and given the same dtype as ``weight``.
        """
        # NumPy needs a CPU array that is detached from the autograd graph.
        weight_np = weight.detach().cpu().numpy()

        # randsvd already truncates its factors to `rank`.
        U, S, Vt = randsvd(weight_np, rank)

        # U @ diag(S) @ Vt, with the diagonal applied as a column scaling.
        compressed_np = np.matmul(U * S, Vt)

        compressed_weight = torch.tensor(
            compressed_np, dtype=weight.dtype, device=weight.device
        )
        return nn.Parameter(compressed_weight)



**Instantiating all three models**

In [7]:
# Instantiate the three models with identical layer sizes so that their results are comparable.
# Both random number generators are seeded first so that a fresh Restart-and-Run-All rebuilds
# the same initial weights: torch's RNG feeds the layer initialisers, NumPy's RNG feeds the
# np.random.randn call inside find_range.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

model1 = MLPerceptronNNModel(input=784, hidden=1000, output=10)
model2 = CompressedMLPerceptronNNModel(input=784, hidden=1000, output=10, rank=500)
model3 = CompressedMLPerceptronNNModel2(input=784, hidden=1000, output=10, rank=500)



**Instantiating loss and optimizer classes and training the model**

In [12]:

# Default loss: cross entropy computed from the raw model outputs.
criterion = nn.CrossEntropyLoss()


def _evaluate_accuracy(model, loader):
    """Return the percentage of samples in ``loader`` that ``model`` classifies correctly."""
    right = 0
    total = 0
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for evaluation, so no autograd graph is built.
        with torch.no_grad():
            for images, labels in loader:
                outputs = model(images.view(-1, 28*28))
                # The predicted class is the index of the largest output.
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                right += torch.eq(predicted, labels).sum().item()
    finally:
        # Put the model back in whatever mode the caller had it in.
        model.train(was_training)
    return 100 * (right / total) if total else float('nan')


def train_model(model, learning_rate, n_epochs=None, train_loader=None,
                test_loader=None, loss_fn=None, log_every=500):
    """Train ``model`` with plain SGD and periodically report test accuracy.

    Args:
        model: module that maps a (batch, 28*28) tensor to class scores.
        learning_rate: SGD learning rate.
        n_epochs: number of passes over ``train_loader``; defaults to the
            notebook-level ``epochs``.
        train_loader: iterable of ``(images, labels)`` training batches;
            defaults to the notebook-level ``val_train``.
        test_loader: iterable of ``(images, labels)`` evaluation batches;
            defaults to the notebook-level ``val_test``.
        loss_fn: callable ``(outputs, labels) -> loss``; defaults to the
            notebook-level ``criterion``.
        log_every: evaluate and print once every this many iterations.

    Returns:
        None. Progress is printed; the loss shown is that of the most recent
        training batch.
    """
    # Notebook globals are looked up only when the caller did not override them.
    n_epochs = epochs if n_epochs is None else n_epochs
    train_loader = val_train if train_loader is None else train_loader
    test_loader = val_test if test_loader is None else test_loader
    loss_fn = criterion if loss_fn is None else loss_fn

    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    j = 0  # iterations completed across all epochs
    for epoch in range(n_epochs):
        for images, labels in train_loader:
            images = images.view(-1, 28*28)   # flatten each image to a vector
            optimizer.zero_grad()             # clear gradients from the previous step
            outputs = model(images)           # forward pass
            loss = loss_fn(outputs, labels)
            loss.backward()                   # gradients w.r.t. the parameters
            optimizer.step()                  # parameter update

            j += 1
            if j % log_every == 0:
                acc = _evaluate_accuracy(model, test_loader)
                print('No: of iterations: {}. Loss: {}. Accuracy: {}%'.format(j, loss.item(), acc))



**Training the first model without compression at a learning rate of 0.25**



In [13]:
# Train the uncompressed baseline with SGD at a learning rate of 0.25.
train_model(model1, learning_rate=0.25)

No: of iterations: 500. Loss: 0.0020182463340461254. Accuracy: 98.37%
No: of iterations: 1000. Loss: 0.00025715676019899547. Accuracy: 98.4%
No: of iterations: 1500. Loss: 0.0002025912399403751. Accuracy: 98.41%
No: of iterations: 2000. Loss: 0.0008019665838219225. Accuracy: 98.46000000000001%
No: of iterations: 2500. Loss: 0.0014233395922929049. Accuracy: 98.45%
No: of iterations: 3000. Loss: 0.0005694674327969551. Accuracy: 98.41%
No: of iterations: 3500. Loss: 0.00014881686365697533. Accuracy: 98.34%
No: of iterations: 4000. Loss: 0.000434714398579672. Accuracy: 98.41%
No: of iterations: 4500. Loss: 0.0006615446181967854. Accuracy: 98.44000000000001%


**Training the second model with the 1000*1000 weight matrix compressed using naive SVD, at a learning rate of 0.25**


In [14]:
# Train the truncated-SVD model with SGD at a learning rate of 0.25.
train_model(model2, learning_rate=0.25)

No: of iterations: 500. Loss: 0.17118841409683228. Accuracy: 94.77%
No: of iterations: 1000. Loss: 0.057777728885412216. Accuracy: 96.78%
No: of iterations: 1500. Loss: 0.046327266842126846. Accuracy: 97.49%
No: of iterations: 2000. Loss: 0.0688701719045639. Accuracy: 97.39%
No: of iterations: 2500. Loss: 0.04349192976951599. Accuracy: 97.82%
No: of iterations: 3000. Loss: 0.032681211829185486. Accuracy: 97.76%
No: of iterations: 3500. Loss: 0.009368539787828922. Accuracy: 97.88%
No: of iterations: 4000. Loss: 0.04232028126716614. Accuracy: 98.19%
No: of iterations: 4500. Loss: 0.010601863265037537. Accuracy: 97.99%


**Training the third model with the 1000*1000 weight matrix compressed using randomized SVD, at a learning rate of 0.25**


In [15]:
# Train the randomized-SVD model with SGD at a learning rate of 0.25.
train_model(model3, learning_rate=0.25)

No: of iterations: 500. Loss: 0.13663215935230255. Accuracy: 95.15%
No: of iterations: 1000. Loss: 0.17118017375469208. Accuracy: 96.94%
No: of iterations: 1500. Loss: 0.10826201736927032. Accuracy: 97.6%
No: of iterations: 2000. Loss: 0.1346370428800583. Accuracy: 97.69%
No: of iterations: 2500. Loss: 0.03268709406256676. Accuracy: 97.61%
No: of iterations: 3000. Loss: 0.04211420938372612. Accuracy: 97.88%
No: of iterations: 3500. Loss: 0.01729210466146469. Accuracy: 98.15%
No: of iterations: 4000. Loss: 0.03451615199446678. Accuracy: 97.98%
No: of iterations: 4500. Loss: 0.017224203795194626. Accuracy: 98.11%


**COMMENTS ON PERFORMANCE**

### Model 1 (Non-compressed):
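- **Loss**: The logged (single-batch) loss is already about 2e-3 at iteration 500 and afterwards fluctuates between roughly 1e-4 and 1.5e-3 rather than decreasing steadily, so the model has effectively converged by the first checkpoint.
- **Accuracy**: The accuracy stays between 98.3% and 98.5% throughout, indicating strong performance on the test dataset.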

### Model 2 (Compressed using Naive SVD):
- **Loss**: The loss starts higher compared to the first model but decreases over iterations, although with some fluctuations.
- **Accuracy**: Accuracy starts lower compared to Model 1, but it improves steadily over iterations.
- **Comparison**: Model 2 has slightly lower accuracy than Model 1, probably because some information is lost in the compression. However, the relatively high accuracy suggests that the compression was successful.

### Model 3 (Compressed using Randomized SVD):
- **Loss**: The loss starts relatively high and fluctuates over iterations but generally decreases over time.
- **Accuracy**: Similar to Model 2, the accuracy starts lower compared to Model 1 but improves over iterations (but with some fluctuations).
- **Comparison**: Model 3 is similar to Model 2 in terms of loss and accuracy. It achieves slightly higher final accuracy than Model 2 (98.11% vs. 97.99%), so the randomized SVD might be a slightly better compression technique for this network, although a gap this small from a single run may simply be run-to-run noise.
