# Network Compression using SVD

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import tensorflow as tf
import numpy as np
import time
import math
import matplotlib.pyplot as plt
%matplotlib notebook

In [2]:
#Loading MNIST dataset
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [3]:
train_x = mnist.train.images #Training Images
train_y = mnist.train.labels # Training Labels
test_x = mnist.test.images #Test Images
test_y = mnist.test.labels #Test Labels

## Placeholders
Creating place holders for input data and output labels. I used mini-batches for gradient descent, as the mini-batch size is a variable, number of input samples is specified as None in the place holders.

In [4]:
X = tf.placeholder(tf.float32, shape=[None, 784])
Y = tf.placeholder(tf.float32, shape=[None, 10])

## Parameters
Weights and bias parameters for 5 hidden layers with 1024 units each and one output layer which is a softmax layer with 10 units.

In [5]:
W1 = tf.get_variable("W1", shape=[784, 1024], initializer=tf.keras.initializers.he_normal(seed=1))
b1 = tf.get_variable("b1", shape=[1024], initializer=tf.initializers.zeros)
W2 = tf.get_variable("W2", shape=[1024, 1024], initializer=tf.keras.initializers.he_normal(seed=1))
b2 = tf.get_variable("b2", shape=[1024], initializer=tf.initializers.zeros)
W3 = tf.get_variable("W3", shape=[1024, 1024], initializer=tf.keras.initializers.he_normal(seed=1))
b3 = tf.get_variable("b3", shape=[1024], initializer=tf.initializers.zeros)
W4 = tf.get_variable("W4", shape=[1024, 1024], initializer=tf.keras.initializers.he_normal(seed=1))
b4 = tf.get_variable("b4", shape=[1024], initializer=tf.initializers.zeros)
W5 = tf.get_variable("W5", shape=[1024, 1024], initializer=tf.keras.initializers.he_normal(seed=1))
b5 = tf.get_variable("b5", shape=[1024], initializer=tf.initializers.zeros)
W6 = tf.get_variable("W6", shape=[1024, 10], initializer=tf.keras.initializers.he_normal(seed=1))
b6 = tf.get_variable("b6", shape=[10], initializer=tf.initializers.zeros)

## Model
- Neural network model with 5 hidden layers with ReLU activation (Z1/A1, Z2/A2, Z3/A3, Z4/A4, Z5/A5) and a softmax layer.
- Softmax crossentropy is used as cost function.
- Training Model using Adagrad with learning rate, $\alpha = 0.05$

In [6]:
Z1 = tf.add(tf.matmul(X, W1), b1) #num_samples x 1024
A1 = tf.nn.relu(Z1)
Z2 = tf.add(tf.matmul(A1, W2), b2) #num_samples x 1024
A2 = tf.nn.relu(Z2)
Z3 = tf.add(tf.matmul(A2, W3), b3) #num_samples x 1024
A3 = tf.nn.relu(Z3)
Z4 = tf.add(tf.matmul(A3, W4), b4) #num_samples x 1024
A4 = tf.nn.relu(Z4)
Z5 = tf.add(tf.matmul(A4, W5), b5) #num_samples x 1024
A5 = tf.nn.relu(Z5)
Z6 = tf.add(tf.matmul(A5, W6), b6) #num_samples x 10

In [7]:
softmax_crossent = tf.nn.softmax_cross_entropy_with_logits_v2(logits = Z6, labels = Y)

In [8]:
cost = tf.reduce_mean(softmax_crossent)

In [9]:
#train_step = tf.train.GradientDescentOptimizer(0.05).minimize(cost)
train_step = tf.train.AdagradOptimizer(0.05).minimize(cost)

In [10]:
#Setting configurtion for tensorflow memory usage
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.33

#Creating a tendorflow session
sess=tf.Session(config=config)

## Executing Model
- Executing the neural network model that we built above. Using a mini-batch size of $100$ and running it for $200$ epochs.  
- For every epoch the training dataset is reshuffled, so that the examples in each mini-batches will be random from epoch to epoch.  
- Cost for each mini-batch in an epoch is divided by the number of batches, so that the total cost at the end of the epoch will be an average of all the mini-batch costs.
- Training accuracy = $100\%$
- Test accuracy = $98.37\%$

In [11]:
sess.run(tf.global_variables_initializer())
np.random.seed(1) #setting seed to reproduce same results
batch_size = 100
num_epochs = 200
num_batches = math.ceil(train_x.shape[0]/batch_size)
for epoch in range(num_epochs):
    epoch_cost = 0
    perm = np.random.permutation(train_x.shape[0])
    train_x = train_x[perm, :]
    train_y = train_y[perm, :]
    for i in range(num_batches):
        start = i*batch_size
        if i == num_batches-1:
            batch_ip = train_x[start:train_x.shape[0], :]
            batch_op = train_y[start:train_y.shape[0], :]
        else:
            batch_ip = train_x[start:start+batch_size, :]
            batch_op = train_y[start:start+batch_size, :]
        _, batch_cost = sess.run([train_step, cost], feed_dict={X:batch_ip, Y:batch_op})
        epoch_cost += batch_cost/num_batches
    if (epoch+1)%10 == 0:
        print("Cost after epoch %i: %f" % (epoch, epoch_cost))
correct_preds = tf.equal(tf.argmax(Z6,1), tf.argmax(Y,1))
accuracy = tf.reduce_mean(tf.cast(correct_preds, tf.float32))
print("Train Accuracy:", accuracy.eval(session = sess, feed_dict={X: mnist.train.images, Y: mnist.train.labels}))
print("Test Accuracy:", accuracy.eval(session=sess, feed_dict={X: mnist.test.images, Y: mnist.test.labels}))

Cost after epoch 9: 0.000457
Cost after epoch 19: 0.000064
Cost after epoch 29: 0.000035
Cost after epoch 39: 0.000024
Cost after epoch 49: 0.000018
Cost after epoch 59: 0.000015
Cost after epoch 69: 0.000012
Cost after epoch 79: 0.000011
Cost after epoch 89: 0.000009
Cost after epoch 99: 0.000008
Cost after epoch 109: 0.000007
Cost after epoch 119: 0.000007
Cost after epoch 129: 0.000006
Cost after epoch 139: 0.000006
Cost after epoch 149: 0.000005
Cost after epoch 159: 0.000005
Cost after epoch 169: 0.000004
Cost after epoch 179: 0.000004
Cost after epoch 189: 0.000004
Cost after epoch 199: 0.000004
Train Accuracy: 1.0
Test Accuracy: 0.9837


In [12]:
#Saving weights into temporary variables
W1_temp = W1
W2_temp = W2
W3_temp = W3
W4_temp = W4
W5_temp = W5

## SVD
Applying svd on each weight matrix for first 5 hidden layers and leacing the weights for softmax layer as it is. The (s,u,v) matrices obtained from svd are converted to numpy arrays.

In [13]:
s11, u11, v11 = tf.svd(W1)
s21, u21, v21 = tf.svd(W2)
s31, u31, v31 = tf.svd(W3)
s41, u41, v41 = tf.svd(W4)
s51, u51, v51 = tf.svd(W5)

#Converting to numpy arrays
s1 = s11.eval(session=sess)
u1 = u11.eval(session=sess)
v1 = v11.eval(session=sess)
s2 = s21.eval(session=sess)
u2 = u21.eval(session=sess)
v2 = v21.eval(session=sess)
s3 = s31.eval(session=sess)
u3 = u31.eval(session=sess)
v3 = v31.eval(session=sess)
s4 = s41.eval(session=sess)
u4 = u41.eval(session=sess)
v4 = v41.eval(session=sess)
s5 = s51.eval(session=sess)
u5 = u51.eval(session=sess)
v5 = v51.eval(session=sess)

In [14]:
print(u1.shape, " -- ", s1.shape, " -- ", v1.T.shape)
print(u2.shape, " -- ", s2.shape, " -- ", v2.T.shape)
print(u3.shape, " -- ", s3.shape, " -- ", v3.T.shape)
print(u4.shape, " -- ", s4.shape, " -- ", v4.T.shape)
print(u5.shape, " -- ", s5.shape, " -- ", v5.T.shape)

(784, 784)  --  (784,)  --  (784, 1024)
(1024, 1024)  --  (1024,)  --  (1024, 1024)
(1024, 1024)  --  (1024,)  --  (1024, 1024)
(1024, 1024)  --  (1024,)  --  (1024, 1024)
(1024, 1024)  --  (1024,)  --  (1024, 1024)


In [15]:
%%html
<style>
table {float:left}
</style>

## Evaluating Compressed Weights
- Computing the low rank weight matrices for each D value.
- Replace the new weights in the graph and compute train and test accuracies

|D|Train Acc|Test Acc|
|:---:|---:|---:|
|$10$|$68.15\%$|$67.79\%$|
|$20$|$80.78\%$|$80.14\%$|
|$50$|$89.03\%$|$88.63\%$|
|$100$|$93.00\%$|$92.65\%$|
|$200$|$96.70\%$|$95.92\%$|
|$Full$|$100.00\%$|$98.37\%$|

In [16]:
D_arr = [10,20,50,100,200,'FULL']

In [17]:
for D in D_arr:
    if D == 'FULL':
        W1_hat = tf.matmul(tf.matmul(u1,tf.diag(s1)),tf.transpose(v1))
        W2_hat = tf.matmul(tf.matmul(u2,tf.diag(s2)),tf.transpose(v2))
        W3_hat = tf.matmul(tf.matmul(u3,tf.diag(s3)),tf.transpose(v3))
        W4_hat = tf.matmul(tf.matmul(u4,tf.diag(s4)),tf.transpose(v4))
        W5_hat = tf.matmul(tf.matmul(u5,tf.diag(s5)),tf.transpose(v5))
    else:
        W1_hat = tf.matmul(tf.matmul(u1[:,0:D],tf.diag(s1[0:D])),tf.transpose(v1[:,0:D]))
        W2_hat = tf.matmul(tf.matmul(u2[:,0:D],tf.diag(s2[0:D])),tf.transpose(v2[:,0:D]))
        W3_hat = tf.matmul(tf.matmul(u3[:,0:D],tf.diag(s3[0:D])),tf.transpose(v3[:,0:D]))
        W4_hat = tf.matmul(tf.matmul(u4[:,0:D],tf.diag(s4[0:D])),tf.transpose(v4[:,0:D]))
        W5_hat = tf.matmul(tf.matmul(u5[:,0:D],tf.diag(s5[0:D])),tf.transpose(v5[:,0:D]))

    sess.run(tf.assign(sess.graph.get_tensor_by_name("W1:0"), W1_hat))
    sess.run(tf.assign(sess.graph.get_tensor_by_name('W2:0'), W2_hat))
    sess.run(tf.assign(sess.graph.get_tensor_by_name('W3:0'), W3_hat))
    sess.run(tf.assign(sess.graph.get_tensor_by_name('W4:0'), W4_hat))
    sess.run(tf.assign(sess.graph.get_tensor_by_name('W5:0'), W5_hat))
    print("For D = ", D)
    print("Train Accuracy:", accuracy.eval(session = sess, feed_dict={X: mnist.train.images, Y: mnist.train.labels}))
    start = time.time()
    print("Test Accuracy:", accuracy.eval(session=sess, feed_dict={X: mnist.test.images, Y: mnist.test.labels}))
    stop = time.time()
    print("Test exec time:", round(stop-start,4))
    print("=========================================")

For D =  10
Train Accuracy: 0.6815091
Test Accuracy: 0.6779
Test exec time: 0.0218
For D =  20
Train Accuracy: 0.80785453
Test Accuracy: 0.8014
Test exec time: 0.014
For D =  50
Train Accuracy: 0.8903818
Test Accuracy: 0.8863
Test exec time: 0.0137
For D =  100
Train Accuracy: 0.9300909
Test Accuracy: 0.9265
Test exec time: 0.0137
For D =  200
Train Accuracy: 0.96703637
Test Accuracy: 0.9592
Test exec time: 0.0139
For D =  FULL
Train Accuracy: 1.0
Test Accuracy: 0.9837
Test exec time: 0.0142


In [18]:
sess.close()

## Updated Network Parameters
I choose the approach mentioned in 6.a to bilud the new network. The parameters for first 5 layers now are U, V which are determined by the earlier svd calculations and fixing D=20, and bias is initialized with the corresponding bias values in earlier network. The parameters for softmax layer (W6, b6) are initialized with corresponding values in earlier network.

In [19]:
D = 20
U1_D = tf.get_variable("U1_D", initializer=u1[:,0:D])
V1_D = tf.get_variable("V1_D", initializer=np.matmul(np.diag(s1[0:D]), np.transpose(v1[:,0:D])))
b1_D = tf.get_variable("b1_D", initializer=b1)
U2_D = tf.get_variable("U2_D", initializer=u2[:,0:D])
V2_D = tf.get_variable("V2_D", initializer=np.matmul(np.diag(s2[0:D]), np.transpose(v2[:,0:D])))
b2_D = tf.get_variable("b2_D", initializer=b2)
U3_D = tf.get_variable("U3_D", initializer=u3[:,0:D])
V3_D = tf.get_variable("V3_D", initializer=np.matmul(np.diag(s3[0:D]), np.transpose(v3[:,0:D])))
b3_D = tf.get_variable("b3_D", initializer=b3)
U4_D = tf.get_variable("U4_D", initializer=u4[:,0:D])
V4_D = tf.get_variable("V4_D", initializer=np.matmul(np.diag(s4[0:D]), np.transpose(v4[:,0:D])))
b4_D = tf.get_variable("b4_D", initializer=b4)
U5_D = tf.get_variable("U5_D", initializer=u5[:,0:D])
V5_D = tf.get_variable("V5_D", initializer=np.matmul(np.diag(s5[0:D]), np.transpose(v5[:,0:D])))
b5_D = tf.get_variable("b5_D", initializer=b5)
W6_D = tf.get_variable("W6_D", initializer=W6)
b6_D = tf.get_variable("b6_D", initializer=b6)

## Model
- Neural network model with 10 hidden layers with ReLU activation after every second later output (U1,V1/A1, U2,V2/A2, U3,V3/A3, U4,V4/A4, U5,V5/A5) and a softmax layer.
- Softmax crossentropy is used as cost function.
- Training Model using Adagrad with learning rate, $\alpha = 0.05$

In [20]:
zu1 = tf.matmul(X, U1_D)
zv1 = tf.add(tf.matmul(zu1, V1_D), b1_D)
za1 = tf.nn.relu(zv1)
zu2 = tf.matmul(za1, U2_D)
zv2 = tf.add(tf.matmul(zu2, V2_D), b2_D)
za2 = tf.nn.relu(zv2)
zu3 = tf.matmul(za2, U3_D)
zv3 = tf.add(tf.matmul(zu3, V3_D), b3_D)
za3 = tf.nn.relu(zv3)
zu4 = tf.matmul(za3, U4_D)
zv4 = tf.add(tf.matmul(zu4, V4_D), b4_D)
za4 = tf.nn.relu(zv4)
zu5 = tf.matmul(za4, U5_D)
zv5 = tf.add(tf.matmul(zu5, V5_D), b5_D)
za5 = tf.nn.relu(zv5)
zw6 = tf.add(tf.matmul(za5, W6_D), b6_D)

In [21]:
softmax_crossent1 = tf.nn.softmax_cross_entropy_with_logits_v2(logits = zw6, labels = Y)
cost1 = tf.reduce_mean(softmax_crossent1)
train_step1 = tf.train.AdagradOptimizer(0.05).minimize(cost1)

In [22]:
#Creating a tensorflow session
sess=tf.Session(config=config)

## Executing Model
- Executing the neural network model that we built above. Using a mini-batch size of $100$ and running it for $450$ epochs.  
- For every epoch the training dataset is reshuffled, so that the examples in each mini-batches will be random from epoch to epoch.  
- Cost for each mini-batch in an epoch is divided by the number of batches, so that the total cost at the end of the epoch will be an average of all the mini-batch costs. Cost is printed once every 20 epochs.
- Training accuracy = $98.4\%$
- Test accuracy = $96.1\%$

In [23]:
sess.run(tf.global_variables_initializer())
np.random.seed(1) #setting seed to reproduce same results
num_epochs = 450
correct_preds1 = tf.equal(tf.argmax(zw6,1), tf.argmax(Y,1))
accuracy1 = tf.reduce_mean(tf.cast(correct_preds1, tf.float32))
print("Initial Train Accuracy:", accuracy1.eval(session = sess, feed_dict={X: mnist.train.images, Y: mnist.train.labels}))
print("Initial Test Accuracy:", accuracy1.eval(session=sess, feed_dict={X: mnist.test.images, Y: mnist.test.labels}))
print("============================================================================================")
for epoch in range(num_epochs):
    epoch_cost = 0
    perm = np.random.permutation(train_x.shape[0])
    train_x = train_x[perm, :]
    train_y = train_y[perm, :]
    for i in range(num_batches):
        #batch = mnist.train.next_batch(100)
        start = i*batch_size
        if i == num_batches-1:
            batch_ip = train_x[start:train_x.shape[0], :]
            batch_op = train_y[start:train_y.shape[0], :]
        else:
            batch_ip = train_x[start:start+batch_size, :]
            batch_op = train_y[start:start+batch_size, :]
        _, batch_cost = sess.run([train_step1, cost1], feed_dict={X:batch_ip, Y:batch_op})
        epoch_cost += batch_cost/num_batches
    if (epoch+1)%20 == 0:
        acc_tr = accuracy1.eval(session = sess, feed_dict={X: mnist.train.images, Y: mnist.train.labels})
        acc_test = accuracy1.eval(session=sess, feed_dict={X: mnist.test.images, Y: mnist.test.labels})
        print("Cost after epoch %i: %f" % (epoch, epoch_cost), ", Train Accuracy:", round(acc_tr,3), ", Test Accuracy:", round(acc_test,3))
print("Final Train Accuracy:", accuracy1.eval(session = sess, feed_dict={X: mnist.train.images, Y: mnist.train.labels}))
print("Final Test Accuracy:", accuracy1.eval(session=sess, feed_dict={X: mnist.test.images, Y: mnist.test.labels}))

Initial Train Accuracy: 0.78752726
Initial Test Accuracy: 0.7816
Cost after epoch 19: 0.507029 , Train Accuracy: 0.858 , Test Accuracy: 0.86
Cost after epoch 39: 0.323124 , Train Accuracy: 0.908 , Test Accuracy: 0.906
Cost after epoch 59: 0.244721 , Train Accuracy: 0.931 , Test Accuracy: 0.925
Cost after epoch 79: 0.204769 , Train Accuracy: 0.942 , Test Accuracy: 0.936
Cost after epoch 99: 0.180146 , Train Accuracy: 0.947 , Test Accuracy: 0.94
Cost after epoch 119: 0.162487 , Train Accuracy: 0.955 , Test Accuracy: 0.948
Cost after epoch 139: 0.147616 , Train Accuracy: 0.955 , Test Accuracy: 0.945
Cost after epoch 159: 0.137356 , Train Accuracy: 0.96 , Test Accuracy: 0.95
Cost after epoch 179: 0.128360 , Train Accuracy: 0.962 , Test Accuracy: 0.952
Cost after epoch 199: 0.119852 , Train Accuracy: 0.967 , Test Accuracy: 0.952
Cost after epoch 219: 0.112679 , Train Accuracy: 0.963 , Test Accuracy: 0.95
Cost after epoch 239: 0.106296 , Train Accuracy: 0.968 , Test Accuracy: 0.952
Cost afte

# Conclusion
- Total number of network parameters is reduced from 5 million to 0.215 million, without significantly affecting test error.
- Though it takes more time to train, it is a good trade-off to achieve significant reduction in network storage.