# **COMP5329 - Deep Learning Assignment 1**

**Semester 1, 2022**

**By:**

* Xu Deng
* Yingbin Mo
* Yiran Zhang


In [1]:
!cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 79
model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
stepping	: 0
microcode	: 0x1
cpu MHz		: 2199.998
cache size	: 56320 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa
bogomips	: 4399.99
clflush size	: 64
cache_alignment	: 64
address sizes	: 46 bits physical, 48 b

# **Setting the environment**

In [None]:
# load additional packages
import numpy as np
import copy
import math
from time import time
import matplotlib.pyplot as pl

In [None]:
# show running-config
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



# **Loading the dataset**

In [None]:
# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Download train and test data
id='10V_IhE8xdaHh_P1poCXYARb2b8TTBUri'
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('train_data.npy')

id='12SvR08FPBzfrs4e3bpAP-OUW2jB9XMNb'
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('train_label.npy')

id='1Umbpm9oDYYZaEye8WYlfcnKSZJEAlRxV'
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('test_data.npy')

id='154-81EnE3cSNTJsW0PO1K6C6Z33NZvN2'
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('test_label.npy')

In [None]:
# Load train and test data
train_data = np.load("/content/train_data.npy")
train_label = np.load("/content/train_label.npy")

test_data = np.load("/content/test_data.npy")
test_label = np.load("/content/test_label.npy")

In [None]:
# Print the shape of train and test data
print("The train data shape is: {}. The train label data shape is: {}".format(train_data.shape, train_label.shape))
print("The test data shape is: {}. The test label data shape is: {}".format(test_data.shape, test_label.shape))

# Print the number and the form of labels 
train_y = [x[0] for x in list(train_label)] 
print("There are {} different labels in train data: {}".format(len(set(train_y)), set(train_y)))
test_y = [x[0] for x in list(test_label)] 
print("There are {} different labels in test data: {}".format(len(set(test_y)), set(test_y)))

The train data shape is: (50000, 128). The train label data shape is: (50000, 1)
The test data shape is: (10000, 128). The test label data shape is: (10000, 1)
There are 10 different labels in train data: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
There are 10 different labels in test data: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}


# **Preprocessing data**

In [None]:
# # Normalize data to have zero mean and unit variance
# train_data = (train_data - train_data.mean(0)) / train_data.std(0)
# test_data = (test_data - test_data.mean(0)) / test_data.std(0)

# # Convert label to one-hot encoding
# train_label = np.eye(10)[train_label.reshape(-1)] #eye会累加，所以只能运行一次
# test_label = np.eye(10)[test_label.reshape(-1)]

# **Activation**

In [None]:
class Activation(object):
    #sigmoid
    def __sigmoid(self, x):
        return 1.0 / (1.0 + np.exp(-x))
    def __sigmoid_deriv(self, a):
        return  a * (1 - a )
        
    #tanh
    def __tanh(self, x):
        return np.tanh(x)
    def __tanh_deriv(self, a):
        return 1.0 - a**2    

    # relu
    def __relu(self, x):
        return np.maximum(0, x)
    def __relu_deriv(self, a):
        return np.where( a > 0, 1, 0)

    #softmax
    def __softmax(self, x):
        e_x = np.exp(x - np.max(x, axis = -1,keepdims = True))
        return e_x / np.sum(e_x, axis = -1,keepdims = True)

    def __init__(self, activation='relu'):
        if activation == 'sigmoid':
            self.f = self.__sigmoid
            self.f_deriv = self.__sigmoid_deriv
        elif activation == 'tanh':
            self.f = self.__tanh
            self.f_deriv = self.__tanh_deriv
        elif activation == 'relu':
            self.f = self.__relu
            self.f_deriv = self.__relu_deriv
        elif activation =='softmax':
            self.f = self.__softmax

# **Layer**

In [None]:
class Layer(object):
    
    def __init__(self, n_in, n_out, optimizer, activation = 'relu'):
        self.input = None
        self.linear_output = None
        self.output = None
        #定义激活函数及其导数
        self.activation = Activation(activation).f
        if activation == 'softmax': #因为结合cross entropy可以更快算出delta
            self.activation_deriv = None
        else:
            self.activation_deriv = Activation(activation).f_deriv

        # 初始化该层网络的权重和偏置       
        if activation == 'relu' or activation == 'leakyrelu': #kaiming for relu or leakyrelu
            self.W = np.random.uniform(
                low=-np.sqrt(6. / (n_in)), 
                high=np.sqrt(6. / (n_in)),
                size=(n_in, n_out))
        else: #Xavier for sigmoid or tanh
            self.W = np.random.uniform(
                low=-np.sqrt(6. / (n_in + n_out)), 
                high=np.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out))
        if activation == 'sigmoid':
            self.W *=4
        self.b = np.zeros(n_out,)

        # 初始化优化器 TODO
        self.opt_w = copy.copy(optimizer)
        self.opt_b = copy.copy(optimizer)
        
    def forward(self, input, train = True):
        self.input = input
        #计算输出
        self.linear_output = np.dot(input, self.W)+self.b
        self.output = self.activation(self.linear_output)
        return self.output
        
    def backward(self, delta):
        if not self.activation_deriv is None: #最后一层的delta无需求导即可得
            delta = delta *self.activation_deriv(self.linear_output)
        W_original = self.W
        # 计算梯度
        grad_W = np.dot(self.input.T, delta)
        grad_d = np.sum(delta, axis = 0 ,keepdims = True)
        # 更新权重和偏置#TODO
        self.W = self.opt_w.update(self.W, grad_W)
        self.b = self.opt_b.update(self.b, grad_d)
        # 更新梯度
        delta = np.dot(delta, W_original.T)    
        return delta
                

# **DropoutLayer**

In [None]:
class DropoutLayer(object):
    def __init__(self, ratio = 0.5):
        self.ratio = ratio
        self.mask = None
        
    def forward(self, X ,train = True):
        if train:
            self.mask = np.random.uniform(size = X.shape)>self.ratio
            return X * self.mask
        else:
            return X
            
    def backward(self, delta):
        return delta * self.mask

# **BNLayer**

In [None]:
class BNLayer(object):
    def __init__(self, gamma, beta, optimizer, momentum = 0.9):
        self.Xm = None
        self.Xv = None
        self.Xn =None
        self.bn_ga = gamma
        self.bn_be = beta
        
        self.opt = optimizer
        self.Mo = momentum
        self.opt_ga = copy.copy(optimizer)
        self.opt_be = copy.copy(optimizer)
        
    def forward(self, X ,train = True):
         # 初始化
        if self.Xm is None:
            self.Xm = np.mean (X, axis = 0)
            self.Xv = np.var (X, axis =0)
        if train: 
            m = np.mean (X, axis = 0)
            self.Xm = self.Mo * self.Xm + (1 - self.Mo) * m #指数移动平均
            v = np.var (X, axis = 0)
            self.Xv = self.Mo * self.Xv + (1 - self.Mo) * v
        else:
            m = self.Xm
            v = self.Xv
        #求出经过normalise后的值
        self.di = X - m
        eps_bn = np.finfo(float).eps
        self.Xs = np.sqrt(np.maximum(v, eps_bn))
        self.Xn = self.di / self.Xs
        output = self.bn_ga * self.Xn + self.bn_be
        return output
    
    def backward (self, delta):
        bn_ga_original = self.bn_ga
        bn_ga_grad = np.sum(delta * self.Xn, axis = 0)
        bn_be_grad = np.sum(delta, axis = 0)
        N, _ = delta.shape
        #更新ga和be
        self.bn_ga = self.opt_ga.update(self.bn_ga, bn_ga_grad)
        self.bn_be = self.opt_be.update(self.bn_be,bn_be_grad)
        eps_bn = np.finfo(float).eps
        dXn = delta * bn_ga_original
        dv1=pow((self.Xv + eps_bn),-3/2)
        dv2=np.sum(dXn * self.di, axis = 0) 
        dv = -(1/2)*dv1*dv2
        dm1=np.sum(dXn * (1/self.Xs), axis = 0) 
        dm2=dv * (1/N) * np.sum(-2 * self.di, axis = 0) 
        dm =  dm1+dm2
        #更新delta
        d1=dm/N
        d2=dv * 2 / N* self.di 
        d3=dXn * pow(self.Xs, -1)
        delta =d1+d2+d3
        return delta

# **Optimizer**

In [None]:
class Optimizer (object):
    def __init__(self, lr = 0.001, momentum = 0.9, rho=0.9, weight_decay = 1e-2, mode='M_S'):
        self.lr = lr
        self.Mo = momentum
        self.grad = None
        self.Eg = None # Running average of the square gradients at w
        self.rho = rho
        self.weight_decay = weight_decay
        self.mode = mode
        
    def update(self, w, delta):
        if self.mode is 'M_S':  #if momentum =0  ->SGD
            if self.grad is None:
                self.grad = np.zeros(w.shape)
            self.grad = self.Mo * self.grad + (1 - self.Mo) * delta
            w = w * (1- self.weight_decay) - self.lr * self.grad
            return w

        if self.mode is 'RMS':
            if self.Eg is None:
                self.Eg = np.zeros(w.shape)
            self.Eg = self.rho * self.Eg + (1 - self.rho) * np.power(delta, 2)
            w = w*(1- self.weight_decay) - self.lr *  delta / np.sqrt(self.Eg + np.finfo(float).eps)
            return  w

# **MLP**

In [None]:
class MLP (object) :
    def __init__(self, n_in, n_out, layer, 
                optimizer,
                activation, #['relu','relu','softmax"]
                BN = False,
                Dropout = False,
                dropout_ratio = None):
        self.layers = []
        self.activation = activation
        self.opt = optimizer
        self.lr = self.opt.lr
        self.n_out = n_out

        #添加第一层隐藏层
        self.layers.append(Layer(n_in, layer[0], optimizer, activation[0]))
        if Dropout:
            self.layers.append(DropoutLayer(dropout_ratio[0]))
        if BN:
            self.layers.append(BNLayer(np.ones((1, layer[0])), np.zeros((1,layer[0])),optimizer))
        #添加剩余的隐藏层
        for i in range(1,len(layer)):
            self.layers.append(Layer(layer[i-1], layer[i], optimizer, activation[i]))
            if Dropout:
                self.layers.append(DropoutLayer(dropout_ratio[i]))
            if BN:
                self.layers.append(BNLayer(np.ones((1, layer[i])), np.zeros((1,layer[i])),optimizer))
        #添加输出层 
        self.layers.append(Layer(layer[-1],n_out,optimizer,activation[-1]))
        
    def criterion_CEL (self, y, y_pred):
        y_pred = np.maximum(y_pred, np.finfo(float).eps)
        y_oh = np.eye(self.n_out)[y].reshape(-1,self.n_out)
        loss = -np.sum(np.multiply(y_oh, np.log(y_pred)))
        # 此处结合了softmax
        delta = y_pred - y_oh
        return loss, delta
    
    def forward(self, input, train = True):
        for layer in self.layers:
            output = layer.forward(input, train)
            input = output
        return output
    
    def backward (self, delta):
        for layer in reversed(self.layers):
            delta = layer.backward (delta)

    def val_predict (self, X, y) :
        y_pred_val = self.forward (X, train = False)
        y_pred_val=np.argmax(y_pred_val, axis = 1).reshape(-1,1) 
        loss_val, _ = self.criterion_CEL (y, y_pred_val)
        val_mean_loss = loss_val/X.shape[0]
        true_num = np.sum( y_pred_val== y, axis = 0)
        val_acc = float(true_num / X.shape[0])
        return val_mean_loss, val_acc
    
    def test_predict (self, X, y) :
        y_pred_test = self.forward (X, train = False)
        y_pred_test = np.argmax(y_pred_test, axis = 1).reshape(-1,1) 
        loss_test, _ = self.criterion_CEL (y, y_pred_test)
        test_mean_loss = loss_test/X.shape[0]
        true_num = np.sum( y_pred_test== y, axis = 0)
        test_acc = float(true_num / X.shape[0])
        print(f'Test_loss: {test_mean_loss:.4f}\tTest_acc: {(test_acc*100):.2f}%')
        

    def fit(self, X, y, epochs = 100, batch_size = 100):
        train_loss_list = []
        train_acc_list = []
        val_loss_list = []
        val_acc_list = []
        for epoch in range (epochs) :
            #改变学习率
            # if epoch == int(epochs*1/3):
            #     self.lr = self.lr/5
            #     self.opt = Optimizer(lr = self.lr)
            # elif epoch == int(epochs*2/3):
            #     self.lr = self.lr/5
            #     self.opt = Optimizer(lr = self.lr)
            shuffle = np.arange(X.shape[0])
            np.random.shuffle(shuffle)
            X_ran = X[shuffle]
            y_ran = y[shuffle]
            ins = int(0.8*len(X_ran))
            X_train = X_ran[ :ins]
            y_train  =y_ran[ :ins]
            X_val = X_ran[ins: ]
            y_val = y_ran [ins: ]
                
            train_loss_one_epoch= 0
            y_pred_one_epoch = []

            iteration = X_train.shape[0] // batch_size
            begin = time()
            for it in range(iteration):
                start = it * batch_size
                stop = min((it+1) * batch_size, len(X_train))
                X_batch = X_ran[start : stop]
                y_batch = y_ran[start : stop]

                # 向前传播
                y_pred = self.forward(X_batch)#y_pred.shape 1000x10

                # 计算损失梯度
                loss, delta = self.criterion_CEL(y_batch, y_pred)

                #反向传播
                self.backward(delta) 
                train_loss_one_epoch += loss#float 累加      
                y_pred_one_epoch.extend(y_pred) #list 堆叠

            train_mean_loss = train_loss_one_epoch / len(X_ran)
            train_loss_list.append(train_mean_loss) #含有epoch个loss的list

            y_pred_one_epoch = np.array(y_pred_one_epoch) #y_pred_one_epoch 40000x10
            y_pred = y_pred_one_epoch.argmax(1).reshape (-1, 1) #y_pred 40000x1
            z=0
            for k in range(len(y_pred)):
                if y_pred[k] == y_train[k]:
                    z +=1
            train_acc = z/ len(X_ran)
            train_acc_list.append(train_acc)

            val_mean_loss, val_acc = self.val_predict(X_val, y_val)
            val_loss_list.append(val_mean_loss)
            val_acc_list.append(val_acc)

                                                        
            print (f"Epoch: {(epoch+1):02d} ->\tTrain_loss: {train_loss_list[-1]:.4f}\tTrain_acc: {(train_acc_list[-1]*100):.2f}%\t丨\tVal_loss: {val_loss_list[-1]:.4f}\tVal_acc: {(val_acc_list[-1]*100):.2f}%" )
        
        return train_loss_list, train_acc_list, val_loss_list, val_acc_list

# **Training**

In [None]:
optimizer = Optimizer(lr = 0.001, momentum = 0.8, rho=0.9, weight_decay = 1e-3, mode='RMS')
n_in = train_data.shape[1]
n_out = len(np.unique(train_label))
layer = [256,512]
activation = ['relu','relu','softmax']
model = MLP(n_in, n_out, layer, # [128, 256, 512]
                optimizer,
                activation, #['relu','relu','softmax"]
                BN = True,
                Dropout = True,
                dropout_ratio =[0.2,0.2])

train_loss_list, train_acc_list, val_loss_list, val_acc_list = model.fit (train_data, train_label, epochs =20, batch_size = 1000)

Epoch: 01 ->	Train_loss: 1.6535	Train_acc: 24.01%	丨	Val_loss: 2.7039	Val_acc: 41.12%
Epoch: 02 ->	Train_loss: 1.4064	Train_acc: 30.57%	丨	Val_loss: 2.9759	Val_acc: 44.36%
Epoch: 03 ->	Train_loss: 1.3165	Train_acc: 33.40%	丨	Val_loss: 2.4578	Val_acc: 47.83%
Epoch: 04 ->	Train_loss: 1.2598	Train_acc: 35.60%	丨	Val_loss: 2.4899	Val_acc: 48.72%
Epoch: 05 ->	Train_loss: 1.2171	Train_acc: 36.72%	丨	Val_loss: 3.4820	Val_acc: 51.02%
Epoch: 06 ->	Train_loss: 1.1809	Train_acc: 37.96%	丨	Val_loss: 2.8715	Val_acc: 51.29%
Epoch: 07 ->	Train_loss: 1.1505	Train_acc: 39.11%	丨	Val_loss: 2.6134	Val_acc: 52.59%
Epoch: 08 ->	Train_loss: 1.1277	Train_acc: 39.95%	丨	Val_loss: 2.1749	Val_acc: 54.35%
Epoch: 09 ->	Train_loss: 1.1034	Train_acc: 40.75%	丨	Val_loss: 2.6101	Val_acc: 54.46%
Epoch: 10 ->	Train_loss: 1.0820	Train_acc: 41.38%	丨	Val_loss: 2.3592	Val_acc: 55.40%
Epoch: 11 ->	Train_loss: 1.0659	Train_acc: 42.10%	丨	Val_loss: 3.0318	Val_acc: 56.40%
Epoch: 12 ->	Train_loss: 1.0436	Train_acc: 42.86%	丨	Val_loss: 2.6

In [None]:
model.test_predict(test_data, test_label)

Test_loss: 3.1517	Test_acc: 55.21%
