# Lab 3
-------------------------------------
Author: Kevin Paganini       
Date: 3/21/2023     
Description: In this lab I write the derivations for a forward pass through a network by hand. Then I write and implement the code for it. Lastly, I create a training algorithm that uses back propagation and stochastic gradient descent to find the correct weights for a linear transformation. 

In [2]:
import numpy as np
import torch

For our example here we will have four inputs --> d   
5 nodes in the first layer --> h_1    
3 nodes in the second layer --> h_2    
This should work for abritrary batch sizes, just need to change the bias code a little then.

In [3]:
# d = 4
# h_1 = 5
# h_2 = 3

x = np.array([[1, 2, 3, 4]]) # 1, d
b = np.array([[1, 1, 1, 1, 1]]) # 1, h_1
w = np.array([[1, 1, 1, 1, 1],
              [2, 2, 2, 2, 2],
              [3, 3, 3, 3, 3], # d, h_1
              [4, 4, 4, 4, 4],
              ])
m = np.array([[1, 1, 1],
              [2, 2, 2],
              [3, 3, 3], # h_1, h_2
              [1, 1, 1],
              [2, 2, 2]])
c = np.array([[1, 1, 1]]) # 1, h_2
y = np.array([[1, 0, 1]]) # scaler (target)

def relu_np(x):
    return np.maximum(0, x)

def softmax(x):
    exp_scores = np.exp(x)
    softmax_scores = exp_scores / np.sum(exp_scores)
    return softmax_scores

def mse_np(y, y_hat):
    assert y.shape == y_hat.shape, 'shapes must be same for mse'
    return np.sum((y - y_hat)**2) / y.shape[1]

def l2_reg_np(weights, lambda_reg=1):
    return lambda_reg * np.sum(weights**2)

linear_1 = np.dot(x, w) + b
print(f'Linear 1 shape: {linear_1.shape}')
print(f'Linear 1 out: {linear_1}')
relu_out = relu_np(linear_1)
print(f'Relu 1 shape: {relu_out.shape}')
print(f'Relu output: {relu_out}')
linear_2 = np.dot(relu_out, m) + c
print(f'Linear 2 shape: {linear_2.shape}')
print(f'Linear 2 output: {linear_2}')
softmax_out = softmax(linear_2)
print(f'softmax output: {softmax_out}')
print(f'Softmax shape: {softmax_out.shape}')
loss = mse_np(y, softmax_out)
print(f'Loss: {loss}')
regularization_1 = l2_reg_np(w)
print(f'regularization_1: {regularization_1}')
regularization_2 = l2_reg_np(m)
print(f'regularization_2: {regularization_2}')
sum_1 = regularization_1 + regularization_2
print(f'sum_1: {sum_1}')
sum_2 = sum_1 + loss
print(f'sum_2: {sum_2}')





Linear 1 shape: (1, 5)
Linear 1 out: [[31 31 31 31 31]]
Relu 1 shape: (1, 5)
Relu output: [[31 31 31 31 31]]
Linear 2 shape: (1, 3)
Linear 2 output: [[280 280 280]]
softmax output: [[0.33333333 0.33333333 0.33333333]]
Softmax shape: (1, 3)
Loss: 0.3333333333333333
regularization_1: 150
regularization_2: 57
sum_1: 207
sum_2: 207.33333333333334


## Selected hyperparameters   

hidden layers = 1    
nodes in each layer = [3]    
learning_rate (no batch) = 0.1    
learning _rate (batch) = 0.1    
regularization (for all) = 0.001   
epochs (no batch) = 10    
epochs (batch) = 100

## Creating Linear Transform Data

In [4]:
from client import create_linear_training_data

x = create_linear_training_data()
input_x = x[0]
target = x[1]
print(f'First Input: {input_x[:,0]}')
print(input_x.shape)
print(f'First Target: {target[:,0]}')

First Input: tensor([-0.2250,  0.9556])
torch.Size([2, 1000])
First Target: tensor([-0.9556, -0.2250])


## Back Propagation algorithm using one sample gradient descent

In [5]:
EPOCHS = 10


# input 1x2
# weights_1 = 2x3
# weights_2 = 3x2
# bias = 1x3
# hidden = 1x3
# output = 2
# linear: input@ weights = 1x3
# relu: 1x3

w_1 = torch.rand((2, 3)) * 0.1
w_2 = torch.rand((3, 2)) * 0.1
b_1 = torch.zeros((1, 3)) 
b_2 = torch.zeros((1, 2))
w_1.requires_grad = True
b_1.requires_grad = True
w_2.requires_grad = True
b_2.requires_grad = True

alpha = 0.1


def relu(x):
    return torch.max(torch.zeros_like(x), x)

def mse(y_true, y_pred):
    mse = torch.mean((y_true - y_pred)**2)
    return mse

def l2_reg(matrix, lambda_val=0):
    l2_norm = torch.sum(torch.square(matrix))
    l2_loss = lambda_val * l2_norm
    return l2_loss

for i in range(EPOCHS):
    for sample in range(input_x.shape[1]):
        x = input_x[:,sample]
        y = target[:,sample]
        
        linear_1 = x@w_1 + b_1
        relu_out = relu(linear_1)
        linear_2 = relu_out@w_2 + b_2
        loss = mse(y, linear_2)
        if sample % 500 == 0:
            print(f'Loss after forward pass: {loss}')
        reg_1 = l2_reg(w_1, 0.001)
        reg_2 = l2_reg(w_2, 0.001)
        sum_1 = reg_1 + reg_2
        objective = loss + sum_1
        objective.backward()
        
        # update params
        w_1.data -= alpha * w_1.grad
        w_1.grad.zero_()
        w_2.data -= alpha * w_2.grad
        w_2.grad.zero_()
        b_1.data -= alpha * b_1.grad
        b_1.grad.zero_()
        b_2.data -= alpha * b_2.grad
        b_2.grad.zero_()
        
        # checking to see if it actually learned here
        linear_1 = x@w_1 + b_1
        relu_out = relu(linear_1)
        linear_2 = relu_out@w_2 + b_2
        loss = mse(y, linear_2)
        if sample % 500 == 0:
            print(f'Loss after update: {loss}')
        
        reg_1 = l2_reg(w_1, 0.001)
        reg_2 = l2_reg(w_2, 0.001)
        sum_1 = reg_1 + reg_2
        objective = loss + sum_1
      
# The linear transform matrix for me
w_1@w_2
    

Loss after forward pass: 0.4959777891635895
Loss after update: 0.39817294478416443
Loss after forward pass: 2.3894897367426893e-06
Loss after update: 1.79254627141745e-07
Loss after forward pass: 4.762955995829543e-06
Loss after update: 1.7232336801953352e-07
Loss after forward pass: 1.0901844689215068e-06
Loss after update: 1.609981268302363e-07
Loss after forward pass: 3.544995252013905e-06
Loss after update: 4.5765723655222246e-08
Loss after forward pass: 1.1767458545364207e-06
Loss after update: 1.447719455427432e-07
Loss after forward pass: 1.1640510138022364e-06
Loss after update: 8.446831145647593e-08
Loss after forward pass: 8.347192306246143e-07
Loss after update: 1.1989297377112962e-07
Loss after forward pass: 4.2602039229677757e-07
Loss after update: 9.71062377175258e-08
Loss after forward pass: 6.999627544246323e-07
Loss after update: 1.007149421639042e-07
Loss after forward pass: 1.2803931781490974e-07
Loss after update: 9.837253145406066e-08
Loss after forward pass: 5.372

tensor([[-6.0776e-04,  9.9481e-01],
        [-9.9733e-01,  2.1265e-04]], grad_fn=<MmBackward0>)

## Same Algorithm with batches

In [6]:
EPOCHS = 100
BATCH_SIZE = 100

# input 1x2
# weights_1 = 2x3
# weights_2 = 3x2
# bias = 1x3
# hidden = 1x3
# output = 2
# linear: input@ weights = 1x3
# relu: 1x3

w_1 = torch.rand((2, 3)) * 0.1
w_2 = torch.rand((3, 2)) * 0.1
b_1 = torch.zeros((1, 3)) 
b_2 = torch.zeros((1, 2))
w_1.requires_grad = True
b_1.requires_grad = True
w_2.requires_grad = True
b_2.requires_grad = True

alpha = 0.1


def relu(x):
    return torch.max(torch.zeros_like(x), x)

def mse(y_true, y_pred):
    ret = torch.mean((y_pred - y_true)**2)
    
    return ret

def l2_reg(matrix, lambda_val=0):
    l2_norm = torch.sum(torch.square(matrix))
    l2_loss = lambda_val * l2_norm
    return l2_loss
counter = 0
for i in range(EPOCHS):
    for sample in range(0, input_x.shape[1], BATCH_SIZE):
        counter += 1
        
        x = input_x[:,sample:sample+BATCH_SIZE].T
        y = target[:,sample:sample+BATCH_SIZE].T
        
        linear_1 = x@w_1 + b_1
        relu_out = relu(linear_1)
        linear_2 = relu_out@w_2 + b_2
        loss = mse(y, linear_2)
        
        if sample % 1000 == 0:
            print(f'Loss after forward pass: {loss}')
        reg_1 = l2_reg(w_1, 0.001)
        reg_2 = l2_reg(w_2, 0.001)
        sum_1 = reg_1 + reg_2
        print(type(sum_1))
        objective = loss + sum_1
        objective.backward()
        
        # update params
        w_1.data -= alpha * w_1.grad
        w_1.grad.zero_()
        w_2.data -= alpha * w_2.grad
        w_2.grad.zero_()
        b_1.data -= alpha * b_1.grad
        b_1.grad.zero_()
        b_2.data -= alpha * b_2.grad
        b_2.grad.zero_()
    
        # checking if it actually learned
        linear_1 = x@w_1 + b_1
        relu_out = relu(linear_1)
        linear_2 = relu_out@w_2 + b_2
        loss = mse(y, linear_2)
        if sample % 1000 == 0:
            print(f'Loss after update: {loss}')
        
        reg_1 = l2_reg(w_1, 0.001)
        reg_2 = l2_reg(w_2, 0.001)
        sum_1 = reg_1 + reg_2
        objective = loss + sum_1
      
print(counter)   
w_1@w_2

Loss after forward pass: 1.1226557493209839
<class 'torch.Tensor'>
Loss after update: 1.1197400093078613
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
Loss after forward pass: 1.1018232107162476
<class 'torch.Tensor'>
Loss after update: 1.0976587533950806
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
Loss after forward pass: 1.052945852279663
<class 'torch.Tensor'>
Loss after update: 1.0435094833374023
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
Loss after forward pass: 0.9383009076118469
<class 'torch.Tensor'

tensor([[ 2.0896e-04,  9.9829e-01],
        [-1.0069e+00, -2.5610e-03]], grad_fn=<MmBackward0>)

## Discussion

Both the model with batch size and no batch size were able to learn the weights quite well. The batch size one, however was able to find the correct weights a lot faster. Both got to matrices that were very close to the expected one. Both produce a relatively low loss as well.

#### Appendix (not important for grading)
an almost finished neural network class sort of

In [7]:
class MultiLayerPerceptron():
    def __init__(self, num_inputs=2, num_layers=[4], activations=[],num_outputs=2):
        """

        Args:
            num_inputs (int, optional): _description_. Defaults to 2.
            num_layers (list, optional): _description_. Defaults to [4].
                each element in array describes the number of nodes in that layer
            activations (list, optional): _description_. Defaults to ReLU
            num_outputs (int, optional): _description_. Defaults to 2.
        """
        self.num_inputs = num_inputs
        self.num_layers = num_layers
        self.num_outputs = num_outputs
        self.activations = activations
        
        all_layers = [num_inputs] + num_layers + [num_outputs]
        
        # init weights
        weights = []
        for i in range(len(all_layers) - 1):
            w = torch.rand((all_layers[i], all_layers[i+1]))
            w.requires_grad = True
            weights.append(w)
        self.weights = weights
        
        # init biases
        biases = []
        for i in range(len(num_layers)):
            b = torch.rand((1, num_layers[i]))
            b.requires_grad = True
            biases.append(b)
        self.biases = biases
        
    def forward(self, inputs):
        x = inputs
        # forward pass through hidden layers
        for i in range(len(self.num_layers)):
            print(i)
            temp = x@self.weights[i] + self.biases[i]
            func = self.activations[i]
            x = func(temp)
        
        # last weight layer plus output activation function
        temp = x@self.weights[-1] 
        func = self.activations[-1]
        output = func(temp)
        
        return output
    
    def train(self, inputs, outputs, epochs, batch_size, learning_rate, loss):
        for epoch in range(epochs):
            print(f'Epoch {epoch}')
            for sample in range(0, len(inputs), batch_size):
                x = inputs[:,sample:sample+batch_size].T
                y = outputs[:,sample:sample+batch_size].T

                
                output_forward_pass = self.forward(x)
                
                                
                reg_sum = 0
                # regularization
                for w in self.weights:
                    reg_sum += l2_reg(w)

                l = loss(output_forward_pass, y)
                    
                obj = l + reg_sum
                
                obj.backward()
                
                # back prop
                for w in self.weights:
                    w.data -= learning_rate * w.grad
                    w.grad.zero_()
                
                for b in self.biases:
                    b.data -= learning_rate * b.grad
                    b.grad.zero_()




def passthrough(x):
    return x

mlp = MultiLayerPerceptron(num_inputs=2, num_layers=[3], activations=[relu, passthrough], num_outputs=2)


mlp.train(inputs=input_x, outputs=target, epochs=200, batch_size=100, learning_rate=0.1, loss=mse)
mlp.weights[0]@mlp.weights[1]
        

Epoch 0
0
Epoch 1
0
Epoch 2
0
Epoch 3
0
Epoch 4
0
Epoch 5
0
Epoch 6
0
Epoch 7
0
Epoch 8
0
Epoch 9
0
Epoch 10
0
Epoch 11
0
Epoch 12
0
Epoch 13
0
Epoch 14
0
Epoch 15
0
Epoch 16
0
Epoch 17
0
Epoch 18
0
Epoch 19
0
Epoch 20
0
Epoch 21
0
Epoch 22
0
Epoch 23
0
Epoch 24
0
Epoch 25
0
Epoch 26
0
Epoch 27
0
Epoch 28
0
Epoch 29
0
Epoch 30
0
Epoch 31
0
Epoch 32
0
Epoch 33
0
Epoch 34
0
Epoch 35
0
Epoch 36
0
Epoch 37
0
Epoch 38
0
Epoch 39
0
Epoch 40
0
Epoch 41
0
Epoch 42
0
Epoch 43
0
Epoch 44
0
Epoch 45
0
Epoch 46
0
Epoch 47
0
Epoch 48
0
Epoch 49
0
Epoch 50
0
Epoch 51
0
Epoch 52
0
Epoch 53
0
Epoch 54
0
Epoch 55
0
Epoch 56
0
Epoch 57
0
Epoch 58
0
Epoch 59
0
Epoch 60
0
Epoch 61
0
Epoch 62
0
Epoch 63
0
Epoch 64
0
Epoch 65
0
Epoch 66
0
Epoch 67
0
Epoch 68
0
Epoch 69
0
Epoch 70
0
Epoch 71
0
Epoch 72
0
Epoch 73
0
Epoch 74
0
Epoch 75
0
Epoch 76
0
Epoch 77
0
Epoch 78
0
Epoch 79
0
Epoch 80
0
Epoch 81
0
Epoch 82
0
Epoch 83
0
Epoch 84
0
Epoch 85
0
Epoch 86
0
Epoch 87
0
Epoch 88
0
Epoch 89
0
Epoch 90
0
Epoch 91


tensor([[ 0.0033,  1.1279],
        [-1.0706,  0.0211]], grad_fn=<MmBackward0>)