### Packages

In [362]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

### Generating Data

In [None]:
# Two independent Gaussian features, 40 draws each.
x1 = np.random.normal(5, 2, 40)   # feature 1 ~ Normal(mean=5, sd=2)
x2 = np.random.normal(10, 2, 40)  # feature 2 ~ Normal(mean=10, sd=2)

# Intercept and slopes of the linear model that produces the target.
beta_0, beta_1, beta_2 = 3, 0.5, 0.75

# X is the 40 x 2 matrix with x1 and x2 as its columns;
# y is the noiseless linear combination of those columns.
X = np.concatenate((x1[:, None], x2[:, None]), axis=1)
y = beta_0 + (beta_1 * X[:, 0]) + (beta_2 * X[:, 1])

In [None]:
# Scatter the two features and colour every point by its target value.
# Per-point values go in `c=`; `cmap=` only selects the colormap, so handing
# y to `cmap=` never coloured the points by y.
points = plt.scatter(X[:, 0], X[:, 1], c=y)
plt.colorbar(points, label="y")
plt.xlabel("x1")
plt.ylabel("x2")
plt.xlim(0, 10)
plt.ylim(0, 20)

In [None]:
# The first observation, (x1, x2).
x_vec = X[0]

# Fold the bias into the input by appending a constant 1: (x1, x2, 1).
x_ready = np.concatenate((x_vec, [1]))

# The whole data set doubles as the demo batch; display its dimensions.
X_batch = X
X_batch.shape

### Functions

In [375]:
def ReLU(vec):
    """Rectified linear unit, applied element-wise.

    Entries of ``vec`` that are <= 0 are replaced by 0; positive entries are
    returned unchanged.
    """
    non_positive = vec <= 0
    return np.where(non_positive, 0, vec)


def ReLU_derivative(vec):
    """Element-wise slope of ReLU: 0 where ``vec`` <= 0, 1 where it is positive."""
    flat_region = vec <= 0
    slopes = np.where(flat_region, 0, 1)
    return slopes


def leaky_ReLU(vec, negative_slope=0.1):
    """Leaky rectified linear unit, applied element-wise.

    Args:
        vec: numpy array of pre-activation values.
        negative_slope: factor applied to entries that are <= 0. Defaults to
            0.1, the value this function used when the slope was fixed.

    Returns:
        Array shaped like ``vec`` holding ``negative_slope * v`` where
        ``v <= 0`` and ``v`` itself where ``v > 0``.
    """
    return np.where(vec <= 0, negative_slope * vec, vec)


def loss(num, y):
    """Squared-error loss, 0.5 * (num - y) ** 2, evaluated element-wise."""
    return (1/2) * ((num - y) ** 2)


def average_batch_loss(vec, y):
    """Average squared-error loss over a batch.

    Args:
        vec: predictions; the first axis indexes the rows of the batch.
        y: true values, one per prediction.

    Returns:
        The sum of the per-row losses divided by the number of rows.

    When ``y`` holds the same number of values as ``vec`` but in a different
    shape (for example a (B, 1) prediction column against a (B,) target
    vector), ``y`` is reshaped to match ``vec``. Without this, numpy broadcasts
    the subtraction to a B x B matrix and the result sums B * B mismatched
    pairs instead of B per-row losses.
    """
    vec = np.asarray(vec)
    y = np.asarray(y)
    if y.shape != vec.shape and y.size == vec.size:
        y = y.reshape(vec.shape)

    B = vec.shape[0]
    losses = loss(vec, y)
    return (1 / B) * losses.sum()


def get_batch(X, y, batch_size):
    """Draw a random mini-batch of matching rows from ``X`` and ``y``.

    Row indices come from ``np.random.randint``, i.e. they are sampled with
    replacement, so the same row can appear more than once in a batch.

    Returns:
        A tuple ``(X_rows, y_rows)`` holding the selected rows of ``X`` and the
        corresponding entries of ``y``.
    """
    n_rows = X.shape[0]
    picks = np.random.randint(n_rows, size=batch_size)
    X_rows = X[picks, :]
    y_rows = y[picks]
    return X_rows, y_rows

In [None]:
# Forward pass for a single observation through a network of depth 2 (two
# hidden layers) and width 4 (four neurons per hidden layer).
#
# Shapes, with the bias folded into the input:
#   x  has length 3: (x1, x2, 1)
#   w1 is 3 x 4      ->  h1 = w1^T x          has length 4
#   w2 is 4 x 4      ->  h2 = w2^T sigma(h1)  has length 4
#   w3 has length 4  ->  z  = w3^T sigma(h2)  is a scalar
#
# The weights start out random, so z is an arbitrary number at first. Training
# then adjusts the weights with stochastic gradient descent until z predicts
# y well.
true_y = y[0]

x = x_ready                                      # input vector with the bias term appended

w1 = np.random.normal(loc=2, size=(3, 4))        # weights into hidden layer 1
h1 = np.transpose(w1).dot(x)                     # h1(x)
sigma_h1 = ReLU(h1)                              # sigma(h1(x))

w2 = np.random.normal(loc=1.5, size=(4, 4))      # weights into hidden layer 2
# Each layer must consume the *activated* output of the layer before it;
# feeding the raw h1 here would skip the non-linearity entirely.
h2 = np.transpose(w2).dot(sigma_h1)              # h2(sigma(h1(x)))
sigma_h2 = ReLU(h2)                              # sigma(h2(sigma(h1(x))))

w3 = np.random.normal(loc=1, size=4)             # output weights: reduce 4 values to a scalar
z = np.transpose(w3).dot(sigma_h2)               # z(sigma(h2(sigma(h1(x)))))
this_loss = (1/2) * ((z - true_y) ** 2)          # L(z): squared-error loss against the true y


print(z)
print(this_loss)

**Types of Layers:**
1. Loss layer (final layer)
2. Output layer (single scalar value z)
3. Activation layer (activation function sigma applied to hidden layer)
4. Hidden layer
5. Input layer

In [None]:
class LossLayer:
    """Final layer of the network: holds a batch's average loss and its targets.

    Attributes:
        avg_loss: average loss for a batch, or None when not yet computed.
        y_vec: array of true target values for that batch, or None.
    """

    def __init__(self, avg_loss=None, y_vec=None):
        # Store the value the caller passed in; assigning a literal None here
        # would silently discard a supplied loss.
        self.avg_loss = avg_loss
        self.y_vec = y_vec
        
    def __repr__(self):
        return '{layer type: Loss, average loss: ' +  (str(self.avg_loss) if self.avg_loss is not None else "None") +  ', true y values shape: ' + (str(self.y_vec.shape) if self.y_vec is not None else "None") + '}'
    
    def __str__(self):
        return repr(self)
    
    
class OutputLayer:
    """Layer that holds the network's output values and the weights producing them.

    Attributes:
        array: values stored for this layer, or None until they are set.
        weights: weight matrix of this layer, or None.
    """

    def __init__(self, array=None, weights=None):
        self.weights = weights
        self.array = array

    def __repr__(self):
        # Report shapes only, so printing a network stays compact.
        array_shape = "None" if self.array is None else str(self.array.shape)
        weights_shape = "None" if self.weights is None else str(self.weights.shape)
        pieces = ['{layer type: Output, array shape: ', array_shape, ', weights shape: ', weights_shape, '}']
        return ''.join(pieces)

    def __str__(self):
        return self.__repr__()
    

class ActivationLayer:
    """Layer that holds the result of applying the activation function.

    Attributes:
        array: activated values stored for this layer, or None until they are set.
    """

    def __init__(self, array=None):
        self.array = array

    def __repr__(self):
        # Report the shape only, so printing a network stays compact.
        sigma_shape = "None" if self.array is None else str(self.array.shape)
        pieces = ['{layer type: Activation, sigma array shape: ', sigma_shape, '}']
        return ''.join(pieces)

    def __str__(self):
        return self.__repr__()
    
    
class HiddenLayer:
    """Layer that holds a hidden layer's values and the weights producing them.

    Attributes:
        array: values stored for this layer, or None until they are set.
        weights: weight matrix of this layer, or None.
    """

    def __init__(self, array=None, weights=None):
        self.weights = weights
        self.array = array

    def __repr__(self):
        # Report shapes only, so printing a network stays compact.
        array_shape = "None" if self.array is None else str(self.array.shape)
        weights_shape = "None" if self.weights is None else str(self.weights.shape)
        pieces = ['{layer type: Hidden, array shape: ', array_shape, ', weights shape: ', weights_shape, '}']
        return ''.join(pieces)

### Implementing for a Batch, not a vector:

In [None]:
# Batch version of the forward pass. The input is B x M, where B = 40 rows and
# M = 3 columns once the bias column is appended.
# To turn a B x M input into a B x N output the weights must be M x N, and we
# compute Xw instead of w^T x.
# ReLU is left out here: this cell only checks that the shapes line up.

# X_batch holds just the two feature columns, so append the column of 1s for
# the bias; a 40 x 2 batch cannot be multiplied by the 3 x 4 weights below.
X_batch_w_bias = np.concatenate([X_batch, np.ones(shape=(X_batch.shape[0], 1))], axis=1)

w1 = np.random.normal(loc=1, scale=2, size=(3, 4))

h1 = X_batch_w_bias.dot(w1)   # 40 x 4

w2 = np.random.normal(loc=1, scale=2, size=(4, 4))

h2 = h1.dot(w2)               # 40 x 4

w3 = np.random.normal(loc=1, scale=2, size=(4, 1))

h2.dot(w3).shape              # (40, 1): one output per row of the batch

In [393]:
class NeuralNetwork:
    """Fully connected regression network with ReLU activations and squared-error loss.

    The layer list consists of ``depth`` pairs of (HiddenLayer, ActivationLayer),
    followed by one OutputLayer that reduces to a single column and one
    LossLayer. The bias is handled by appending a column of 1s to every input
    batch, which is why the first weight matrix has ``input_shape[1] + 1`` rows.

    Attributes:
        depth: number of hidden layers.
        width: number of neurons in each hidden layer.
        B: batch size taken from ``input_shape[0]``. Kept for reference; both
            passes size themselves from the batch they are actually given.
        M: number of input columns including the bias column.
        layers: the layer objects in forward order.
        num_hyperparameters: total number of weights in the network. These are
            trainable parameters; the attribute name is kept for compatibility.
    """

    def __init__(self, depth, width, input_shape):
        """Create the layers and draw initial weights from Normal(1, 1).

        Args:
            depth: number of hidden layers.
            width: number of neurons per hidden layer.
            input_shape: (batch size, number of features) of the input,
                *without* the bias column.
        """
        self.depth = depth
        self.width = width
        self.B = input_shape[0]
        self.M = input_shape[1] + 1 # add one for the bias term so the weights are the correct dimensions
        self.layers = []
        self.num_hyperparameters = 0

        for i in range(self.depth):
            # The first hidden layer takes the M input columns; every later
            # hidden layer takes the `width` outputs of the layer before it.
            n_inputs = self.M if i == 0 else self.width
            h_layer = HiddenLayer(weights=np.random.normal(loc=1, size=(n_inputs, self.width)))
            self.num_hyperparameters += n_inputs * self.width

            # Each hidden layer is followed by its activation layer.
            self.layers.append(h_layer)
            self.layers.append(ActivationLayer())

        # Output layer: its weights reduce `width` values to a single column.
        z_layer = OutputLayer(weights=np.random.normal(loc=1, size=(self.width, 1)))
        self.num_hyperparameters += self.width * 1
        self.layers.append(z_layer)

        # The loss layer closes the network.
        self.layers.append(LossLayer())

    def __repr__(self):
        return '[' + '\n'.join([str(layer) for layer in self.layers]) + ']'

    def __str__(self):
        return repr(self)

    @staticmethod
    def _with_bias(X_batch):
        """Return X_batch with a column of 1s appended, which hides the bias term in the weights."""
        n_rows = X_batch.shape[0]
        return np.concatenate([X_batch, np.ones(shape=(n_rows, 1))], axis=1)

    def forward_pass(self, X_batch, y_vec):
        """Push a batch through every layer and record the average loss.

        Args:
            X_batch: array of shape (rows, features) without the bias column.
                The number of rows may differ from ``self.B``.
            y_vec: true target values, one per row of ``X_batch``.

        Every layer's ``array`` is overwritten with its output for this batch;
        the loss layer stores the average loss and ``y_vec``.
        """
        current = self._with_bias(X_batch)

        for layer in self.layers:
            if isinstance(layer, (HiddenLayer, OutputLayer)):
                # Weighted layer: multiply the incoming values by this layer's weights.
                layer.array = current.dot(layer.weights)
                current = layer.array
            elif isinstance(layer, ActivationLayer):
                # Apply the activation function to the previous layer's values.
                layer.array = ReLU(current)
                current = layer.array
            else: # if this is a LossLayer
                # The network output is a column (rows x 1). Shape the targets
                # the same way: subtracting a flat (rows,) vector from a column
                # would broadcast to rows x rows and corrupt the average loss.
                targets = np.asarray(y_vec).reshape(current.shape)
                layer.avg_loss = average_batch_loss(current, targets)
                layer.y_vec = y_vec

    def backward_pass(self, X_batch, learning_rate):
        """Backpropagate the loss of the last forward pass and update all weights.

        Args:
            X_batch: the same batch that was given to the preceding
                ``forward_pass`` call (without the bias column).
            learning_rate: step size multiplied into every weight update.

        Must be called after ``forward_pass``, since it reads the arrays and
        targets stored in the layers.
        """
        X_w_bias = self._with_bias(X_batch)

        J = None # gradient of the average loss w.r.t. the current layer's output
        for i in range(len(self.layers) - 1, -1, -1):
            layer = self.layers[i]
            # What was fed into this layer during the forward pass.
            layer_input = self.layers[i - 1].array if i > 0 else X_w_bias

            if isinstance(layer, LossLayer):
                targets = np.asarray(layer.y_vec).reshape(layer_input.shape)
                # The loss is the *average* of 0.5 * (x - y)^2 over the batch,
                # so its derivative w.r.t. each prediction is (x - y) / rows.
                J = (layer_input - targets) / layer_input.shape[0]
            elif isinstance(layer, ActivationLayer):
                # Chain rule through ReLU: multiply by its derivative at the input.
                J = J * ReLU_derivative(layer_input)
            else: # hidden layer or output layer (anything that has weights)
                weight_update = np.transpose(layer_input).dot(J) # xT * J
                if i > 0:
                    # Propagate with the weights used in the forward pass, i.e.
                    # before this step changes them. The first layer has
                    # nothing before it, so its gradient is not propagated.
                    J = J.dot(np.transpose(layer.weights)) # J * wT
                layer.weights = layer.weights - (learning_rate * weight_update)

    def get_avg_batch_loss(self):
        """Return the average loss stored by the most recent forward pass."""
        return self.layers[-1].avg_loss

In [None]:
# Build a network with 2 hidden layers of 4 neurons for 40-row batches with
# 2 features, run one forward pass on the demo batch, and display the layers.
demo_shape = (40, 2)
net = NeuralNetwork(depth=2, width=4, input_shape=demo_shape)
net.forward_pass(X_batch, y_vec=y)
net.layers

In [None]:
# One gradient step on the demo batch, using the values stored by the forward pass above.
net.backward_pass(X_batch, learning_rate=0.001)

### Trying the Training Process

#### Creating 2000 rows of data

In [360]:
# define x features
x1 = np.random.normal(loc=5, scale=2, size=2000) # generate 40 values from Normal(5, 2)
x2 = np.random.normal(loc=10, scale=2, size=2000) # generate 40 values from Normal(10, 2)

# define coeffients to use for linear combination
beta_0 = 3
beta_1 = 0.5
beta_2 = 0.75

# Define X and y
X = np.concatenate([x1.reshape(2000, 1), x2.reshape(2000, 1)], axis=1)
y = beta_0 + (beta_1 * X[:, 0]) + (beta_2 * X[:, 1])  # make a linear combination of x1 and x2 to be y

#### Defining Network

In [424]:
# Training configuration: rows per mini-batch and the gradient-descent step size.
batch_size, learning_rate = 128, 0.1

# Network with 2 hidden layers of 4 neurons, sized for batches of `batch_size`
# rows and as many features as X has columns.
n_features = X.shape[1]
train_net = NeuralNetwork(depth=2, width=4, input_shape=(batch_size, n_features))
train_net.layers

[{layer type: Hidden, array shape: None, weights shape: (3, 4)},
 {layer type: Activation, sigma array shape: None},
 {layer type: Hidden, array shape: None, weights shape: (4, 4)},
 {layer type: Activation, sigma array shape: None},
 {layer type: Output, array shape: None, weights shape: (4, 1)},
 {layer type: Loss, average loss: None, true y values shape: None}]