In [None]:
# This was my first implementation of a neural network. Some important things to note:
    # HORRIBLY INEFFICIENT. I wrote this in my early-to-mid years of high school, when
    # I didn't know about matrices, and I actually used forward and not backward propagation.

    # By forward propagation, I mean that, to perform gradient descent, I kept track of each
    # partial derivative separately and went forward through the network, summing 
    # the partial derivatives at the end.

    # I did this because I didn't know that you were meant to use backward propagation.
    # I only knew basic theory about derivatives, and tried to figure the rest out myself.

    # That led to this monstrosity. 
    # Check the other file for a 10000x better numpy implementation.

    # As a note, I'm actually glad I did my neural network this way at first.
    # It taught me a lot about neural networks and introduced me to matrices.

In [None]:
import math
import random
import copy
import numpy as np
import pandas as pd

In [2]:
class Perceptron:
    """Fully connected feed-forward network stored as nested Python lists.

    Layout of the parameters:
        weights[li][ni][wi] -- weight from neuron ``wi`` of layer ``li`` into
                               neuron ``ni`` of layer ``li + 1``
        biases[li][ni]      -- bias of neuron ``ni`` of layer ``li + 1``

    Gradients are obtained by walking *forward* from each neuron to the
    outputs and summing the chain-rule product of every path (see
    ``backprop``), not by the usual backward recursion.
    """

    def __init__(self, sizes, activation_functions, leak=0.1):
        """Create a network with randomly initialised weights.

        Args:
            sizes: Layer widths; ``sizes[0]`` is the input width and every
                following entry adds one layer of that many neurons.
            activation_functions: One activation name (a key of
                ``func_dict``) per entry of ``sizes[1:]``.
            leak: Slope used by ``"leaky_relu"`` for negative inputs.
        """
        # Weights are drawn uniformly from [0, 1); biases all start at 1.
        self.weights = [
            [[random.random() for _ in range(fan_in)] for _ in range(width)]
            for fan_in, width in zip(sizes, sizes[1:])
        ]
        self.biases = [[1] * width for width in sizes[1:]]
        self.activation_functions = activation_functions
        self.leak = leak

        # Activations take the pre-activation z; the loss takes (prediction, target).
        self.func_dict = {
            "sigmoid": self._sigmoid,
            "relu": lambda x: max(0, x),
            "leaky_relu": lambda x: max(self.leak*x, x),
            "tanh": math.tanh,
            "mean_squared_error": lambda a, y: (a-y)**2
        }

        # Derivatives of the entries above with respect to their first argument.
        self.deriv_dict = {
            "sigmoid": lambda x: self._sigmoid(x) * (1 - self._sigmoid(x)),
            "relu": lambda x: 1*(x>0),
            "leaky_relu": lambda x: 1*(x>0)+self.leak*(x<0),
            # d/dx tanh(x) = 1 - tanh(x)^2
            "tanh": lambda x: 1 - math.tanh(x)**2,
            "mean_squared_error": lambda a, y: 2*(a-y)
        }

    @staticmethod
    def _sigmoid(x):
        """Logistic function, arranged so the exponential never overflows."""
        if x >= 0:
            return 1 / (1 + math.exp(-x))
        # For negative x use the equivalent e^x / (1 + e^x): e^x stays in (0, 1).
        exp_x = math.exp(x)
        return exp_x / (1 + exp_x)

    def forward_pass(self, input_activations):
        """Run the network and keep every intermediate value.

        Args:
            input_activations: One value per input neuron.

        Returns:
            A list with one entry per layer (input layer first). Each entry
            is a list of ``[z, a]`` pairs, one per neuron, where ``z`` is the
            weighted input plus bias and ``a`` the activation. Input neurons
            have ``z = None``.
        """
        output = [[[None, a] for a in input_activations]]

        for li, layer in enumerate(self.weights):
            activation_function = self.func_dict[self.activation_functions[li]]
            current = []
            for ni, neuron_weights in enumerate(layer):
                z = sum(w * input_activations[wi] for wi, w in enumerate(neuron_weights)) + self.biases[li][ni]
                current.append([z, activation_function(z)])
            output.append(current)
            # The activations just computed feed the next layer.
            input_activations = [a for _, a in current]

        return output

    def output(self, input_activations):
        """Return only the activations of the final layer for the given input."""
        return [a for _, a in self.forward_pass(input_activations)[-1]]

    def _sum_forward_paths(self, li, ni, local_derivs, output_loss_derivs):
        """Sum the chain-rule products of all paths from one neuron to the loss.

        Starting at neuron ``ni`` of weight layer ``li``, every path through
        the later layers is enumerated. Each step multiplies the running
        product by ``weight * f'(z)`` of the neuron it enters; a finished path
        is multiplied by the loss derivative of the output neuron it ends at.

        Args:
            li: Index (into ``self.weights``) of the layer holding the neuron.
            ni: Index of the neuron within that layer.
            local_derivs: ``local_derivs[l][n]`` is ``f'(z)`` of neuron ``n``
                in weight layer ``l``.
            output_loss_derivs: Loss derivative for each output neuron.

        Returns:
            The derivative of the loss with respect to the neuron's activation.
        """
        # Each path is (index of the neuron it currently ends at, product so far).
        # Keeping the end neuron explicit avoids recovering it from list positions.
        paths = [(ni, 1)]
        for next_li in range(li + 1, len(self.weights)):
            paths = [
                (next_ni, product * next_weights[end] * local_derivs[next_li][next_ni])
                for end, product in paths
                for next_ni, next_weights in enumerate(self.weights[next_li])
            ]
        return sum(product * output_loss_derivs[end] for end, product in paths)

    def backprop(self, x, y, loss="mean_squared_error", learning_rate=1):
        """Perform one gradient-descent step on a single training example.

        Despite the name, derivatives are accumulated by going forward through
        the network: for every neuron all paths to the outputs are enumerated
        and their chain-rule products summed. All gradients are computed from
        the current parameters before any parameter is changed.

        Args:
            x: Input activations, one per input neuron.
            y: Target values, indexed like the output neurons.
            loss: Key of ``deriv_dict`` naming the loss derivative to use.
            learning_rate: Factor applied to each gradient before subtracting.
        """
        if not self.weights:
            # No trainable layers, nothing to update.
            return

        output = self.forward_pass(x)
        loss_deriv = self.deriv_dict[loss]

        # f'(z) of every neuron, indexed [weight layer][neuron].
        local_derivs = [
            [self.deriv_dict[self.activation_functions[li]](z) for z, _ in output[li + 1]]
            for li in range(len(self.weights))
        ]
        # Loss derivative with respect to each output activation.
        output_loss_derivs = [loss_deriv(a, y[ni]) for ni, (_, a) in enumerate(output[-1])]

        weight_grads = []
        bias_grads = []
        for li, layer in enumerate(self.weights):
            layer_weight_grads = []
            layer_bias_grads = []
            for ni, neuron_weights in enumerate(layer):
                # dLoss/dz for this neuron; shared by its bias and all its weights.
                neuron_grad = local_derivs[li][ni] * self._sum_forward_paths(
                    li, ni, local_derivs, output_loss_derivs
                )
                layer_bias_grads.append(neuron_grad)
                # dz/dw is the activation arriving over that weight.
                layer_weight_grads.append(
                    [output[li][wi][1] * neuron_grad for wi in range(len(neuron_weights))]
                )
            weight_grads.append(layer_weight_grads)
            bias_grads.append(layer_bias_grads)

        # Apply the update only after every gradient has been computed.
        for li, layer in enumerate(self.weights):
            for ni, neuron_weights in enumerate(layer):
                for wi in range(len(neuron_weights)):
                    neuron_weights[wi] -= weight_grads[li][ni][wi] * learning_rate
                self.biases[li][ni] -= bias_grads[li][ni] * learning_rate

In [None]:
# Train a tiny neural net to map the input [1, 3] to the target [0.1, 0.5].
# The weights are drawn with the `random` module, so seed it first: otherwise
# the value displayed below differs on every Restart & Run All.
random.seed(0)
model = Perceptron([2, 2, 2, 2], ["leaky_relu", "leaky_relu", "sigmoid"])
for epoch in range(100):
    model.backprop([1, 3], [0.1, 0.5])
# Last expression of the cell: the network's prediction after training.
model.output([1, 3])

In [None]:
# Here I attempted to run my neural net on the full MNIST dataset.
# Seeing how slowly it ran is what made me realize how inefficient this implementation is.

# Number of rows of train.csv to train on. Kept at 1 so you don't waste too
# much time; set to None to go through the whole file.
MAX_SAMPLES = 1

model = Perceptron([784, 16, 16, 10], ["leaky_relu", "leaky_relu", "sigmoid"])

train = pd.read_csv("train.csv")

for i,v in enumerate(list(train["label"])[:MAX_SAMPLES]):
    # One-hot target: a 1 at the position of the label, 0 in the other nine slots.
    y = [0]*(v)+[1]+[0]*(9-v)
    # Everything after the first column of the row is fed in as the input values.
    x = list(train.iloc[i])[1:]
    model.backprop(x, y)