# Deep Learning Framework
By Luke Doughty  
Derived from Grokking Deep Learning by Andrew W. Trask. Much of it is directly copied.  
This is meant to build intuition on the inner workings of many deep learning frameworks like Pytorch
## Why tensors?
Tensors are abstract versions of vectors and matrixes. Vector are one dimensional tensors and matrixes are two dimensional tensors.  
The inputs, outputs, and operations are all represented with tensors (vector in, matrix transformation/operation, vector out). We set up our code so we can stack tensors on top of another exactly like layers in our neural network. We have specific instructions on how to backpropagate each type of layer, so we can automatically backpropagate and focus our engineering efforts on forward propagation.
## Why don't we have to worry about dimensions?
We do. We input the dimensions when we create our weights. When we create a layer, we input the number of inputs and outputs to that layer, and the weights are generated in the constructor occordingly.


In [36]:
import numpy as np

class Tensor (object):
    def __init__(self, data, 
                autoGrad = False,  # autoGrad asks if this tensor should do gradient descent. Useful for drop out regularization.
                creators = None, 
                creation_operation = None, 
                id = None):
        self.data = np.array(data) # What is shape is data?
        self.creation_operation = creation_operation
        self.creators = creators
        self.gradient = None
        self.autoGrad = autoGrad
        self.children = {}

        if (id is None):
            id = np.random.randint(0, 100000)
        self.id = id

        if (creators is not None):
            for creator in creators: # creators is a dictionary where the child's id is the key, and the value at that key tells the number of children the creator tensor has with the current tensor's id.
                # keeps track of how many children a tensor has.
                if (self.id not in creator.children):
                    creator.children[self.id] = 1
                else: # should never go down this branch. That would mean that more than one children have been made with the same id.
                    creator.children[self.id] += 1

    # Checks whether a tensor has received the correct number of gradients from each child.
    def all_children_gradients_accounted_for(self):
        for id, count in self.children.items():
            if(count != 0):
                return False
        return True

    def backpropagate(self, gradient, gradient_origin = None):
        if(self.autoGrad):
            if (gradient is None):
                gradient = Tensor(np.ones_like(self.data)) # so that we don't have to pass a gradient of 1 the first time we call .backpropagate()
            if(gradient_origin is not None):
                # checks to make sure you can backpropagate or whether youre waiting for a gradient, in which case decrement the counter.
                if(self.children[gradient_origin.id] == 0):
                    raise Exception("cannot backprop more than once")
                else:
                    self.children[gradient_origin.id] -= 1

            if (self.gradient is None):
                self.gradient = gradient
            else:
                # if we're getting passed gradients from multiple children, add them.
                self.gradient += gradient

            if (self. creators is not None and (self.all_children_gradients_accounted_for() or gradient_origin is None)):
                # here is where we actually start backpropagating.
                if(self.creation_operation == "add"): # TODO: make this either elif or a switch case
                    # backpropagate to both parents/creators. 
                    self.creators[0].backpropagate(gradient, self)
                    self.creators[1].backpropagate(gradient, self)
                elif(self.creation_operation == "neg"):
                    self.creators[0].backpropagate(self.gradient.__neg__())
                elif(self.creation_operation == "sub"):
                    gradient_0 = Tensor(self.gradient.data)
                    self.creators[0].backpropagate(gradient_0, self)
                    gradient_1 = Tensor(self.gradient.__neg__().data)
                    self.creators[1].backpropagate(gradient_1, self)
                elif(self.creation_operation == "mul"):
                    gradient_0 = self.gradient * self.creators[1]
                    self.creators[0].backpropagate(gradient_0, self)
                    gradient_1 = self.gradient * self.creators[0]
                    self.creators[1].backpropagate(gradient_1, self)
                elif(self.creation_operation == "dot"):
                    activation = self.creators[0]
                    weights = self.creators[1]
                    activation_gradient = self.gradient.dot(weights.transpose())
                    activation.backpropagate(activation_gradient)
                    weights_gradient = self.gradient.transpose().dot(activation).transpose()
                    weights.backpropagate(weights_gradient)
                elif(self.creation_operation == "transpose"):
                    self.creators[0].backpropagate(self.gradient.transpose())
                elif("sum" in self.creation_operation):
                    # since the operation is "sum" + str(dimension)
                    dimension = int(self.creation_operation.split("_")[1])
                    data_shape = self.creators[0].data.shape[dimension]
                    self.creators[0].backpropagate(self.gradient.expand(dimension, data_shape))
                elif("expand" in self.creation_operation):
                    dimension = int(self.creation_operation.split("_")[1])
                    self.creators[0].backpropagate(self.gradient.sum(dimension))
                elif(self.creation_operation == "sigmoid"):
                    ones = Tensor(np.ones_like(self.gradient.data))
                    # passes the child's gradient multiplied by the derivative of sigmoid at the sigmoid activation output (ie 'self') to calculate the parent's gradient.
                    self.creators[0].backpropagate(self.gradient * (self * (ones - self)))
                elif(self.creation_operation == "tanh"):
                    ones = Tensor(np.ones_like(self.gradient.data))
                    # passes the child's gradient multiplied by the derivative of tanh at the tanh activation output (ie 'self') to calculate the parent's gradient.
                    self.creators[0].backpropagate(self.gradient * (ones - (self * self)))
                elif(self.creation_operation == "hardTanh"):
                    max_value, min_value = (1, -1)
                    self.creators[0].backpropagate(self.gradient * (self > min_value and self < max_value))
                elif(self.creation_operation == "relu"):
                    self.creators[0].backpropagate(self.gradient * (self > 0))


    # add together two tensors
    def __add__(self, other):
        if (self.autoGrad and other.autoGrad):
            return Tensor(self.data + other.data, autoGrad = True, creators = [self, other], creation_operation = "add")
        return Tensor(self.data + other.data)
    
    # negates the given tensor. Flips the signs.
    def __neg__(self):
        if (self.autoGrad):
            return Tensor(self.data * -1, autoGrad = True, creators = [self], creation_operation = "neg")
        return Tensor(self.data * -1)

    # subtract one tensor from another
    def __sub__(self, other):
        if (self.autoGrad and other.autoGrad):
            return Tensor(self.data - other.data, autoGrad = True, creators = [self, other], creation_operation = "sub")
        return Tensor(self.data - other.data)

    # multiply two tensors
    def __mul__(self, other):
        if (self.autoGrad and other.autoGrad):
            return Tensor(self.data * other.data, autoGrad = True, creators = [self, other], creation_operation = "mul")
        return Tensor(self.data * other.data)

    # collapses a tensor along a given dimension, adding all numbers along that dimension.
    def sum(self, dimension):
        if (self.autoGrad):
            return Tensor(self.data.sum(dimension), autoGrad = True, creators = [self], creation_operation = "sum_" + str(dimension))
        return Tensor(self.data.sum(dimension))

    # expand a tensor along a given dimension, creating copies of the tensor stacked along the given dimension.
    def expand(self, dimension, copies):
        # transposition_command tells the order of dimensions for the expanded tensor.
        transposition_command = list(range(0, len(self.data.shape)))
        transposition_command.insert(dimension, len(self.data.shape))
        new_shape = list(self.data.shape) + [copies]
        new_data = self.data.repeat(copies).reshape(new_shape)
        new_data = new_data.transpose(transposition_command)

        if (self.autoGrad):
            return Tensor(new_data, autoGrad = True, creators = [self], creation_operation = "sum_" + str(dimension))
        return Tensor(self.data.sum(dimension))

    # transpose the tensor. In a 1d and 2d tensors, this means swapping the rows and columns.
    def transpose(self):
        if (self.autoGrad):
            return Tensor(self.data.transpose(), autoGrad = True, creators= [self], creation_operation= "transpose")
        return Tensor(self.data.transpose())

    # dot product of two tensors. Returns a scalar.
    def dot(self, other): # book calls this matrix_multiplication
        if(self.autoGrad):
            return Tensor(self.data.dot(other.data), autoGrad = True, creators = [self, other], creation_operation = "dot")
        return Tensor(self.data.dot(other.data))
    
    # Nonlinearity. Squeezes the values to something between 0 and 1.
    def sigmoid(self):
        if(self.autoGrad):
            return Tensor(1 / (1 + np.exp(-self.data)), autoGrad=True, creators=[self], creation_operation="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))
    
    # Nonlinearity. Squeezes the values to something between -1 and 1
    def tanh(self):
        if(self.autoGrad):
            return Tensor(np.tanh(self.data), autoGrad=True, creators=[self], creation_operation="tanh")
        return Tensor(np.tanh(self.data))
    
    # Nonlinearity. Limits the values to be between a given range. All values greater than the max are set to the max, and all values less than the min are set to the min.
    def hardTanh(self):
        max_value, min_value = (1, -1) # for the sake of making the backpropagation code easier. I could use a similar approach to sum().
        if(self.autoGrad):
            return Tensor(max_value if self > max_value else min_value if self < min_value else self, autoGrad=True, creators=[self], creation_operation="hardTanh")
        return Tensor(max_value if self > max_value else min_value if self < min_value else self)
    
    # Nonlinearity. Sets all negative values to zero
    def relu(self):
        if(self.autoGrad):
            return Tensor(self * self > 0, autoGrad=True, creators=[self], creation_operation="relu")
        return Tensor(self * self > 0)

    # produces the vector as a string, but is supposed to not get rid of any information so an object can be recreated from it.
    # similar to __str__, but __str__ is meant to be human-friendly.
    def __repr__(self):
        return str(self.data.__repr__())

    # prints the tensor as a string
    def __str__(self):
        return str(self.data.__str__())


# Stochastic Gradient Descent Optimizer

In [37]:
class SGD(object):
    def __init__(self, parameters, alpha=0.1):
        self.parameters = parameters
        self.alpha = alpha

    def zero(self):
        for parameter in self.parameters:
            parameter.gradient.data *= 0
    
    def step(self, zero=True):
        for parameter in self.parameters:
            parameter.data -= parameter.gradient.data * self.alpha
            if (zero):
                parameter.gradient.data *= 0

In [38]:
import numpy as np
np.random.seed(0)
 
data = Tensor(np.array([[0,0], [0,1], [1,0], [1,1]]), autoGrad=True)
target = Tensor(np.array([[0], [1], [0], [1]]), autoGrad=True)

weights = list()
weights.append(Tensor(np.random.rand(2,3), autoGrad=True))
weights.append(Tensor(np.random.rand(3,1), autoGrad=True))

optimization = SGD(parameters=weights, alpha=0.1)
for iteration in range(10):
    prediction = data.dot(weights[0]).dot(weights[1]) # I dont think the first dot outputs a scalar. Is matrix multiplication not the same as dot product?
    loss = ((prediction - target) * (prediction - target)).sum(0)
    loss.backpropagate(Tensor(np.ones_like(loss.data)))
    optimization.step()
    print(loss)

[0.58128304]
[0.48988149]
[0.41375111]
[0.34489412]
[0.28210124]
[0.2254484]
[0.17538853]
[0.1324231]
[0.09682769]
[0.06849361]


# Layer Types

In [39]:
class Layer(object):
    def __init__(self):
        self.parameters = list()
    
    def get_parameters(self):
        return self.parameters
    
class Linear(Layer):
    def __init__(self, num_inputs, num_outputs):
        super().__init__()
        weights = np.random.randn(num_inputs, num_outputs) * np.sqrt(2.0 / num_inputs)
        self.weights = Tensor(weights, autoGrad=True)
        self.bias = Tensor(np.zeros(num_outputs), autoGrad=True)

        self.parameters.append(self.weights)
        self.parameters.append(self.bias)
    
    def forward(self, input):
        return input.dot(self.weights) + self.bias.expand(0, len(input.data))
    
class Sequential(Layer):
    def __init__(self, layers=list()):
        super().__init__()
        self.layers = layers

    def add(self, layer):
        self.layers.append(layer)

    def forward(self, input):
        for layer in self.layers:
            input = layer.forward(input)
        return input
    
    def get_parameters(self):
        parameters = list()
        for layer in self.layers:
            parameters += layer.get_parameters()
        return parameters


In [40]:
import numpy as np
np.random.seed(0)
 
data = Tensor(np.array([[0,0], [0,1], [1,0], [1,1]]), autoGrad=True)
target = Tensor(np.array([[0], [1], [0], [1]]), autoGrad=True)

model = Sequential([Linear(2, 3), Linear(3, 1)]) # weights are automatically generated in the constructor for these Linear layers. We input the shape of each layer.

optimization = SGD(parameters=model.get_parameters(), alpha=0.05)

for iteration in range(10):
    prediction = model.forward(data)
    loss = ((prediction - target) * (prediction - target)).sum(0)
    loss.backpropagate(Tensor(np.ones_like(loss.data)))
    optimization.step()
    print(loss)

[2.33428272]
[0.62282083]
[0.19680451]
[0.08915535]
[0.06028456]
[0.049625]
[0.04329267]
[0.03828787]
[0.0339512]
[0.03010911]


# Loss Function Layers

In [41]:
class MeanSquaredErrorLoss(Layer):
    def __init__(self):
        super().__init__()
    
    def forward(self, prediction, target):
        return ((prediction - target) * (prediction - target)).sum(0)

In [42]:
import numpy as np
np.random.seed(0)
 
data = Tensor(np.array([[0,0], [0,1], [1,0], [1,1]]), autoGrad=True)
target = Tensor(np.array([[0], [1], [0], [1]]), autoGrad=True)

model = Sequential([Linear(2, 3), Linear(3, 1)]) # weights are automatically generated in the constructor for these Linear layers. We input the shape of each layer.
criterion = MeanSquaredErrorLoss()

optimization = SGD(parameters=model.get_parameters(), alpha=0.05)

for iteration in range(10):
    prediction = model.forward(data)
    loss = criterion.forward(prediction, target) # exactly the same. we just use our MSE loss function object.
    loss.backpropagate(Tensor(np.ones_like(loss.data))) 
    optimization.step()
    print(loss)

[2.33428272]
[0.62282083]
[0.19680451]
[0.08915535]
[0.06028456]
[0.049625]
[0.04329267]
[0.03828787]
[0.0339512]
[0.03010911]


# Nonlinearity Layers

In [43]:
class Sigmoid(Layer):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        return input.sigmoid()

class Tanh(Layer):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        return input.tanh()
    
class HardTanh(Layer):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        return input.hardTanh()
    
class Relu(Layer):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        return input.relu()

In [44]:
import numpy as np
np.random.seed(0)
 
data = Tensor(np.array([[0,0], [0,1], [1,0], [1,1]]), autoGrad=True)
target = Tensor(np.array([[0], [1], [0], [1]]), autoGrad=True)

model = Sequential([Linear(2, 3), Tanh(), Linear(3, 1), Sigmoid()]) # weights are automatically generated in the constructor for these Linear layers. We input the shape of each layer.
criterion = MeanSquaredErrorLoss()

optimization = SGD(parameters=model.get_parameters(), alpha=1)

for iteration in range(10):
    prediction = model.forward(data)
    loss = criterion.forward(prediction, target) # exactly the same. we just use our MSE loss function object.
    loss.backpropagate(Tensor(np.ones_like(loss.data))) 
    optimization.step()
    print(loss)

[1.06372865]
[0.75148144]
[0.57384259]
[0.39574294]
[0.2482279]
[0.15515294]
[0.10423398]
[0.07571169]
[0.05837623]
[0.04700013]
