In [18]:
class A:
    """
    Tiny base class used to demonstrate dynamic dispatch: the public method
    ``f`` delegates to the hook ``_f``, which subclasses may override.
    """

    def _f(self, x):
        """Print ``x`` prefixed with this class's tag."""
        print(f"A {x}")

    def f(self, x):
        """Forward ``x`` to ``_f`` as resolved on the actual instance."""
        self._f(x)

class B(A):
    """Subclass of ``A`` that replaces only the ``_f`` hook."""

    def _f(self, x):
        """Print ``x`` prefixed with this class's tag."""
        print(f"B {x}")

# Both calls print "B a": f() is inherited from A, but it calls self._f,
# which resolves to B's override on a B instance.
b = B()
b.f('a')
b._f('a')

B a
B a


In [19]:
import numpy as np
%load_ext autoreload
%autoreload 2

In [2]:
class Layer:
    """
    Abstract base class for a neural network layer.

    Concrete layers override ``build``, ``forward``, ``backward`` and, when they
    own trainable parameters, ``update_parameters``. The base implementations do
    nothing except optionally remember the forward input.
    """

    def __init__(self, input_shape=None):
        # shape of a single input sample; may be filled in later by the model
        self.input_shape = input_shape
        # values kept from the forward pass for use during backprop
        self.cache = {"X": None}

    def __call__(self, X, *args, **kwargs):
        # calling a layer is shorthand for running its forward pass
        return self.forward(X, *args, **kwargs)

    def build(self):
        """
        Hook for initializing parameters that depend on the input shape.
        The base layer has nothing to initialize.
        """
        return None

    def forward(self, X, cache=False, *args, **kwargs):
        """
        Forward pass hook.

        The base implementation produces no output; its only effect is to store
        ``X`` in ``self.cache['X']`` when ``cache`` is truthy, so that subclasses
        can reuse the input during backprop.
        """
        if not cache:
            return None
        self.cache["X"] = X
        return None

    def backward(self, dJ_dZ):
        """
        Backward pass hook.

        ``dJ_dZ`` holds the gradients of the loss with respect to this layer's
        outputs. A layer without parameters returns dJ_dX (the gradients with
        respect to its input). A layer with parameters returns a tuple whose
        first item is dJ_dX, followed by the parameter gradients in the same
        order as the arguments of its ``update_parameters``.
        """
        return None

    def update_parameters(self, learning_rate, *args, **kwargs):
        """
        Parameter update hook; a no-op for layers without parameters.

        Note that subclasses in this file define their own signatures (for
        example ``Linear.update_parameters(dJ_dW, dJ_db, learning_rate)``).
        """
        return None

In [3]:
class Linear(Layer):
    """
    Fully connected (dense) layer computing ``X @ weights + biases``.
    """

    def __init__(self, units, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # number of output features
        self.units = units
        self.output_shape = (units,)

    def build(self, weights=None, biases=None, *args, **kwargs):
        """
        Create the layer's parameters, or adopt the ones supplied by the caller.

        Missing weights are drawn from a standard normal scaled by
        sqrt(2 / fan_in); missing biases start at zero with shape (1, units).
        """
        super().build(*args, **kwargs)

        if weights is None:
            fan_in = self.input_shape[0]
            weights = np.random.randn(fan_in, self.units) * np.sqrt(2 / fan_in)
        self.weights = weights

        if biases is None:
            biases = np.zeros((1, self.units))
        self.biases = biases

    def forward(self, X, *args, **kwargs):
        """Affine transform of ``X``; the base class caches ``X`` on request."""
        super().forward(X, *args, **kwargs)
        return X @ self.weights + self.biases

    def backward(self, dJ_dZ):
        """
        Return ``(dJ_dX, dJ_dW, dJ_db)`` for the upstream gradient ``dJ_dZ``.

        Relies on the input stored by a previous ``forward(..., cache=True)``.
        """
        cached_input = self.cache['X']

        # parameter gradients
        dJ_dW = cached_input.T @ dJ_dZ
        dJ_db = np.sum(dJ_dZ, axis=0, keepdims=True)

        # gradient handed to the previous layer
        dJ_dX = dJ_dZ @ self.weights.T

        return dJ_dX, dJ_dW, dJ_db

    def update_parameters(self, dJ_dW, dJ_db, learning_rate):
        """Take one plain gradient-descent step on the weights and biases."""
        weight_step = learning_rate * dJ_dW
        bias_step = learning_rate * dJ_db
        self.weights = self.weights - weight_step
        self.biases = self.biases - bias_step

In [4]:
class ReLU(Layer):
    """Rectified linear unit activation, applied element-wise."""

    def build(self, *args, **kwargs):
        """An element-wise layer keeps the shape of its input."""
        self.output_shape = self.input_shape

    def forward(self, X, *args, **kwargs):
        """Return ``X`` with every negative entry replaced by zero."""
        # lets the base class store X when cache=True is passed through
        super().forward(X, *args, **kwargs)
        return np.maximum(0, X)

    def backward(self, dJ_dZ):
        """
        Pass the upstream gradient through wherever the cached input was
        positive; the gate is 0 for inputs that are negative or exactly zero.
        """
        gate = np.heaviside(self.cache['X'], 0)
        return dJ_dZ * gate

In [5]:
class Sigmoid(Layer):
    """Logistic sigmoid activation, applied element-wise."""

    def build(self, *args, **kwargs):
        """An element-wise layer keeps the shape of its input."""
        self.output_shape = self.input_shape

    def forward(self, X, *args, **kwargs):
        """
        Return ``1 / (1 + exp(-X))`` for every element of ``X``.

        The base class stores ``X`` when ``cache=True`` is passed through.
        """
        super().forward(X, *args, **kwargs)

        # sigmoid(x) = exp(-log(1 + exp(-x))). logaddexp evaluates the inner
        # log-sum without overflowing, so very negative inputs yield 0 cleanly
        # instead of triggering an overflow RuntimeWarning in exp().
        Z = np.exp(-np.logaddexp(0, -X))
        return Z

    def backward(self, dJ_dZ):
        """
        Return dJ_dX = dJ_dZ * Z * (1 - Z), where Z is the sigmoid of the
        cached input.

        Requires a previous ``forward(..., cache=True)`` call.
        """
        # recompute the activation from the cached input; cache defaults to
        # False here, so the stored input is left untouched
        Z = self.forward(self.cache['X'])
        dJ_dX = dJ_dZ * Z * (1 - Z)
        return dJ_dX

In [105]:
class Softmax(Layer):
    """Softmax activation over the last axis."""

    def build(self, *args, **kwargs):
        """Softmax keeps the shape of its input."""
        self.output_shape = self.input_shape

    def forward(self, X, *args, **kwargs):
        """
        Return the softmax of ``X`` along the last axis.

        The base class stores the original ``X`` when ``cache=True`` is passed
        through.
        """
        super().forward(X, *args, **kwargs)

        # subtracting the per-row maximum leaves the result unchanged but keeps
        # exp() from overflowing
        X = X - np.max(X, axis=-1, keepdims=True)
        exp_X = np.exp(X)
        Z = exp_X / exp_X.sum(axis=-1, keepdims=True)
        return Z

    def backward(self, dJ_dZ):
        """
        Return dJ_dX for the upstream gradient ``dJ_dZ``.

        For each row the softmax Jacobian is diag(z) - z z^T, so the
        vector-Jacobian product is z * (g - <g, z>). The inner product is taken
        for all rows at once, which avoids both a Python loop over the batch and
        materializing any per-row Jacobian.

        Requires a previous ``forward(..., cache=True)`` call.
        """
        Z = self.forward(self.cache['X'])

        # <dJ_dZ[i], Z[i]> for every row, kept as a column for broadcasting
        row_dot = np.sum(dJ_dZ * Z, axis=-1, keepdims=True)

        dJ_dX = Z * (dJ_dZ - row_dot)
        return dJ_dX

In [8]:
# Scratch check: np.eye(4) is the 4x4 identity matrix (see the output below).
np.eye(4)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [45]:
from layers import Softmax

# Sanity check of Softmax.backward against a closed-form expression.
# NOTE(review): Softmax here is the one imported from the `layers` module,
# presumably the same implementation as the class defined in this notebook.
sm = Softmax()

# NOTE: this rebinds the name A, which an earlier cell used for a demo class.
A = np.array([[-1., 2], [4, 5], [8, 121]])

# cache=True asks the layer to keep A so that backward() can reuse it
S = sm.forward(A, cache=True)
print(S)
# A is reused as the upstream gradient dJ_dZ
print(sm.backward(A))

I = np.eye(A.shape[-1])
# print(S * I - S.T @ S)
# print(S[0,0] * (1 - S[0,0]), -S[0,0] * S[0,1])
# print(-S[0,1] * S[0,0], S[0,1] * (1 - S[0,1]))
# print(A[0] @ (np.diag(S[0]) - S[0,None].T @ S[0,None]))
# print(A[1] @ (np.diag(S[1]) - S[1,None].T @ S[1,None]))
# print(A[0] @ S[0])
# print(A[1] @ S[1])
# fully vectorized form of the same product: (g - sum(g * S)) * S with g = A;
# the recorded output matches sm.backward(A) printed above
print((A - np.sum(A * S, axis=-1, keepdims=True)) * S)
print()
import torch

# Reference gradient computed by PyTorch autograd on the same input.
torch_A = torch.tensor(A, requires_grad=True)
torch_sm = torch.nn.Softmax(dim=-1)
y = torch_sm(torch_A)
print(y)
# y is not a scalar, so backward() is given an explicit upstream gradient;
# passing torch_A mirrors the sm.backward(A) call above
y.backward(torch_A)
print(torch_A.grad)
# element-wise difference between the two implementations; the recorded output
# is zero everywhere except one entry of about -2.8e-17
print(torch_A.grad.numpy() - sm.backward(A))
# A @ dS_dZ
# print(help(np.diag))
# print(I)
# print(S)
# print(S.T @ S)
# print(A @ (S @ I - S.T @ S))
# print(S.T @ (I - S))

[[4.74258732e-02 9.52574127e-01]
 [2.68941421e-01 7.31058579e-01]
 [8.40859712e-50 1.00000000e+00]]
[[-1.35529979e-01  1.35529979e-01]
 [-1.96611933e-01  1.96611933e-01]
 [-9.50171475e-48  0.00000000e+00]]
[[-1.35529979e-01  1.35529979e-01]
 [-1.96611933e-01  1.96611933e-01]
 [-9.50171475e-48  0.00000000e+00]]

tensor([[4.7426e-02, 9.5257e-01],
        [2.6894e-01, 7.3106e-01],
        [8.4086e-50, 1.0000e+00]], dtype=torch.float64,
       grad_fn=<SoftmaxBackward>)
tensor([[-1.3553e-01,  1.3553e-01],
        [-1.9661e-01,  1.9661e-01],
        [-9.5017e-48,  0.0000e+00]], dtype=torch.float64)
[[ 0.00000000e+00  0.00000000e+00]
 [-2.77555756e-17  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00]]


In [499]:
class Model:
    """
    Abstract base class for a machine learning model.

    Subclasses supply ``forward`` and ``train_step``; ``configure`` and
    ``train`` provide the shared training loop around them.
    """

    def __init__(self):
        pass

    def __call__(self, X, *args, **kwargs):
        # calling a model is shorthand for running its forward pass
        return self.forward(X, *args, **kwargs)

    def forward(self, X, cache=False, *args, **kwargs):
        """
        Forward pass hook: compute the model's output for ``X``.

        When ``cache`` is True, implementations are expected to store each
        layer's input so that it can be reused during backprop.
        """
        return None

    def configure(self, loss, learning_rate, metrics=None):
        """
        Attach the loss, the learning rate and the optional list of metric
        functions that ``train`` uses.
        """
        self.loss = loss
        self.learning_rate = learning_rate
        self.metrics = metrics

    def train_step(self, X, Y, learning_rate, *args, **kwargs):
        """
        Training step hook: backpropagate through the model and update its
        parameters once, using the batch (X, Y).
        """
        return None

    def train(self, X, Y, epochs=10, verbose=True, *args, **kwargs):
        """
        Run ``train_step`` on (X, Y) once per epoch with the configured
        learning rate.

        With ``verbose`` set, every epoch is followed by one printed line
        holding the loss and each configured metric, all recomputed on (X, Y)
        after the update.
        """
        for epoch in range(epochs):
            self.train_step(X, Y, self.learning_rate, *args, **kwargs)

            if not verbose:
                continue

            # evaluate the freshly updated model on the training data
            Y_pred = self.forward(X)
            J = self.loss(Y, Y_pred)
            report = f"Epoch {epoch+1:02}\tloss={J}"

            if self.metrics is not None:
                metric_values = '\t'.join(
                    f"{m.__name__}={m(Y, Y_pred)}" for m in self.metrics
                )
                report += f" \t{metric_values}"

            print(report)

In [636]:
class Sequential(Model):
    """
    A model made of layers applied one after another.
    """

    def __init__(self, layers=None):
        self.layers = []

        initial_layers = [] if layers is None else layers
        for layer in initial_layers:
            self.add(layer)

    def add(self, layer):
        """
        Append ``layer`` to the model and build it.

        The first layer must come with an explicit input shape; every later
        layer takes the output shape of the layer before it.
        """
        if self.layers:
            layer.input_shape = self.layers[-1].output_shape
        elif layer.input_shape is None:
            raise Exception("The input shape for the first layer of the model must be "
                            "specified")

        layer.build()
        self.layers.append(layer)

    def forward(self, X, cache=False):
        """Feed ``X`` through every layer in order and return the final output."""
        activations = X
        for layer in self.layers:
            activations = layer(activations, cache=cache)
        return activations

    def train_step(self, X, Y, learning_rate=0.01):
        """
        Run one forward/backward pass over (X, Y) and update every layer that
        reports parameter gradients.
        """
        # forward pass, keeping each layer's input for backprop
        Y_pred = self.forward(X, cache=True)

        # gradient of the loss with respect to the current layer's outputs,
        # starting from the gradient with respect to the predictions
        upstream = self.loss.backward(Y, Y_pred)

        # walk the layers from last to first
        for layer in reversed(self.layers):
            grads = layer.backward(upstream)

            if not isinstance(grads, tuple):
                # parameter-free layer: only the input gradient comes back
                upstream = grads
                continue

            # first item is the input gradient, the rest are parameter gradients
            upstream = grads[0]
            layer.update_parameters(*grads[1:], learning_rate)

In [410]:
class Loss:
    """
    Abstract base class for a loss function. Calling an instance runs
    ``forward``.
    """

    def __call__(self, X, *args, **kwargs):
        # the first positional argument is handed to forward() as Y_true
        return self.forward(X, *args, **kwargs)

    def forward(self, Y_true, Y_pred, *args, **kwargs):
        """Loss value hook; the base class computes nothing."""
        return None

    def backward(self, Y_true, Y_pred, *args, **kwargs):
        """
        Gradient hook: subclasses return the gradient of the loss with respect
        to Y_pred.
        """
        return None

In [637]:
class MSE(Loss):
    """
    Mean squared error, averaged over every element of the prediction.
    """

    def forward(self, Y_true, Y_pred):
        """Return the mean of the squared element-wise errors as a scalar."""
        J = np.mean(np.square(Y_pred - Y_true))
        return J

    def backward(self, Y_true, Y_pred):
        """
        Return the gradient of ``forward`` with respect to ``Y_pred``.

        ``forward`` averages over all elements, so the gradient is divided by
        the total element count. Dividing by the batch size alone would scale
        the gradient up by the number of output columns whenever there is more
        than one.
        """
        error = Y_pred - Y_true
        dJ_dY_pred = 2 * error / error.size
        return dJ_dY_pred

In [638]:
class BinaryCrossentropy(Loss):
    """
    Binary cross-entropy, averaged over every element of the prediction.

    Predictions are clipped to [epsilon, 1 - epsilon] so that the logarithms
    and divisions stay finite.
    """

    def forward(self, Y_true, Y_pred, epsilon=1e-12):
        """Return the mean binary cross-entropy as a scalar."""
        Y_pred = np.clip(Y_pred, epsilon, 1. - epsilon)
        J = -np.mean(Y_true * np.log(Y_pred) + (1 - Y_true) * np.log(1 - Y_pred))
        return J

    def backward(self, Y_true, Y_pred, epsilon=1e-12):
        """
        Return the gradient of ``forward`` with respect to ``Y_pred``.

        ``forward`` averages over all elements, so the gradient is divided by
        the total element count rather than by the batch size alone; the two
        only coincide for single-column outputs.
        """
        Y_pred = np.clip(Y_pred, epsilon, 1. - epsilon)
        per_element = (1 - Y_true) / (1 - Y_pred) - Y_true / Y_pred
        dJ_dY_pred = per_element / per_element.size
        return dJ_dY_pred

In [639]:
class CategoricalCrossentropy(Loss):
    """
    Expects Y_true to be one-hot encoded and Y_pred to be normalized probabilities (e.g.
    the output of a Softmax layer)

    Predictions are clipped from below at ``epsilon``. A probability that has
    underflowed to exactly zero would otherwise produce ``0 * log(0)`` in the
    loss or ``0 / 0`` in the gradient, both of which are NaN.
    """

    def forward(self, Y_true, Y_pred, epsilon=1e-10):
        """Return the cross-entropy summed over classes and averaged over rows."""
        Y_pred = np.clip(Y_pred, epsilon, 1.0)
        m = Y_true.shape[0]
        J = -np.sum(Y_true * np.log(Y_pred)) / m
        return J

    def backward(self, Y_true, Y_pred, epsilon=1e-10):
        """
        Return the gradient of ``forward`` with respect to ``Y_pred``, using the
        same lower clip on the predictions.
        """
        Y_pred = np.clip(Y_pred, epsilon, 1.0)
        m = Y_true.shape[0]
        dJ_dY_pred = -(Y_true / Y_pred) / m
        return dJ_dY_pred

In [640]:
def categorical_accuracy(Y_true, Y_pred):
    """
    Fraction of rows whose highest-scoring predicted class equals the true class.

    Y_true is expected to be one-hot encoded and Y_pred to hold normalized
    probabilities (e.g. the output of a Softmax layer); both are reduced with
    argmax over the last axis.
    """
    true_labels = np.argmax(Y_true, axis=-1)
    predicted_labels = np.argmax(Y_pred, axis=-1)
    hits = true_labels == predicted_labels
    return np.mean(hits.astype(np.float32))

In [641]:
def binary_accuracy(Y_true, Y_pred):
    """
    Fraction of elements whose thresholded prediction equals the true label.

    Expects Y_true to contain values of either 0 or 1 and Y_pred to be the
    probability of being 1. Probabilities of 0.5 or more count as class 1.
    """
    # the builtin int replaces the np.int alias, which NumPy 1.24 removed
    Y_pred = (Y_pred >= 0.5).astype(int)
    accuracy = np.mean((Y_true == Y_pred).astype(np.float32))
    return accuracy

In [3]:
import numpy as np
from models import Sequential
from layers import Linear, ReLU, Softmax
from losses import CategoricalCrossentropy
from metrics import binary_accuracy, categorical_accuracy

# Fix NumPy's global RNG so the run is repeatable.
# NOTE(review): presumably this is what makes the random weight initialization
# deterministic; confirm against the imported `layers` module.
np.random.seed(0)

# Toy 1-D dataset: six samples with a single feature each.
X = np.array([[-1], [1], [-4], [-6], [10], [60]])
print("X:")
print(X)
print()

# One-hot labels: the first column is 1 for the negative inputs and the second
# column is 1 for the positive ones.
Y = np.array([[1, 0], [0, 1], [1, 0], [1, 0], [0, 1], [0, 1]])
print("Y:")
print(Y)
print()

learning_rate = 0.01

# 1 -> 5 -> 2 network with a softmax output; only the first layer needs an
# explicit input shape
my_model = Sequential()
my_model.add(Linear(5, input_shape=X.shape[1:]))
my_model.add(ReLU())
my_model.add(Linear(2))
my_model.add(Softmax())

my_model.configure(loss=CategoricalCrossentropy(),
                   learning_rate=learning_rate,
                   metrics=[categorical_accuracy])

# accuracy of the untrained model (0.75 in the recorded output); note that this
# uses binary_accuracy while training reports categorical_accuracy
Y_pred = my_model(X)
print(binary_accuracy(Y, Y_pred))
my_model.train(X, Y, epochs=2000)
# for _ in range(1000):
#     print("Y_pred:")
#     print(my_model(X))
#     print()
#     print("Loss:")
#     print(my_model.loss(Y, my_model(X)))
#     my_model.train_step(X, Y, learning_rate)
#     print(my_model.layers[0].weights.sum())

X:
[[-1]
 [ 1]
 [-4]
 [-6]
 [10]
 [60]]

Y:
[[1 0]
 [0 1]
 [1 0]
 [1 0]
 [0 1]
 [0 1]]

0.75
Epoch 01	loss=0.36697983803618 	categorical_accuracy=1.0
Epoch 02	loss=0.3657081672915452 	categorical_accuracy=1.0
Epoch 03	loss=0.3644431640955684 	categorical_accuracy=1.0
Epoch 04	loss=0.3631847776055723 	categorical_accuracy=1.0
Epoch 05	loss=0.36193295777628437 	categorical_accuracy=1.0
Epoch 06	loss=0.3606876553349619 	categorical_accuracy=1.0
Epoch 07	loss=0.3594488217575316 	categorical_accuracy=1.0
Epoch 08	loss=0.3582164092456963 	categorical_accuracy=1.0
Epoch 09	loss=0.35699037070496437 	categorical_accuracy=1.0
Epoch 10	loss=0.35577065972356187 	categorical_accuracy=1.0
Epoch 11	loss=0.3545572305521871 	categorical_accuracy=1.0
Epoch 12	loss=0.3533500380845694 	categorical_accuracy=1.0
Epoch 13	loss=0.3521490378387977 	categorical_accuracy=1.0
Epoch 14	loss=0.35095418593938316 	categorical_accuracy=1.0
Epoch 15	loss=0.34976543910002506 	categorical_accuracy=1.0
Epoch 16	loss=0.348

Epoch 655	loss=0.08903279830150446 	categorical_accuracy=1.0
Epoch 656	loss=0.08891396717810575 	categorical_accuracy=1.0
Epoch 657	loss=0.0887954272681169 	categorical_accuracy=1.0
Epoch 658	loss=0.08867717756551073 	categorical_accuracy=1.0
Epoch 659	loss=0.08855921706865279 	categorical_accuracy=1.0
Epoch 660	loss=0.0884415447802786 	categorical_accuracy=1.0
Epoch 661	loss=0.08832415970747105 	categorical_accuracy=1.0
Epoch 662	loss=0.08820706086163806 	categorical_accuracy=1.0
Epoch 663	loss=0.08809024725848967 	categorical_accuracy=1.0
Epoch 664	loss=0.08797371791801654 	categorical_accuracy=1.0
Epoch 665	loss=0.08785747186446702 	categorical_accuracy=1.0
Epoch 666	loss=0.08774150812632564 	categorical_accuracy=1.0
Epoch 667	loss=0.0876258257362914 	categorical_accuracy=1.0
Epoch 668	loss=0.08751042373125524 	categorical_accuracy=1.0
Epoch 669	loss=0.08739530115227949 	categorical_accuracy=1.0
Epoch 670	loss=0.08728045704457575 	categorical_accuracy=1.0
Epoch 671	loss=0.0871658904

Epoch 1215	loss=0.049992820497427676 	categorical_accuracy=1.0
Epoch 1216	loss=0.04995252384674309 	categorical_accuracy=1.0
Epoch 1217	loss=0.04991228902835417 	categorical_accuracy=1.0
Epoch 1218	loss=0.04987211590465274 	categorical_accuracy=1.0
Epoch 1219	loss=0.04983200433842813 	categorical_accuracy=1.0
Epoch 1220	loss=0.049791954192865785 	categorical_accuracy=1.0
Epoch 1221	loss=0.049751965331545626 	categorical_accuracy=1.0
Epoch 1222	loss=0.04971203761844104 	categorical_accuracy=1.0
Epoch 1223	loss=0.04967217091791711 	categorical_accuracy=1.0
Epoch 1224	loss=0.049632365094729756 	categorical_accuracy=1.0
Epoch 1225	loss=0.04959262001402385 	categorical_accuracy=1.0
Epoch 1226	loss=0.04955293554133188 	categorical_accuracy=1.0
Epoch 1227	loss=0.049513311542572935 	categorical_accuracy=1.0
Epoch 1228	loss=0.04947374788405135 	categorical_accuracy=1.0
Epoch 1229	loss=0.04943424443245479 	categorical_accuracy=1.0
Epoch 1230	loss=0.04939480105485358 	categorical_accuracy=1.0
Epo

Epoch 1797	loss=0.03382950982226751 	categorical_accuracy=1.0
Epoch 1798	loss=0.03381046639371626 	categorical_accuracy=1.0
Epoch 1799	loss=0.03379144367474379 	categorical_accuracy=1.0
Epoch 1800	loss=0.033772441632328086 	categorical_accuracy=1.0
Epoch 1801	loss=0.033753460233515546 	categorical_accuracy=1.0
Epoch 1802	loss=0.033734499445421666 	categorical_accuracy=1.0
Epoch 1803	loss=0.0337155592352306 	categorical_accuracy=1.0
Epoch 1804	loss=0.03369663957019483 	categorical_accuracy=1.0
Epoch 1805	loss=0.03367774041763525 	categorical_accuracy=1.0
Epoch 1806	loss=0.033658861744940635 	categorical_accuracy=1.0
Epoch 1807	loss=0.03364000351956787 	categorical_accuracy=1.0
Epoch 1808	loss=0.0336211657090415 	categorical_accuracy=1.0
Epoch 1809	loss=0.03360234828095368 	categorical_accuracy=1.0
Epoch 1810	loss=0.03358355120296394 	categorical_accuracy=1.0
Epoch 1811	loss=0.03356477444279908 	categorical_accuracy=1.0
Epoch 1812	loss=0.03354601796825298 	categorical_accuracy=1.0
Epoch 

In [4]:
from sklearn.datasets import fetch_openml

# load the MNIST dataset
# NOTE(review): the container type returned for 'data' and 'target' depends on
# the installed scikit-learn version (array vs. DataFrame/Series); confirm
# before relying on array-only methods downstream.
mnist_data = fetch_openml('mnist_784')

# recorded output: (70000, 784) and (70000,)
print(mnist_data['data'].shape)
print(mnist_data['target'].shape)

# NOTE: X is rebound here; an earlier cell used the same name for the toy dataset
X = mnist_data['data']
y = mnist_data['target']

(70000, 784)
(70000,)


In [5]:
from sklearn.model_selection import train_test_split

# split the data into training and test sets
# test_size=1/7 of the 70000 samples gives the 60000/10000 split shown in the
# recorded output
# NOTE(review): no random_state is passed, so the split presumably follows the
# global NumPy RNG state at this point; confirm if a fixed split is required.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/7)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# normalize the inputs
# assumes raw pixel values lie in [0, 255] — TODO confirm
X_train_normalized = X_train / 255.0
X_test_normalized = X_test / 255.0

def one_hot_encode(x, num_classes=10):
    """
    Convert a collection of class labels into a one-hot encoded float array.

    ``x`` may be any array-like (list, NumPy array, pandas Series, ...) whose
    entries can be converted to ``int``, including numeric strings. The input
    is flattened, and the result has one row per label and ``num_classes``
    columns.
    """
    # coerce to ndarray first so the input does not need to provide
    # astype()/reshape() itself
    labels = np.asarray(x).astype(int).reshape(-1)
    # row i of the identity matrix is the one-hot vector for class i
    x_one_hot = np.eye(num_classes)[labels]
    return x_one_hot

# one-hot encode labels
# with the default num_classes=10, each result has one row per label and 10 columns
y_train_one_hot = one_hot_encode(y_train)
y_test_one_hot = one_hot_encode(y_test)

(60000, 784)
(60000,)
(10000, 784)
(10000,)


In [6]:
# re-seed NumPy's global RNG before building the model
np.random.seed(42)

# shape of a single input sample; (784,) in the recorded output
print(X_train_normalized.shape[1:])

# 784 -> 16 -> 16 -> 10 network with ReLU activations and a softmax output
my_model = Sequential()
my_model.add(Linear(16, input_shape=X_train_normalized.shape[1:]))
my_model.add(ReLU())
my_model.add(Linear(16))
my_model.add(ReLU())
my_model.add(Linear(10))
my_model.add(Softmax())

my_model.configure(loss=CategoricalCrossentropy(),
                   learning_rate=0.01,
                   metrics=[categorical_accuracy])

# NOTE(review): the whole training set is passed on every call, so each epoch is
# presumably a single full-batch update; confirm against the `models` module
my_model.train(X_train_normalized, y_train_one_hot, epochs=100)
# for _ in range(10):
# #     print("Y_pred:")
# #     print(my_model(X))
# #     print()
#     my_model.train_step(X_train_normalized, y_train_one_hot, learning_rate)
#     print(my_model.layers[0].weights.sum())

(784,)
Epoch 01	loss=2.4092875533311156 	categorical_accuracy=0.10726666450500488
Epoch 02	loss=2.390993130398077 	categorical_accuracy=0.10846666991710663
Epoch 03	loss=2.3758679205466997 	categorical_accuracy=0.10935000330209732
Epoch 04	loss=2.362877525270739 	categorical_accuracy=0.11034999787807465
Epoch 05	loss=2.3514107151709025 	categorical_accuracy=0.11186666786670685
Epoch 06	loss=2.341046531221175 	categorical_accuracy=0.11326666921377182
Epoch 07	loss=2.3314749902622114 	categorical_accuracy=0.11506666988134384
Epoch 08	loss=2.3225261507166954 	categorical_accuracy=0.11676666885614395
Epoch 09	loss=2.3140755558727513 	categorical_accuracy=0.11798333376646042
Epoch 10	loss=2.3060105707126373 	categorical_accuracy=0.1194833368062973
Epoch 11	loss=2.2982774395482406 	categorical_accuracy=0.12133333086967468
Epoch 12	loss=2.290797233811268 	categorical_accuracy=0.12326666712760925
Epoch 13	loss=2.2835222463430815 	categorical_accuracy=0.12521666288375854
Epoch 14	loss=2.2764111

In [662]:
# Continue training the existing model with a larger learning rate; the model
# is not rebuilt, so this picks up from its current weights.
# NOTE(review): the recorded output starts at loss ~0.357 and this cell's
# execution count is far ahead of the previous cell's, so the model had
# presumably been trained for longer than the single 100-epoch run above;
# confirm before relying on Restart & Run All to reproduce these numbers.
my_model.learning_rate = 0.05
my_model.train(X_train_normalized, y_train_one_hot, epochs=30)

Epoch 01	loss=0.3574528924574818 	categorical_accuracy=0.9006833434104919
Epoch 02	loss=0.3567416016156333 	categorical_accuracy=0.9009166955947876
Epoch 03	loss=0.3560742437816166 	categorical_accuracy=0.9011833071708679
Epoch 04	loss=0.35543117684375036 	categorical_accuracy=0.9013000130653381
Epoch 05	loss=0.35480363160541956 	categorical_accuracy=0.901533305644989
Epoch 06	loss=0.35418709259588027 	categorical_accuracy=0.9017833471298218
Epoch 07	loss=0.35357956373051297 	categorical_accuracy=0.9017999768257141
Epoch 08	loss=0.3529835997157553 	categorical_accuracy=0.9018166661262512
Epoch 09	loss=0.3523965667537949 	categorical_accuracy=0.9019666910171509
Epoch 10	loss=0.3518191398459576 	categorical_accuracy=0.9021166563034058
Epoch 11	loss=0.3512503755233175 	categorical_accuracy=0.902233362197876
Epoch 12	loss=0.3506890980257262 	categorical_accuracy=0.9023000001907349
Epoch 13	loss=0.350137404982088 	categorical_accuracy=0.9022833108901978
Epoch 14	loss=0.34959353947217314 	ca