# Regularization

## L1 and L2

* The first form of regularization are L1 and L2 and these are used to calculate the value `penalty` so that it can be used to calculate with the `loss` value to make the generalization error less
* L1's regularization penalty is the sum of all the absolute values of weights and bias. It is linear penalty as it is directly proportional to the weights and bias
* L2's regularization penalty is the sum of squared weights and bias. It is non linear and it is commonly used as it doesn't affect smaller valued parameters and larger valued parameters to grow even bigger
* We use $\lambda$ in the equation if it is bigger then the penalty will be bigger

$\text{Loss} = \text{data loss} + \text{L1 for weights} + \text{L1 for bias} + \text{L2 for weights} + \text{L2 for bias}$


In [6]:
import numpy as np
from tqdm import tqdm
import nnfs
from nnfs.datasets import spiral_data

nnfs.init()

In [3]:
class Layer_Dense:
    def __init__(self,n_inputs,n_neurons,
                 weight_regularizer_l1 = 0,weight_regularizer_l2=0,
                 bias_regularizer_l1 = 0, bias_regularizer_l2 = 0):
        # init of weights and bias
        self.weights = 0.01 * np.random.randn(n_inputs,n_neurons)
        self.biases = np.zeros((1,n_neurons))
        # init the regularization
        self.weight_regularizer_l1 = weight_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.bias_regularizer_l2 = bias_regularizer_l2
        
        
    def forward(self,inputs):
        self.inputs = inputs
        self.output = np.dot(inputs,self.weights) + self.biases
        
    def backward(self,dvalues):
        # calculating gradients for parameters
        self.dweights = np.dot(self.inputs.T,dvalues)
        self.dbiases = np.sum(dvalues,axis=0,keepdims=True)
        
        # calculating gradients for regularization
        if self.weight_regularizer_l1 > 0:
            dL1 = np.ones_like(self.weights)
            dL1[self.weights < 0] = -1
            self.dweights += self.weight_regularizer_l1 * dL1
        if self.weight_regularizer_l2 > 0:
            self.dweights += 2 * self.weight_regularizer_l2 * self.weights
        
        if self.bias_regularizer_l1 > 0:
            dL1 = np.ones_like(self.biases)
            dL1[self.biases < 0] = -1
            self.dbiases += self.bias_regularizer_l1 * dL1
        if self.bias_regularizer_l2 > 0:
            self.dbiases += 2 * self.bias_regularizer_l2 * self.biases 
            
        # gradients on whole values
        self.dinputs = np.dot(dvalues,self.weights.T)

In [None]:
class Loss:
    def regularization_loss(self,layer):
        regularization_loss = 0
        # Weight Loss
        if layer.weight_regularizer_l1 > 0:
            # sum of absolute values of weights
            regularization_loss += layer.weight_regularizer_l1 * np.sum(np.abs(layer.weights))
        if layer.weight_regularizer_l2 > 0:
            # sum of squared values of weights
            regularization_loss += layer.weight_regularizer_l2 * np.sum(layer.weights * layer.weights)
        if layer.bias_regularizer_l1 > 0:
            regularization_loss += layer.bias_regularizer_l1 * np.sum(np.abs(layer.biases))
        if layer.bias_regularizer_l2 > 0:
            regularization_loss += layer.bias_regularizer_l2 * np.sum(layer.biases * layer.biases)
            
        return regularization_loss
    
    def calculate(self,output,y):
        sample_losses = self.forward(output,y)
        data_loss = np.mean(sample_losses)
        return data_loss

class Loss_CategoricalCrossentropy(Loss):
    def forward(self,y_pred,y_true):
        samples = len(y_pred)
        y_pred_clipped = np.clip(y_pred,1e-7,1 - 1e-7)
        if len(y_true.shape) == 1:
            correct_confidences = y_pred_clipped[range(samples),y_true]
        elif len(y_true.shape) == 2:
            correct_confidences = np.sum(y_pred_clipped * y_true,axis=1)
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods
    
    def backward(self,dvalues,y_true):
        samples = len(dvalues)
        labels = len(dvalues[0])
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]
        self.dinputs = -y_true / dvalues
        self.dinputs = self.dinputs / samples

In [5]:
class Activation_RELU:
    def forward(self,inputs):
        self.inputs = inputs
        self.output = np.maximum(0,inputs)
        
    def backward(self,dvalues):
        self.dinputs = dvalues.copy()
        self.dinputs[self.inputs <= 0] = 0
        
class Activation_Softmax:
    def forward(self,inputs):
        self.inputs = inputs
        exp_values = np.exp(inputs - np.max(inputs,axis=1,keepdims=True))
        self.output = exp_values / np.sum(exp_values,axis=1,keepdims=True)
        
    def backward(self,dvalues):
        self.dinputs = np.empty_like(dvalues)
        for index, (single_output,single_dvalues) in enumerate(zip(self.output,dvalues)):
            single_output = single_output.reshape(-1,1)
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output,single_output.T)
            self.dinputs[index] = np.dot(jacobian_matrix,single_dvalues) 
    
class Activation_Softmax_Loss_CategoricalCrossentropy:
    def __init__(self):
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()
    
    def forward(self,inputs,y_true):
        self.activation.forward(inputs)
        self.output = self.activation.output
        return self.loss.calculate(self.output,y_true)
    
    def backward(self,dvalues,y_true):
        samples = len(dvalues)
        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true,axis=1)
        self.dinputs = dvalues.copy()
        self.dinputs[range(samples),y_true] -= 1
        self.dinputs = self.dinputs / samples
        
class Optimizer_SGD:
    def __init__(self,learning_rate=1.,decay=0.,momentum=0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum
        
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))
    
    def update_params(self,layer):
        if self.momentum:
            if not hasattr(layer,'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.biases)
            # past - future direction
            weight_updates = self.momentum * layer.weight_momentums - self.current_learning_rate * layer.dweights
            layer.weight_momentums = weight_updates
            bias_updates = self.momentum * layer.bias_momentums - self.current_learning_rate * layer.dbiases
            layer.bias_momentums = bias_updates
        else:
            weight_updates = -self.current_learning_rate * layer.dweights
            bias_updates = -self.current_learning_rate * layer.dbiases
        layer.weights += weight_updates
        layer.biases += bias_updates
    
    def post_update_params(self):
        self.iterations += 1  

class Optimizer_Adagrad:
    def __init__(self,learning_rate=1.,decay=0.,epsilon=1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))
            
    def update_params(self,layer):
        if not hasattr(layer,'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)
        layer.weight_cache += layer.dweights**2
        layer.bias_cache += layer.dbiases**2
        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / np.sqrt(layer.bias_cache) + self.epsilon
    
    def post_update_params(self):
        self.iterations += 1
        
class Optimizer_RMSprop:
    def __init__(self,learning_rate=0.001,decay=0.,epsilon=1e-7,rho=0.9):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho
        
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))
            
    def update_params(self,layer):
        if not hasattr(layer,'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)
        layer.weight_cache = self.rho * layer.weight_cache + (1 - self.rho) * layer.dweights ** 2
        layer.bias_cache = self.rho * layer.bias_cache + ( 1 - self.rho) * layer.dbiases ** 2
        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)
        
    def post_update_params(self):
        self.iterations += 1
        
class Optimizer_Adam:
    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, beta_1=0.9, beta_2=0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2
    
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))
    
    def update_params(self, layer):
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)
        layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases
        weight_momentums_corrected = layer.weight_momentums / (1 - self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = layer.bias_momentums / (1 - self.beta_1 ** (self.iterations + 1))
        layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights**2
        layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases**2
        weight_cache_corrected = layer.weight_cache / (1 - self.beta_2 ** (self.iterations + 1))
        bias_cache_corrected = layer.bias_cache / (1 - self.beta_2 ** (self.iterations + 1))
        layer.weights += -self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
        layer.biases += -self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)
    
    def post_update_params(self):
        self.iterations += 1

In [10]:
X, y = spiral_data(samples=100, classes=3)

dense1 = Layer_Dense(2, 64,weight_regularizer_l2=5e-4,bias_regularizer_l2=5e-4)
activation1 = Activation_RELU()
dense2 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
optimizer = Optimizer_Adam(learning_rate=0.05, decay=5e-7)
epoches = []
accuracies = []
losses = []
learning_rates = []

In [None]:
# Training loop
for epoch in tqdm(range(5001),desc="Training the model"):
    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    
    # Loss calculation
    data_loss = loss_activation.forward(dense2.output, y)
    regularization_loss = loss_activation.loss.regularization_loss(dense1) + loss_activation.loss.regularization_loss(dense2)
    loss = data_loss + regularization_loss
    
    # Calculate accuracy
    predictions = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions == y)
    
    # Print progress every 100 epochs
    if not epoch % 100:
        print(f'epoch: {epoch}, ' +
              f'acc: {accuracy:.3f}, ' +
              f'loss: {loss:.3f}, ' +
              f'lr: {optimizer.current_learning_rate}')
        epoches.append(epoch)
        accuracies.append(accuracy)
        losses.append(loss)
        learning_rates.append(optimizer.current_learning_rate)
    
    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)
    
    # Update parameters
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

Training the model:   4%|▍         | 225/5001 [00:00<00:04, 1098.40it/s]

epoch: 0, acc: 0.343, loss: 1.099, lr: 0.05
epoch: 100, acc: 0.757, loss: 0.724, lr: 0.04999752512250644
epoch: 200, acc: 0.800, loss: 0.574, lr: 0.04999502549496326


Training the model:  10%|█         | 504/5001 [00:00<00:03, 1296.48it/s]

epoch: 300, acc: 0.843, loss: 0.518, lr: 0.049992526117345455
epoch: 400, acc: 0.867, loss: 0.491, lr: 0.04999002698961558
epoch: 500, acc: 0.860, loss: 0.450, lr: 0.049987528111736124


Training the model:  16%|█▌        | 778/5001 [00:00<00:03, 1336.84it/s]

epoch: 600, acc: 0.870, loss: 0.438, lr: 0.049985029483669646
epoch: 700, acc: 0.880, loss: 0.418, lr: 0.049982531105378675
epoch: 800, acc: 0.843, loss: 0.437, lr: 0.04998003297682575


Training the model:  21%|██        | 1045/5001 [00:00<00:03, 1261.84it/s]

epoch: 900, acc: 0.887, loss: 0.395, lr: 0.049977535097973466
epoch: 1000, acc: 0.883, loss: 0.386, lr: 0.049975037468784345
epoch: 1100, acc: 0.890, loss: 0.378, lr: 0.049972540089220974


Training the model:  29%|██▉       | 1448/5001 [00:01<00:02, 1279.72it/s]

epoch: 1200, acc: 0.893, loss: 0.373, lr: 0.04997004295924593
epoch: 1300, acc: 0.877, loss: 0.400, lr: 0.04996754607882181
epoch: 1400, acc: 0.903, loss: 0.358, lr: 0.049965049447911185


Training the model:  34%|███▍      | 1711/5001 [00:01<00:02, 1294.46it/s]

epoch: 1500, acc: 0.907, loss: 0.352, lr: 0.04996255306647668
epoch: 1600, acc: 0.907, loss: 0.347, lr: 0.049960056934480884
epoch: 1700, acc: 0.907, loss: 0.340, lr: 0.04995756105188642


Training the model:  40%|███▉      | 1976/5001 [00:01<00:02, 1282.90it/s]

epoch: 1800, acc: 0.903, loss: 0.338, lr: 0.049955065418655915
epoch: 1900, acc: 0.907, loss: 0.330, lr: 0.04995257003475201
epoch: 2000, acc: 0.913, loss: 0.335, lr: 0.04995007490013731


Training the model:  45%|████▍     | 2235/5001 [00:01<00:02, 1267.48it/s]

epoch: 2100, acc: 0.890, loss: 0.359, lr: 0.0499475800147745
epoch: 2200, acc: 0.903, loss: 0.324, lr: 0.0499450853786262
epoch: 2300, acc: 0.920, loss: 0.316, lr: 0.0499425909916551


Training the model:  53%|█████▎    | 2642/5001 [00:02<00:01, 1280.80it/s]

epoch: 2400, acc: 0.913, loss: 0.312, lr: 0.04994009685382384
epoch: 2500, acc: 0.907, loss: 0.315, lr: 0.04993760296509512
epoch: 2600, acc: 0.883, loss: 0.372, lr: 0.049935109325431604


Training the model:  58%|█████▊    | 2912/5001 [00:02<00:01, 1310.21it/s]

epoch: 2700, acc: 0.910, loss: 0.315, lr: 0.049932615934796004
epoch: 2800, acc: 0.910, loss: 0.310, lr: 0.04993012279315098
epoch: 2900, acc: 0.910, loss: 0.307, lr: 0.049927629900459285


Training the model:  64%|██████▎   | 3181/5001 [00:02<00:01, 1312.21it/s]

epoch: 3000, acc: 0.910, loss: 0.304, lr: 0.049925137256683606
epoch: 3100, acc: 0.913, loss: 0.301, lr: 0.04992264486178666
epoch: 3200, acc: 0.913, loss: 0.298, lr: 0.04992015271573119


Training the model:  69%|██████▉   | 3456/5001 [00:02<00:01, 1315.82it/s]

epoch: 3300, acc: 0.917, loss: 0.295, lr: 0.04991766081847992
epoch: 3400, acc: 0.913, loss: 0.292, lr: 0.049915169169995596
epoch: 3500, acc: 0.917, loss: 0.289, lr: 0.049912677770240964


Training the model:  77%|███████▋  | 3867/5001 [00:02<00:00, 1339.81it/s]

epoch: 3600, acc: 0.917, loss: 0.287, lr: 0.049910186619178794
epoch: 3700, acc: 0.913, loss: 0.286, lr: 0.04990769571677183
epoch: 3800, acc: 0.910, loss: 0.297, lr: 0.04990520506298287


Training the model:  83%|████████▎ | 4144/5001 [00:03<00:00, 1362.22it/s]

epoch: 3900, acc: 0.913, loss: 0.295, lr: 0.04990271465777467
epoch: 4000, acc: 0.920, loss: 0.284, lr: 0.049900224501110035
epoch: 4100, acc: 0.913, loss: 0.279, lr: 0.04989773459295174


Training the model:  86%|████████▌ | 4281/5001 [00:03<00:00, 1323.66it/s]

epoch: 4200, acc: 0.917, loss: 0.275, lr: 0.04989524493326262
epoch: 4300, acc: 0.917, loss: 0.274, lr: 0.04989275552200545
epoch: 4400, acc: 0.917, loss: 0.271, lr: 0.04989026635914307


Training the model:  93%|█████████▎| 4668/5001 [00:03<00:00, 1146.36it/s]

epoch: 4500, acc: 0.920, loss: 0.269, lr: 0.04988777744463829
epoch: 4600, acc: 0.917, loss: 0.270, lr: 0.049885288778453954
epoch: 4700, acc: 0.923, loss: 0.265, lr: 0.049882800360552884


Training the model: 100%|██████████| 5001/5001 [00:03<00:00, 1262.33it/s]

epoch: 4800, acc: 0.917, loss: 0.263, lr: 0.04988031219089794
epoch: 4900, acc: 0.917, loss: 0.264, lr: 0.049877824269451976
epoch: 5000, acc: 0.920, loss: 0.281, lr: 0.04987533659617785





In [13]:
X_test, y_test = spiral_data(samples=100, classes=3)
dense1.forward(X_test)
activation1.forward(dense1.output)
dense2.forward(activation1.output) 
loss = loss_activation.forward(dense2.output, y_test)
predictions = np.argmax(loss_activation.output, axis=1) 
if len(y_test.shape) == 2: 
    y_test = np.argmax(y_test, axis=1) 
accuracy = np.mean(predictions==y_test)
print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}') 

validation, acc: 0.817, loss: 0.532


## Dropout

In [15]:
class Layer_Dropout:
    def __init__(self,rate):
        self.rate = 1 - rate
        
    def forward(self,inputs):
        self.inputs = inputs
        self.binary_mask = np.random.binomial(1,self.rate,size=inputs.shape) / self.rate
        self.output = inputs * self.binary_mask
        
    def backward(self,dvalues):
        self.dinputs = dvalues * self.binary_mask

In [20]:
dense1 = Layer_Dense(2, 512,weight_regularizer_l2=5e-4,bias_regularizer_l2=5e-4)
activation1 = Activation_RELU()
dropout_1 = Layer_Dropout(0.1)
dense2 = Layer_Dense(512, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
optimizer = Optimizer_Adam(learning_rate=0.05, decay=5e-5)

In [21]:
# Training loop
for epoch in tqdm(range(5001),desc="Training the model"):
    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)
    dropout_1.forward(activation1.output)
    dense2.forward(dropout_1.output)
    
    # Loss calculation
    data_loss = loss_activation.forward(dense2.output, y)
    regularization_loss = loss_activation.loss.regularization_loss(dense1) + loss_activation.loss.regularization_loss(dense2)
    loss = data_loss + regularization_loss
    
    # Calculate accuracy
    predictions = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions == y)
    
    # Print progress every 100 epochs
    if not epoch % 100:
        print(f'epoch: {epoch}, ' +
              f'acc: {accuracy:.3f}, ' +
              f'loss: {loss:.3f}, ' +
              f'lr: {optimizer.current_learning_rate}')
        epoches.append(epoch)
        accuracies.append(accuracy)
        losses.append(loss)
        learning_rates.append(optimizer.current_learning_rate)
    
    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    dropout_1.backward(dense2.dinputs)
    activation1.backward(dropout_1.dinputs)
    dense1.backward(activation1.dinputs)
    
    # Update parameters
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

Training the model:   0%|          | 17/5001 [00:00<01:02, 80.05it/s]

epoch: 0, acc: 0.297, loss: 1.099, lr: 0.05


Training the model:   2%|▏         | 116/5001 [00:01<00:59, 82.56it/s]

epoch: 100, acc: 0.693, loss: 0.729, lr: 0.04975371909050202


Training the model:   4%|▍         | 215/5001 [00:02<00:56, 85.29it/s]

epoch: 200, acc: 0.753, loss: 0.638, lr: 0.049507401356502806


Training the model:   6%|▌         | 308/5001 [00:03<01:03, 74.17it/s]

epoch: 300, acc: 0.767, loss: 0.648, lr: 0.0492635105177595


Training the model:   8%|▊         | 411/5001 [00:05<01:02, 73.51it/s]

epoch: 400, acc: 0.780, loss: 0.604, lr: 0.04902201088288642


Training the model:  10%|█         | 511/5001 [00:06<00:54, 83.02it/s]

epoch: 500, acc: 0.790, loss: 0.594, lr: 0.048782867456949125


Training the model:  12%|█▏        | 610/5001 [00:07<01:02, 69.71it/s]

epoch: 600, acc: 0.830, loss: 0.544, lr: 0.04854604592455945


Training the model:  14%|█▍        | 720/5001 [00:09<00:49, 85.64it/s]

epoch: 700, acc: 0.833, loss: 0.577, lr: 0.048311512633460556


Training the model:  16%|█▋        | 816/5001 [00:10<00:51, 80.97it/s]

epoch: 800, acc: 0.780, loss: 0.628, lr: 0.04807923457858551


Training the model:  18%|█▊        | 916/5001 [00:11<00:48, 83.56it/s]

epoch: 900, acc: 0.787, loss: 0.565, lr: 0.04784917938657352


Training the model:  20%|██        | 1016/5001 [00:12<00:47, 83.80it/s]

epoch: 1000, acc: 0.823, loss: 0.539, lr: 0.04762131530072861


Training the model:  22%|██▏       | 1108/5001 [00:14<00:46, 83.88it/s]

epoch: 1100, acc: 0.827, loss: 0.533, lr: 0.04739561116640599


Training the model:  24%|██▍       | 1212/5001 [00:15<00:43, 86.73it/s]

epoch: 1200, acc: 0.843, loss: 0.502, lr: 0.04717203641681212


Training the model:  26%|██▌       | 1309/5001 [00:16<00:51, 71.97it/s]

epoch: 1300, acc: 0.837, loss: 0.540, lr: 0.04695056105920466


Training the model:  28%|██▊       | 1406/5001 [00:18<00:51, 69.31it/s]

epoch: 1400, acc: 0.850, loss: 0.523, lr: 0.04673115566147951


Training the model:  30%|███       | 1511/5001 [00:19<00:42, 82.55it/s]

epoch: 1500, acc: 0.830, loss: 0.518, lr: 0.046513791339132055


Training the model:  32%|███▏      | 1614/5001 [00:20<00:51, 66.40it/s]

epoch: 1600, acc: 0.830, loss: 0.500, lr: 0.04629843974258068


Training the model:  34%|███▍      | 1710/5001 [00:22<00:44, 73.86it/s]

epoch: 1700, acc: 0.823, loss: 0.523, lr: 0.046085073044840774


Training the model:  36%|███▌      | 1807/5001 [00:23<00:51, 61.48it/s]

epoch: 1800, acc: 0.837, loss: 0.488, lr: 0.04587366392953806


Training the model:  38%|███▊      | 1912/5001 [00:25<00:47, 64.44it/s]

epoch: 1900, acc: 0.853, loss: 0.498, lr: 0.04566418557925019


Training the model:  40%|████      | 2018/5001 [00:26<00:38, 76.69it/s]

epoch: 2000, acc: 0.780, loss: 0.579, lr: 0.045456611664166556


Training the model:  42%|████▏     | 2117/5001 [00:28<00:37, 76.30it/s]

epoch: 2100, acc: 0.847, loss: 0.519, lr: 0.045250916331055706


Training the model:  44%|████▍     | 2206/5001 [00:29<00:35, 77.74it/s]

epoch: 2200, acc: 0.853, loss: 0.525, lr: 0.0450470741925312


Training the model:  46%|████▋     | 2313/5001 [00:31<00:43, 62.19it/s]

epoch: 2300, acc: 0.797, loss: 0.664, lr: 0.04484506031660612


Training the model:  48%|████▊     | 2411/5001 [00:32<00:30, 85.89it/s]

epoch: 2400, acc: 0.830, loss: 0.569, lr: 0.04464485021652753


Training the model:  50%|█████     | 2519/5001 [00:33<00:28, 88.30it/s]

epoch: 2500, acc: 0.817, loss: 0.521, lr: 0.044446419840881816


Training the model:  52%|█████▏    | 2610/5001 [00:34<00:31, 76.32it/s]

epoch: 2600, acc: 0.860, loss: 0.517, lr: 0.04424974556396301


Training the model:  54%|█████▍    | 2717/5001 [00:36<00:26, 85.78it/s]

epoch: 2700, acc: 0.843, loss: 0.540, lr: 0.04405480417639544


Training the model:  56%|█████▌    | 2808/5001 [00:37<00:26, 81.35it/s]

epoch: 2800, acc: 0.853, loss: 0.505, lr: 0.04386157287600334


Training the model:  58%|█████▊    | 2911/5001 [00:38<00:25, 83.50it/s]

epoch: 2900, acc: 0.867, loss: 0.511, lr: 0.04367002925891961


Training the model:  60%|██████    | 3010/5001 [00:39<00:24, 81.42it/s]

epoch: 3000, acc: 0.833, loss: 0.535, lr: 0.043480151310926564


Training the model:  62%|██████▏   | 3111/5001 [00:41<00:21, 88.19it/s]

epoch: 3100, acc: 0.867, loss: 0.457, lr: 0.04329191739902161


Training the model:  64%|██████▍   | 3210/5001 [00:42<00:27, 64.10it/s]

epoch: 3200, acc: 0.807, loss: 0.599, lr: 0.043105306263201


Training the model:  66%|██████▋   | 3314/5001 [00:44<00:21, 78.42it/s]

epoch: 3300, acc: 0.840, loss: 0.485, lr: 0.0429202970084553


Training the model:  68%|██████▊   | 3412/5001 [00:45<00:22, 70.69it/s]

epoch: 3400, acc: 0.800, loss: 0.554, lr: 0.04273686909696996


Training the model:  70%|███████   | 3512/5001 [00:46<00:18, 80.95it/s]

epoch: 3500, acc: 0.827, loss: 0.506, lr: 0.04255500234052514


Training the model:  72%|███████▏  | 3612/5001 [00:48<00:17, 79.16it/s]

epoch: 3600, acc: 0.847, loss: 0.545, lr: 0.042374676893088686


Training the model:  74%|███████▍  | 3712/5001 [00:49<00:15, 83.85it/s]

epoch: 3700, acc: 0.840, loss: 0.461, lr: 0.042195873243596776


Training the model:  76%|███████▌  | 3811/5001 [00:50<00:14, 82.46it/s]

epoch: 3800, acc: 0.830, loss: 0.590, lr: 0.04201857220891634


Training the model:  78%|███████▊  | 3915/5001 [00:52<00:13, 78.24it/s]

epoch: 3900, acc: 0.843, loss: 0.537, lr: 0.041842754926984395


Training the model:  80%|████████  | 4010/5001 [00:53<00:16, 60.64it/s]

epoch: 4000, acc: 0.843, loss: 0.494, lr: 0.04166840285011875


Training the model:  82%|████████▏ | 4116/5001 [00:54<00:10, 84.28it/s]

epoch: 4100, acc: 0.807, loss: 0.541, lr: 0.041495497738495375


Training the model:  84%|████████▍ | 4215/5001 [00:55<00:09, 84.56it/s]

epoch: 4200, acc: 0.843, loss: 0.506, lr: 0.041324021653787346


Training the model:  86%|████████▌ | 4310/5001 [00:57<00:09, 70.15it/s]

epoch: 4300, acc: 0.823, loss: 0.550, lr: 0.041153956952961035


Training the model:  88%|████████▊ | 4413/5001 [00:58<00:07, 77.52it/s]

epoch: 4400, acc: 0.790, loss: 0.520, lr: 0.040985286282224684


Training the model:  90%|█████████ | 4515/5001 [00:59<00:06, 75.47it/s]

epoch: 4500, acc: 0.830, loss: 0.564, lr: 0.04081799257112535


Training the model:  92%|█████████▏| 4605/5001 [01:00<00:04, 84.43it/s]

epoch: 4600, acc: 0.840, loss: 0.598, lr: 0.04065205902678971


Training the model:  94%|█████████▍| 4713/5001 [01:02<00:03, 80.91it/s]

epoch: 4700, acc: 0.840, loss: 0.482, lr: 0.04048746912830479


Training the model:  96%|█████████▌| 4807/5001 [01:03<00:02, 74.16it/s]

epoch: 4800, acc: 0.863, loss: 0.436, lr: 0.04032420662123473


Training the model:  98%|█████████▊| 4914/5001 [01:05<00:01, 83.63it/s]

epoch: 4900, acc: 0.837, loss: 0.594, lr: 0.04016225551226957


Training the model: 100%|██████████| 5001/5001 [01:06<00:00, 75.50it/s]

epoch: 5000, acc: 0.837, loss: 0.511, lr: 0.04000160006400256





In [22]:
X_test, y_test = spiral_data(samples=100, classes=3)
dense1.forward(X_test)
activation1.forward(dense1.output)
dropout_1.forward(activation1.output)
dense2.forward(dropout_1.output) 
loss = loss_activation.forward(dense2.output, y_test)
predictions = np.argmax(loss_activation.output, axis=1) 
if len(y_test.shape) == 2: 
    y_test = np.argmax(y_test, axis=1) 
accuracy = np.mean(predictions==y_test)
print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}') 

validation, acc: 0.767, loss: 0.861
