# Chapter 4 - Activation Functions

Why do we use activation functions?
"We use activation functions because if
the activation function itself is nonlinear, it allows for neural networks with usually two or more
hidden layers to map nonlinear functions." (extracted from the original book)

# ReLU activation function class

In [None]:
!pip install nnfs

Collecting nnfs
  Downloading nnfs-0.5.1-py3-none-any.whl (9.1 kB)
Installing collected packages: nnfs
Successfully installed nnfs-0.5.1


In [None]:
import numpy as np
from nnfs.datasets import spiral_data
import nnfs

In [None]:
# ReLU activation class
class Activation_ReLU:
  """Rectified Linear Unit (ReLU) activation.

  Every negative input is replaced by 0; every other input is passed through
  unchanged.
  """

  def forward(self, inputs):
    """Apply ReLU element-wise and keep the result in ``self.output``.

    Nothing is returned: callers read the activated values from
    ``self.output`` after the call.
    """
    # Element-wise maximum against 0 clamps the negative values to zero
    rectified = np.maximum(0, inputs)
    self.output = rectified

In [None]:
# Softmax activation class
class Activation_Softmax:
  """Softmax activation: turns each row of scores into probabilities.

  Every row of the output is non-negative and sums to 1.
  """

  def forward(self, inputs):
    """Compute the row-wise softmax and keep it in ``self.output``.

    The normalisation runs along axis 1, so ``inputs`` needs one row per
    sample and one column per class. Nothing is returned: callers read the
    probabilities from ``self.output``.
    """
    # Shift each row so that its largest value becomes 0. The result of the
    # softmax is unchanged (the common factor cancels in the division), but
    # every exponent is now <= 0, so np.exp stays in (0, 1] and cannot overflow
    row_max = np.max(inputs, axis=1, keepdims=True)
    shifted = inputs - row_max

    # Unnormalized probabilities
    exponentials = np.exp(shifted)

    # keepdims=True keeps the sums as a column, so the division below
    # broadcasts row by row
    row_totals = np.sum(exponentials, axis=1, keepdims=True)

    self.output = exponentials / row_totals

# As we explained in the previous chapter, there are dead neurons, but there are also exploding neurons: neurons that output very large values. Very large values can make the network useless as training goes on. Besides, the exponential function grows very quickly: the bigger the input, the much bigger the output, so large inputs can easily overflow.

In [None]:
# Examples of exploding values, and why softmax shifts its inputs first

# The bigger the input, the (much) bigger the output of exp()
for exponent in (1, 10, 100):
  print(np.exp(exponent))
# np.exp(1000) is deliberately left out: the result is too large for a float,
# so NumPy returns inf and emits an overflow warning

# At the other end, exp(-inf) is 0 and exp(0) is 1
print(np.exp(-np.inf), np.exp(0))

# The trick: subtract the largest value of the output vector from every value.
# The largest value becomes 0 and all the others become negative, so every
# exponential lies between 0 and 1 (exp of a negative number is a small
# positive number, never a negative one) and nothing can overflow.
# The numerator and the sum in the denominator are therefore both positive,
# and the final probabilities are the same as without the subtraction,
# because the common factor exp(-max) cancels in the division.

# THE SUBTRACTION HELPS TO REDUCE THE MAGNITUDE OF THE NEURON OUTPUT

2.718281828459045
22026.465794806718
2.6881171418161356e+43
0.0 1.0


In [None]:
class Layer_Dense:
  """Fully connected (dense) layer: output = inputs . weights + biases."""

  def __init__(self, n_inputs, n_neurons):
    """
    Initialise the weights and biases of the layer.

    The weights matrix has shape n_inputs x n_neurons, so
    NUMBER OF WEIGHT VALUES = NUMBER OF INPUTS X NUMBER OF NEURONS.
    It is stored as inputs X neurons (and not neurons X inputs) so that the
    forward pass does not have to transpose it every time.

        4 inputs, 3 neurons
        wij -> weight, i -> input value, j -> neuron
        [w11, w12, w13]
        [w21, w22, w23]
        [w31, w32, w33]
        [w41, w42, w43]

        each column is already the vector of weights of one neuron

    random.randn draws values from a Gaussian distribution with a mean of 0
    and a variance of 1: positive and negative numbers centred at 0. Most of
    them fall between -1 and 1, but they are not limited to that range.

    The draws are multiplied by 0.01 so that the weights start with non-zero
    but small values; very big starting weights make the training last longer.

    The biases start at 0, ONE BIAS VALUE FOR EACH NEURON, stored as a single
    row of shape (1, n_neurons).
    """
    gaussian_draws = np.random.randn(n_inputs, n_neurons)
    self.weights = 0.01 * gaussian_draws
    self.biases = np.zeros((1, n_neurons))

  def forward(self, inputs):
    """Compute the layer output for ``inputs`` and keep it in ``self.output``.

    ``inputs`` holds one row per sample and n_inputs values per row. Nothing
    is returned: callers read the result from ``self.output``.
    """
    # Weighted sum of the inputs for every neuron ...
    weighted_sum = np.dot(inputs, self.weights)
    # ... plus the bias of each neuron (the single bias row is broadcast over
    # all the samples)
    self.output = weighted_sum + self.biases

    

In [None]:
# Build a small two-layer network and run ONE forward pass through it.
# Nothing is trained here: the weights keep their random initial values.

# NOTE(review): nnfs.init() presumably fixes the random seed and the default
# dtype so that the run is reproducible - confirm in the nnfs documentation.
# It is called first, before any data or layer is created.
nnfs.init()
# create dataset
X, y = spiral_data(samples=100, classes=3)  # samples per class

# -----------------model------------

# create dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 3)

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 3 input features (as we take output
# of previous layer here) and 3 output values
dense2 = Layer_Dense(3, 3)

# Create softmax activation (to be used with dense layer)
activation2 = Activation_Softmax()

# ----------------forward------------
# Each forward() call stores its result in the .output attribute of the
# object; the next step reads it from there.

# Perform a forward pass of our training data through this layer
dense1.forward(X)

# Forward pass through activation func.
# Takes in output from previous layer
activation1.forward(dense1.output)

# Make a forward pass through second dense layer
# it takes outputs of activation function of first layer as inputs
dense2.forward(activation1.output)

# in softmax, the number of inputs is the number of final classes
# here we have 3 classes
activation2.forward(dense2.output)


# Show the predicted class probabilities of the first 5 samples.
# The weights start very small (0.01 * randn) and the biases at 0, so every
# class gets a probability close to 1/3.
print(activation2.output[:5])


[[0.33333334 0.33333334 0.33333334]
 [0.3333332  0.3333332  0.33333364]
 [0.3333329  0.33333293 0.3333342 ]
 [0.3333326  0.33333263 0.33333477]
 [0.33333233 0.3333324  0.33333528]]


Sum, axis and keepdims

In [None]:
# example: 3 rows with 3 values each
layer_outputs = np.array([
    [4.8, 1.21, 2.385],
    [8.9, -1.81, 0.2],
    [1.41, 1.051, 0.026],
])

# No axis given: the default is axis=None, so every element of the array is
# added into a single number. Both calls print the same value
print(layer_outputs.sum())
print(layer_outputs.sum(axis=None))

# axis=0 adds the rows together, giving one total per column:
# [4.8, 1.21, 2.385] + [8.9, -1.81, 0.2] + [1.41, 1.051, 0.026] =
# = [15.11, 0.451, 2.611]
print(layer_outputs.sum(axis=0))

# axis=1 adds the values inside each row, giving one total per row.
# The result is a flat vector of shape (3,)
print(layer_outputs.sum(axis=1))
print(layer_outputs.sum(axis=1).shape)

# Later we need to divide the exp values by these sums row by row, so the
# sums must keep the layout of the layer outputs: keepdims=True returns
# them as a column of shape (3, 1) instead of a flat vector
print(layer_outputs.sum(axis=1, keepdims=True))
print(layer_outputs.sum(axis=1, keepdims=True).shape)

18.172
18.172
[15.11   0.451  2.611]
[8.395 7.29  2.487]
(3,)
[[8.395]
 [7.29 ]
 [2.487]]
(3, 1)
