In [None]:
"""
Softmax and cross-entropy are two of the most common functions used in neural networks so we should
know how they work
"""

"""
-- Softmax --

Squashes the output to be between 0 and 1 so we get probabilities that add up to 1
Allows predictions to be made (one with highest probability chosen)
"""

In [1]:
import torch
import torch.nn as nn
import numpy as np

In [2]:
# Softmax in numpy

def softmax(x, axis=0):
    """Compute the softmax of ``x`` along ``axis``.

    Formula: softmax(x)_i = exp(x_i) / sum_j exp(x_j)

    Args:
        x: Array-like of raw scores (logits).
        axis: Dimension along which the probabilities are normalised.
            Defaults to 0, matching the original behaviour.

    Returns:
        np.ndarray with the same shape as ``x`` whose entries along ``axis``
        are non-negative and sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.exp keeps the (empty) shape and a float dtype
        return np.exp(x)
    # Softmax is unchanged when a constant is added to every score, so shift by the
    # maximum: the largest exponent becomes exp(0) = 1 and np.exp can no longer
    # overflow to inf (which previously produced nan via inf / inf).
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    # keepdims=True lets the denominator broadcast correctly for any axis
    return exps / np.sum(exps, axis=axis, keepdims=True)

# Example: turn three raw scores into probabilities with the NumPy softmax
x = np.array([2.0, 1.0, 0.1])
outputs = softmax(x)
# Expected printout -> Softmax numpy:  [0.65900114 0.24243297 0.09856589]
print("Softmax numpy: ", outputs)

Softmax numpy:  [0.65900114 0.24243297 0.09856589]


In [3]:
# The same example using PyTorch's built-in softmax

x = torch.tensor([2.0, 1.0, 0.1])
# dim=0 normalises across the only axis of this 1-D tensor
outputs = x.softmax(dim=0)

# Expected printout -> Softmax torch:  tensor([0.6590, 0.2424, 0.0986])
print("Softmax torch: ", outputs)

Softmax torch:  tensor([0.6590, 0.2424, 0.0986])


In [None]:
"""
Softmax is often combined with the cross-entropy loss
Cross-entropy is a type of loss that measures the performance of a classification model whose
output is a probability between 0 and 1. It can be used in multi-class problems, and the loss increases
as the predicted probability diverges from the actual label

The better our prediction, the lower our loss
    > y = real, y_hat = prediction probabilities
    > y has to be one-hot encoded meaning one 1 and the rest are 0 (only one correct output)
    > y_hat are the probabilities
    > y = [1,0,0], y_hat = [.7, .2, .1] has cross-entropy 0.35 which is good
    > y = [1,0,0], y_hat = [.1, .3, .6] has cross-entropy of 2.30 which is bad
"""

In [5]:
# Cross-entropy loss in numpy

def cross_entropy(actual, predicted):
    """Compute the (un-normalised) cross-entropy loss.

    Formula: loss = -sum_i actual_i * log(predicted_i)

    Terms whose target weight is 0 contribute exactly 0, following the usual
    convention 0 * log(0) = 0. Without this, a predicted probability of 0 for
    a non-target class evaluates to 0 * -inf = nan and poisons the whole sum.
    A predicted probability of 0 for the *true* class still yields inf.

    Args:
        actual: Array-like target distribution (one-hot encoded labels).
        predicted: Array-like of predicted probabilities, broadcastable
            against ``actual``.

    Returns:
        The summed loss as a NumPy float. Divide by the number of samples
        (e.g. ``predicted.shape[0]``) if a normalised loss is wanted.
    """
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)
    # Only positions with a non-zero target weight can contribute to the loss
    contributes = actual != 0
    # Swap ignored probabilities for 1.0 so their log is 0 instead of -inf
    safe_predicted = np.where(contributes, predicted, 1.0)
    loss = -np.sum(actual * np.log(safe_predicted))
    return loss

"""
y must be one hot encoded
If class = 0 [1 0 0]
If class = 1 [0 1 0]
If class = 2 [0 0 1]
"""

# One-hot target: class 0 is the correct class
Y = np.array([1, 0, 0])

# Predicted probability vectors (y_hat): the first favours the correct class,
# the second puts most of its mass on a wrong class
Y_pred_good = np.array([0.7, 0.2, 0.1])
Y_pred_bad = np.array([0.1, 0.3, 0.6])

# Score both predictions against the same target
l1, l2 = (cross_entropy(Y, Y_hat) for Y_hat in (Y_pred_good, Y_pred_bad))

# Expected: 0.3567 for the good prediction, 2.3026 for the bad one
print(f'Loss1 numpy: {l1:.4f}')
print(f'Loss2 numpy: {l2:.4f}')

Loss1 numpy: 0.3567
Loss2 numpy: 2.3026


In [8]:
# Cross-entropy in PyTorch

"""
Careful!
nn.CrossEntropyLoss() applies nn.LogSoftmax + nn.NLLLoss (negative log-likelihood loss)
    > No Softmax in last layer!
    > Y has class labels, not One-Hot!
    > Y_pred (y_hat) has raw scores (logits), no Softmax
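        - NOTE : A logit is a raw, unnormalised score that can take any value between -inf and inf,
          unlike a probability, which lies between 0 and 1
            > The logit function is the inverse of the sigmoid function: sigmoid maps any real number
              to (0, 1), and logit maps a probability in (0, 1) back to a real number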
"""

loss = nn.CrossEntropyLoss()

# Target is a class index rather than a one-hot vector: index 0 plays the role of [1 0 0]
Y = torch.tensor([0])

# Raw scores (logits) with shape n_samples x n_classes, here 1 x 3
Y_pred_good = torch.tensor([[2.0, 1.0, 0.1]])
Y_pred_bad = torch.tensor([[0.5, 2.0, 0.3]])

# The criterion takes the logits first and the class indices second
l1, l2 = loss(Y_pred_good, Y), loss(Y_pred_bad, Y)

print(f'Loss1 torch: {l1.item():.4f}') # 0.4170
print(f'Loss2 torch: {l2.item():.4f}') # 1.8406

# The predicted class is the index of the largest logit in each row;
# max over dim 1 returns (values, indices) and only the indices are kept

_, predictions1 = Y_pred_good.max(dim=1)
_, predictions2 = Y_pred_bad.max(dim=1)

print(f'prediciton 1: {predictions1}') # tensor([0])
print(f'prediciton 2: {predictions2}') # tensor([1])

# Meaning that index 0 is the highest for the first one, index 1 is the highest for the second one
# ... which is the case


Loss1 torch: 0.4170
Loss2 torch: 1.8406
prediciton 1: tensor([0])
prediciton 2: tensor([1])


In [9]:
# The loss in PyTorch allows for multiple samples
# We can increase samples to 3
# Listen to like 2:35 for what this means, it makes sense

loss = nn.CrossEntropyLoss()

# One class index per sample, so three samples need three labels
Y = torch.tensor([2, 0, 1])

# Logits with shape n_samples x n_classes, here 3 x 3.
# In the "good" batch every row peaks at its true class; in the "bad" batch
# the first two rows peak at a wrong class.
Y_pred_good = torch.tensor([[0.1, 1.0, 2.1], [2.0, 1.0, 0.1], [0.1, 3.0, 0.1]])
Y_pred_bad = torch.tensor([[2.1, 1.0, 0.1], [0.1, 1.0, 2.1], [0.1, 3.0, 0.1]])

# A single loss value is returned for the whole batch
l1, l2 = loss(Y_pred_good, Y), loss(Y_pred_bad, Y)

print(f'Loss1 torch: {l1.item():.4f}') # 0.3018
print(f'Loss2 torch: {l2.item():.4f}') # 1.6242

# Predicted class per sample = index of the largest logit in each row

_, predictions1 = Y_pred_good.max(dim=1)
_, predictions2 = Y_pred_bad.max(dim=1)

print(f'prediciton 1: {predictions1}') # tensor([2, 0, 1])
print(f'prediciton 2: {predictions2}') # tensor([0, 2, 1])

Loss1 torch: 0.3018
Loss2 torch: 1.6242
prediciton 1: tensor([2, 0, 1])
prediciton 2: tensor([0, 2, 1])


In [None]:
"""
For multiclass problems ("which animal is in the picture?") in PyTorch, use nn.CrossEntropyLoss()
    > No Softmax at the end!
For binary problems ("Is the picture a dog?") in PyTorch, use nn.BCELoss()
    > Sigmoid function at the end
        - > .5 = yes
        - < .5 = no
"""