In [2]:
import numpy as np

def kl_divergence(p, q):
    """Return the Kullback-Leibler divergence KL(p || q) in nats.

    Computes ``sum_i p[i] * log(p[i] / q[i])`` using the natural logarithm.

    Parameters
    ----------
    p, q : array-like of float
        Probabilities of the two distributions, given over the same outcomes
        in the same order. They must have identical shapes.

    Returns
    -------
    numpy.float64
        The divergence. It is ``inf`` when some outcome has ``p > 0`` but
        ``q == 0``.

    Raises
    ------
    ValueError
        If ``p`` and ``q`` do not have the same shape.
    """
    p_arr = np.asarray(p, dtype=float)
    q_arr = np.asarray(q, dtype=float)
    if p_arr.shape != q_arr.shape:
        raise ValueError(
            f"p and q must have the same shape, got {p_arr.shape} and {q_arr.shape}"
        )

    # By convention 0 * log(0 / q) contributes 0. Evaluating it literally
    # gives 0 * -inf = nan, so outcomes with p == 0 are left out of the sum.
    support = p_arr != 0

    # p > 0 with q == 0 legitimately yields an infinite divergence; silence
    # the divide-by-zero warning and let the inf propagate to the caller.
    with np.errstate(divide="ignore"):
        terms = p_arr[support] * np.log(p_arr[support] / q_arr[support])
    return np.sum(terms)


# Two example discrete distributions over the same three outcomes
# (each list sums to 1).
p = [0.10, 0.40, 0.50]
q = [0.80, 0.15, 0.05]
# Last expression of the cell, so its value is displayed as the cell output.
kl_divergence(p, q)

1.3356800935337299

In [3]:
import numpy as np

# Defining the softmax function
def softmax(values):
    """Naive softmax: exponentiate each element and divide by the total.

    No max-shift is applied before ``np.exp``, so this version is only
    suitable for inputs of moderate magnitude.

    Parameters
    ----------
    values : array-like
        Input scores.

    Returns
    -------
    numpy.ndarray
        ``exp(values) / sum(exp(values))``, normalised over all elements.
    """
    exponentials = np.exp(values)
    return exponentials / np.sum(exponentials)

The softmax function is prone to two numerical issues: overflow and underflow.

Overflow: It occurs when very large numbers are approximated as infinity

Underflow: It occurs when very small numbers (close to zero) are rounded to zero

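To combat these issues when computing softmax, a common trick is to shift the input vector by subtracting its maximum element from every element. For the input vector $x$, define $z = x - \max_i x_i$. Softmax is unchanged when the same constant $c$ is added to every element:
$$\mathrm{softmax}(x) = \mathrm{softmax}(x + c)$$
Choosing $c = -\max_i x_i$ therefore gives $\mathrm{softmax}(z) = \mathrm{softmax}(x)$, while the largest exponent becomes $0$, so $e^{z_i} \le 1$ cannot overflow and the denominator is at least $1$.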

In [4]:
def stable_softmax(x, axis=-1):
    """Numerically stable softmax along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating. Softmax
    is unchanged by adding a constant to every element, so the result is the
    same, but the largest exponent becomes 0: ``np.exp`` cannot overflow and
    the denominator is at least 1.

    Parameters
    ----------
    x : array-like
        Input scores. Lists and integer arrays are accepted and converted to
        a float array.
    axis : int, optional
        Axis along which to normalise. The default (-1) normalises the whole
        vector for 1-D input and each row for 2-D input.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    scores = np.asarray(x, dtype=float)
    # keepdims=True keeps the reduced axis so the shift and the normaliser
    # broadcast back against the original shape.
    shifted = scores - np.max(scores, axis=axis, keepdims=True)
    numerator = np.exp(shifted)
    denominator = np.sum(numerator, axis=axis, keepdims=True)
    return numerator / denominator

In [5]:
# Small 1-D example input for the stable softmax.
vec = np.array([1, 2, 3, 4, 5])
# Last expression of the cell, so the resulting array is displayed as output.
stable_softmax(vec)

array([0.01165623, 0.03168492, 0.08612854, 0.23412166, 0.63640865])

In [6]:
import torch.nn as nn
import torch

# Cross Entropy function.
def cross_entropy(y_pred, y_true):
    """Cross-entropy between target weights and softmax-normalised scores.

    Computes ``-sum_i y_true[i] * log(softmax(y_pred)[i])``. The
    log-probabilities are obtained directly with the log-sum-exp trick
    (shifting by the maximum score), so large scores neither overflow in
    ``exp`` nor underflow to ``log(0)``.

    Parameters
    ----------
    y_pred : array-like of float
        Raw, unnormalised scores (logits), 1-D.
    y_true : array-like of float
        Target weights with the same shape as ``y_pred``, e.g. a one-hot
        vector.

    Returns
    -------
    numpy.float64
        The loss in nats (``0.0`` for empty input).

    Raises
    ------
    ValueError
        If ``y_pred`` and ``y_true`` do not have the same shape.
    """
    logits = np.asarray(y_pred, dtype=float)
    targets = np.asarray(y_true, dtype=float)
    if logits.shape != targets.shape:
        raise ValueError(
            "y_pred and y_true must have the same shape, "
            f"got {logits.shape} and {targets.shape}"
        )
    if logits.size == 0:
        # Nothing to sum over; an empty loss is zero.
        return 0.0

    # log softmax(x)_i = (x_i - m) - log(sum_j exp(x_j - m)) with m = max(x).
    # After the shift every exponent is <= 0 and one of them is exactly 0.
    shifted = logits - np.max(logits)
    log_probs = shifted - np.log(np.sum(np.exp(shifted)))

    # Only classes with a non-zero target weight contribute to the loss.
    active = targets != 0
    return np.sum(-targets[active] * log_probs[active])

# One-hot target: the true class is index 0.
y_true = [1, 0, 0, 0, 0]
# Raw, unnormalised scores; cross_entropy normalises them internally.
y_pred = [10, 5, 3, 1, 4]
# Last expression of the cell, so the loss value is displayed as output.
cross_entropy(y_pred, y_true)

0.010199795719758164