# Kullback-Leibler divergence

In [20]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy.stats import binom

## Functions

In [2]:
def calculate_KL_divergence(P, Q):
    """Compute the Kullback-Leibler divergence D(P || Q) in nats.

    Evaluates ``sum_x P(x) * log(P(x) / Q(x))`` with the natural logarithm.

    Parameters
    ----------
    P, Q : array-like of float
        Probabilities of the same outcomes, listed in the same order. Lists,
        NumPy arrays and pandas Series are accepted; values are paired by
        position, not by index label.

    Returns
    -------
    numpy.float64
        The divergence. Outcomes with ``P(x) == 0`` contribute 0 (the usual
        ``0 * log 0 = 0`` convention). The result is ``inf`` when some
        ``Q(x) == 0`` while ``P(x) > 0``.

    Raises
    ------
    ValueError
        If P and Q do not have the same length.
    """
    # Convert up front so indexing is positional even for a pandas Series
    # whose index is not 0..n-1.
    p = np.asarray(P, dtype=float)
    q = np.asarray(Q, dtype=float)

    if len(p) != len(q):
        raise ValueError(
            "ERROR: probability distributions P and Q are not equal in length."
        )

    # Skip entries where P(x) == 0: evaluating 0 * log(0) would yield nan and
    # poison the whole sum, whereas the term's limit is 0.
    nonzero = p != 0

    # Q(x) == 0 with P(x) > 0 legitimately gives an infinite divergence;
    # silence only the divide-by-zero warning that accompanies it.
    with np.errstate(divide="ignore"):
        return np.sum(p[nonzero] * np.log(p[nonzero] / q[nonzero]))

## Probability distributions

* Create distributions P (empirical, estimated from 10,000 samples of a Binomial(n=2, p=0.4) variable) and Q (uniform)
* Both distributions have discrete values 0, 1, and 2
* Store values in a Pandas DataFrame

In [9]:
# Empirical binomial distribution P, estimated by sampling Binomial(n=2, p=0.4).
decimal_points = 3
N_trials = 10000
random_seed = 42  # fixed so that Restart & Run All reproduces the same table
distribution_binom = binom.rvs(n=2, p=0.4, size=N_trials, random_state=random_seed)

# bincount with minlength=3 always yields one count per outcome (0, 1, 2),
# even if an outcome was never drawn, so P stays aligned with Q.
counts = np.bincount(distribution_binom, minlength=3)
P = pd.DataFrame(np.round(counts / N_trials, decimal_points))

# Uniform distribution Q over the same three outcomes.
Q = pd.DataFrame(np.round([1/3, 1/3, 1/3], decimal_points))

# Store both distributions side by side in a DataFrame.
# NOTE: after rounding to `decimal_points` the columns need not sum to exactly
# 1 (Q sums to 0.999), so KL values computed from `df` are approximate.
df = pd.concat([P, Q], axis=1)
df.columns = ['P', 'Q']
df

Unnamed: 0,P,Q
0,0.357,0.333
1,0.479,0.333
2,0.164,0.333


In [19]:
# Evaluate the hand-written KL divergence in both directions; the two
# directions are computed separately because D(P||Q) and D(Q||P) differ.
kl_forward = round(calculate_KL_divergence(df.P, df.Q), 6)
print(f"KL divergence (P||Q) = {kl_forward}")

kl_reverse = round(calculate_KL_divergence(df.Q, df.P), 6)
print(f"KL divergence (Q||P) = {kl_reverse}")

KL divergence (P||Q) = 0.082832
KL divergence (Q||P) = 0.091617


In [27]:
# SciPy cross-check: rel_entr gives the element-wise relative-entropy terms,
# and summing them with NumPy yields the KL divergence.
rel_entr = sp.special.rel_entr

kl_forward_scipy = round(np.sum(rel_entr(df.P, df.Q)), 6)
print(f"KL divergence (P||Q) = {kl_forward_scipy}")

kl_reverse_scipy = round(np.sum(rel_entr(df.Q, df.P)), 6)
print(f"KL divergence (Q||P) = {kl_reverse_scipy}")

KL divergence (P||Q) = 0.082832
KL divergence (Q||P) = 0.091617
