[Chapter 3] Distance Metrics

In [547]:
import numpy as np
import pandas as pd

from scipy.stats import multivariate_normal
import scipy.stats as ss
from sklearn.metrics import mutual_info_score

import matplotlib.pyplot as plt

generate random variables

In [548]:
# fix the global NumPy seed so every run draws the same sample
np.random.seed(0)

# number of partitions per dimension used for the discretization below
bins = 10

# x is standard normal; y depends linearly on x plus a small Gaussian noise term
x = np.random.randn(1000)
noise = 0.1 * np.random.randn(1000)
y = 0.5 * x + noise

In [549]:
# Discretize (x, y) on a regular grid with `bins` partitions per dimension.
# np.histogram2d returns (counts, x_edges, y_edges); only the joint counts are kept.
cXY, _, _ = np.histogram2d(x, y, bins)

# sanity check: the cell counts must add up to the number of observations
print(cXY.sum())
cXY

1000.0


array([[  8.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  4.,  16.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   9.,  56.,   5.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,  18., 110.,  18.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,  31., 162.,  24.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,  34., 171.,  34.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   1.,  32., 112.,  15.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0.,  16.,  50.,  20.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   7.,  22.,   9.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   2.,  13.]])

marginal entropy

In [550]:
# Entropy of X: the expected "surprise", i.e. the amount of uncertainty associated with X.
#   hX = -sum{ p(x) * log(p(x)) }
# -log(p(x)) measures how surprising an outcome is: low-probability outcomes are the surprising ones.
cX, _ = np.histogram(x, bins)  # counts per bin
hX = ss.entropy(cX)            # ss.entropy normalizes the counts into probabilities
print(hX)
# bins = 1  -> hX = 0 (zero entropy when all probability is concentrated in a single element)
# bins = 10 -> hX is at most log(10) ~ 2.303, approached as X becomes more uniformly distributed

# cross-check against the explicit formula
pX = cX / cX.sum()                         # probabilities
hX_alt = -(pX * np.log(pX + 1e-10)).sum()  # the 1e-10 keeps log() finite if a bin is empty
print(hX_alt)

1.9528011589708985
1.9528011579708984


In [551]:
# Entropy of Y, computed from its marginal bin counts in the same way as hX
y_counts, _ = np.histogram(y, bins)
hY = ss.entropy(y_counts)
print(hY)

1.9943498307707825


mutual information

In [552]:
# Mutual information: the decrease in uncertainty (information gain) about X
# that results from knowing the value of Y.
#   iXY = hX - hX_Y = hX + hY - hXY
#   iXX = hX
#   iXY = 0 when X and Y are independent
#   iXY <= min{hX, hY}
# Author's note: for pure noise, y = 0*x + 1*e, iXY = 0.048 -> 0
iXY = mutual_info_score(None, None, contingency=cXY)
print(iXY)

# normalized mutual information: scale by the upper bound min{hX, hY}
upper_bound = min(hX, hY)
iXYn = iXY / upper_bound
print(iXYn)

1.2222238074758958
0.6258823648588945


joint entropy

In [553]:
# joint entropy from the identity H(X,Y) = H(X) + H(Y) - I(X;Y)
hXY = (hX + hY) - iXY
hXY

2.7249271822657857

conditional entropy

In [554]:
# H(X|Y) = H(X,Y) - H(Y): the conditional entropy of X given Y
# H(Y|X) = H(X,Y) - H(X): the conditional entropy of Y given X
hX_Y, hY_X = hXY - hY, hXY - hX

(hX_Y, hY_X)

(0.7305773514950031, 0.7721260232948872)

variation of information

In [555]:
# vXY = hXY - iXY = 2*hXY - hX - hY = hX + hY - 2*iXY = hX_Y + hY_X
# the uncertainty we expect in one variable if we are told the value of the other

def varInfo(x, y, bins, norm=False):
    """
    Variation of information between two samples after discretization.

    Both samples are binned on a regular grid and the score is computed as
    vXY = hX + hY - 2*iXY, where hX and hY are the marginal entropies of the
    binned samples and iXY is the mutual information of the joint counts.

    Parameters
    ----------
    x, y : array-like
        Paired observations; forwarded to np.histogram2d and np.histogram.
    bins : int
        Number of bins per dimension. The same value is passed to
        np.histogram2d and to both np.histogram calls.
    norm : bool, default False
        If True, divide the score by the joint entropy hXY = hX + hY - iXY.

    Returns
    -------
    float
        The variation of information, or its normalized version when
        norm=True. If the joint entropy is zero (every observation falls in
        a single joint cell, e.g. two constant series), the normalized score
        is returned as 0 instead of the NaN that 0/0 would produce.
    """
    cXY = np.histogram2d(x, y, bins)[0]  # joint counts
    iXY = mutual_info_score(None, None, contingency=cXY)
    hX = ss.entropy(np.histogram(x, bins)[0])  # marginal entropy of x
    hY = ss.entropy(np.histogram(y, bins)[0])  # marginal entropy of y
    vXY = hX + hY - 2 * iXY
    if norm:
        hXY = hX + hY - iXY  # joint entropy
        # A zero joint entropy means hX, hY and therefore vXY are all zero;
        # skip the division so the result stays 0 rather than becoming NaN.
        if hXY > 0:
            vXY /= hXY  # normalized variation of information
    return vXY

# variation of information for the sample generated above
vXY = varInfo(x, y, bins=bins)
print(vXY)

# cross-check against two equivalent identities:
#   H(X,Y) - I(X;Y)   and   H(X|Y) + H(Y|X)
vXY_from_joint = hXY - iXY
vXY_from_conditionals = hX_Y + hY_X
print(vXY_from_joint, vXY_from_conditionals)

1.5027033747898897
1.50270337478989 1.5027033747898904


on discretized continuous random variables

In [None]:
def numBins(nObs, corr=None):
    """
    optimal number of bins for discretization
    """
