|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 3:</h2>|<h1>Evaluating LLMs</h1>|
|<h2>Section:</h2>|<h1>Quantitative evaluations</h1>|
|<h2>Lecture:</h2>|<h1><b>KL (Kullback-Leibler) divergence</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F

# vector plots: ask the inline backend to render figures as SVG,
# so that they stay sharp at any zoom level
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Generate some data to obtain a distribution

In [None]:
# sample size
N = 10000

# seeded random generator so that "Restart & Run All" reproduces the same numbers
# (set SEED = None to get a fresh random draw on every run)
SEED = 0
rng = np.random.default_rng(SEED)

# log-normal data (and z-scored b/c we're interested in shape, not actual values)
data1 = np.exp( rng.standard_normal(N)*.5 )
data1 = (data1-data1.mean()) / np.std(data1)

# and a normal distribution (z-scored in the same way)
data2 = rng.standard_normal(N)
data2 = (data2-data2.mean()) / np.std(data2)



## their histograms (estimate of the probability distribution)
# bin edges need to be the same for both datasets, so that bin i of y1 and bin i of y2
# cover the same interval; the edges span the pooled range (80 edges -> 79 bins).
# The numpy .min()/.max() reductions are used on the arrays (the Python built-ins
# would loop over all N elements one by one).
edges = np.linspace(min(data1.min(),data2.min()),max(data1.max(),data2.max()),80)

y1 = np.histogram(data1,edges)[0]
y2 = np.histogram(data2,edges)[0]

# convert counts to proportions: each vector sums to 1
# (i.e., a discrete probability-mass estimate, not a density per unit of x)
y1 = y1 / y1.sum()
y2 = y2 / y2.sum()

In [None]:
## plot the distributions

# x-axis positions: the midpoint between each pair of neighboring bin edges
binCents = .5*(edges[1:]+edges[:-1])

# one figure with a single axis, drawn via the explicit axis handle
fig,ax = plt.subplots(figsize=(8,4))

ax.plot(binCents,y1,lw=2,label='Data 1')
ax.plot(binCents,y2,lw=2,label='Data 2')

# x-axis limits are pinned to the first and last bin edges
ax.set(xlabel='Value', ylabel='Probability', xlim=(edges[0],edges[-1]))
ax.legend()
plt.show()

# Divergence in numpy

In [None]:
# mask for valid probability values:
# keep only the bins in which both histograms are non-zero, because
# the log of the ratio is infinite or undefined when either one is zero
mask = (y1>0) & (y2>0)

# NOTE(review): torch's F.kl_div calls the distribution that multiplies the
# log-ratio (y1 in the first formula below) its `target`; confirm that the
# "as the target" wording here follows the course's intended convention.

# y2 as the target
# formula: sum( y1 * log(y1/y2) ), commonly written KL(y1 || y2)
# (np.sum is a vectorized reduction; the built-in sum would loop in Python)
tmpkl = y1[mask] * np.log( y1[mask]/y2[mask] )
kldist_np_y1 = np.sum(tmpkl)

# y1 as the target
# formula: sum( y2 * log(y2/y1) ), commonly written KL(y2 || y1)
kldist_np_y2 = np.sum( y2[mask] * np.log(y2[mask]/y1[mask]) )


# the results! (the two directions are different formulas, so the values generally differ)
print(f'KL distance using "y2" as the target = {kldist_np_y1}')
print(f'KL distance using "y1" as the target = {kldist_np_y2}')

# KL divergence in PyTorch

In [None]:
# F.kl_div works on log-probabilities, so convert the histograms first.
# Empty bins become -inf here, but they are removed by `mask` before use.
pt_y1 = torch.tensor(y1).log()
pt_y2 = torch.tensor(y2).log()

# calculate!
# With log_target=True and reduction='sum', F.kl_div returns
#   sum( exp(target) * (target - input) )
# NOTE(review): the `target=` slot holds y1 in the first call and y2 in the second,
# whereas the printed labels name the other distribution as "the target";
# confirm that the wording follows the course's intended convention.
kldist_pt_y1 = F.kl_div(input=pt_y2[mask], target=pt_y1[mask], reduction='sum', log_target=True)
kldist_pt_y2 = F.kl_div(input=pt_y1[mask], target=pt_y2[mask], reduction='sum', log_target=True)

# the results!
print(f'KL distance using "y2" as the target = {kldist_pt_y1}')
print(f'KL distance using "y1" as the target = {kldist_pt_y2}')