In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
%matplotlib inline

from scipy.stats import norm

## 两种描述连续随机变量的概率分布的函数：
概率密度函数PDF（Probability Density Function）：描述某个确定的取值点附近的**可能性**的函数
累积分布函数CDF（Cumulative Distribution Function）：是PDF从负无穷（$-\infty$）到当前值的积分

概率密度函数（PDF）是累积分布函数（CDF）的导函数。
对PDF求积分就可以得到CDF，也就是求概率密度曲线下方、从负无穷（$-\infty$）到当前取值点之间的面积。
连续随机变量在单个点上的概率为0；但通过计算某个取样点左右两个非常临近的点的CDF之差，就能“计算”出随机变量落在该取样点附近这一小段区间内的概率。

In [None]:
X_step = 0.0002  # default width of the small interval centred on a sample point


def norm_prob(x, mu, sigma, width=None):
    """Probability that a Normal(mu, sigma) variable falls in a small interval around x.

    The interval is ``[x - width/2, x + width/2]``; its probability is the
    difference of the distribution function at the two end points.

    Parameters
    ----------
    x : float or array-like
        Centre(s) of the interval.
    mu, sigma : float
        Mean and standard deviation of the normal distribution.
    width : float, optional
        Interval width. Defaults to the module-level ``X_step``, looked up at
        call time.

    Returns
    -------
    NumPy float for scalar ``x``, ndarray of the same shape for array-like ``x``.
    """
    half_width = (X_step if width is None else width) / 2
    centre = np.asarray(x, dtype=float)
    lower, upper = centre - half_width, centre + half_width
    # Above the mean both CDF values approach 1.0, so their difference cancels
    # to exactly 0 far in the tail (and log(0) follows). The survival function
    # 1 - CDF keeps full precision there, so use it for that half.
    prob = np.where(
        lower > mu,
        norm.sf(lower, loc=mu, scale=sigma) - norm.sf(upper, loc=mu, scale=sigma),
        norm.cdf(upper, loc=mu, scale=sigma) - norm.cdf(lower, loc=mu, scale=sigma),
    )
    # Indexing with () turns a 0-d result back into a NumPy scalar and leaves
    # real arrays untouched.
    return prob[()]

def loglikelihood(data, mu, sigma):
    """Negative log-likelihood of ``data`` under Normal(mu, sigma).

    Despite the name, the value returned is the *negated* sum of log
    probabilities, so smaller means a better fit. Each sample contributes the
    log of ``norm_prob(sample, mu, sigma)``, i.e. the probability of the small
    interval around it rather than the density.

    Parameters
    ----------
    data : iterable of float
        Observed samples.
    mu, sigma : float
        Candidate mean and standard deviation.

    Returns
    -------
    float
        ``-sum(log(norm_prob(x, mu, sigma)) for x in data)``; 0.0 for empty
        data. The result is ``inf`` if any interval probability is exactly 0.
    """
    # Materialise one-shot iterables so they can be turned into an array.
    samples = np.asarray(data if hasattr(data, "__len__") else list(data), dtype=float)
    # One vectorised call replaces a Python loop that rebuilt the distribution
    # object for every single sample.
    log_probs = np.log(norm_prob(samples, mu, sigma))
    # Summing over axis 0 matches accumulating sample by sample.
    return 0.0 - np.sum(log_probs, axis=0)

生成随机数列：从正态分布 N(μ=1.6, σ=0.2) 中抽取 N=1000 个样本

In [None]:
N = 1000              # number of samples to draw
mu, sigma = 1.6, 0.2  # mean and standard deviation of the generating distribution
SEED = 42             # fixed seed so Restart & Run All reproduces the same sample

data = norm.rvs(loc=mu, scale=sigma, size=N, random_state=SEED)

In [None]:
# Histogram of the sample, normalised to a density (bar areas sum to 1) so it
# shares the y-scale of the PDF curve drawn on top. `count` therefore holds
# density values per bin, not raw counts.
count, bins, ignored = plt.hist(data, 20, density=True)

# Overlay the Normal(mu, sigma) density evaluated at the bin edges
plt.plot(bins, norm.pdf(bins, loc=mu, scale=sigma), linewidth=3, color='y')
plt.show()



In [None]:
def hist_dist(pd):
    """Draw a density-normalised histogram of ``pd`` with a KDE curve on top.

    The number of bins is what NumPy's ``"fd"`` rule produces for the data,
    capped at 50. Nothing is returned; the plot is drawn on a new figure.

    Note: inside this function ``pd`` is the data argument, not the pandas
    alias imported elsewhere in the notebook.
    """
    _fig, axis = plt.subplots(1, sharex=True)
    fd_edges = np.histogram(pd, bins="fd")[1]
    # k edges delimit k - 1 bins
    n_bins = min(len(fd_edges) - 1, 50)
    sns.histplot(
        data=pd,
        ax=axis,
        bins=n_bins,
        stat="density",
        alpha=0.4,
        kde=True,
        kde_kws={"cut": 3},
    )

In [None]:
# Histogram + KDE of the generated sample
hist_dist(data)

In [None]:
# Span of the observed sample, cut into N equal steps
min_v = min(data)
max_v = max(data)
step = (max_v - min_v) / N

In [None]:
# pdf evaluates the probability *density*; unlike a probability it may exceed 1.
h = 1.8  # one specific sample value
norm(loc=mu, scale=sigma).pdf(h)

In [None]:
# Approximate the area under the Normal(mu, sigma) density across the sample range.
# [min_v, max_v] is tiled by N adjacent intervals of width `step`, and each
# interval contributes the CDF difference across it. The interval width has to
# equal the grid spacing: narrower intervals would leave gaps and the total
# would no longer be an area.
left_edges = min_v + step * np.arange(N)
interval_probs = (norm.cdf(left_edges + step, loc=mu, scale=sigma)
                  - norm.cdf(left_edges, loc=mu, scale=sigma))
# The pieces add up to P(min_v <= X <= max_v)
AUC = interval_probs.sum()

print(AUC)


In [None]:
# Probability of the X_step-wide interval centred on h
norm_prob(h, mu, sigma)

In [None]:
# Negative log-likelihood of the sample under the current mu and sigma
loglikelihood(data, mu, sigma)

In [None]:
# Raw-count histogram of the sample. plt.show() keeps the (counts, edges,
# patches) tuple from being echoed as the cell output.
plt.hist(data)
plt.show()

In [None]:
# Candidate means to score against the sample: 1.4, 1.5, ..., 2.0
mus = [tenths / 10 for tenths in range(14, 21)]
# NOTE(review): this rebinds the notebook-wide `sigma` (0.2 when the data was
# drawn); cells above that read `sigma` give different results if re-run after
# this one. Confirm that is intended.
sigma = 0.1
mus

In [None]:
# Score every candidate mean against the sample, keeping sigma fixed
l = list(map(lambda candidate_mu: loglikelihood(data, candidate_mu, sigma), mus))
l

In [None]:
import pandas as pd

# One row per candidate mean, next to its negative log-likelihood
df = pd.DataFrame({'mu': mus, '-logl': l})
df

In [None]:
# Negative log-likelihood for each candidate mean; the lowest point is the
# best-fitting candidate.
plt.figure(figsize=(6,4))

# x and y are keyword-only in seaborn >= 0.12; passing the two vectors
# positionally raises TypeError there.
# NOTE(review): `alpha` relies on pointplot forwarding extra keywords to the
# matplotlib line (seaborn >= 0.13) — confirm the installed version.
sns.pointplot(x=df['mu'], y=df['-logl'], alpha=0.8)