In [1]:
import matplotlib as plt
from sklearn import datasets
import pandas as pd
import numpy as np

Iris data

In [2]:
# Load the Iris dataset bundled with scikit-learn; the result is indexed below
# by the keys 'data', 'target' and 'feature_names'.
iris = datasets.load_iris()

In [3]:
# Glue the class label onto the measurements as a trailing column, then wrap
# the combined array in a DataFrame whose last column is named 'target'.
iris_table = np.c_[iris['data'], iris['target']]
iris_df = pd.DataFrame(
    data=iris_table,
    columns=iris['feature_names'] + ['target'],
)

In [4]:
# Preview the first rows to confirm the feature columns and the target column.
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [5]:
# Plain NumPy view of the table (features followed by the target column), used
# for positional indexing in the Iris section further down.
iris_nparr = np.asarray(iris_df)

Self-generated data

In [6]:
from sklearn.datasets import make_blobs
# NOTE(review): this binds the top-level matplotlib package (not
# matplotlib.pyplot) to `plt`, and repeats the import from the first cell.
import matplotlib as plt
import scipy.stats
import math
# Shorthand for the constant pi.
pi = math.pi

In [7]:
# Generate a reproducible toy dataset: 100 two-feature points spread around
# 2 centres. `x` holds the points, `y` the integer label of each point.
x, y = make_blobs(
    n_samples=100,
    centers=2,
    n_features=2,
    random_state=1,
)
# Show the first three points next to their labels.
print(x[:3], y[:3])

[[-0.79415228  2.10495117]
 [-9.15155186 -4.81286449]
 [-3.10367371  3.90202401]] [0 1 0]


In [8]:
# Sanity check: one row of features per label.
print(x.shape, y.shape)

(100, 2) (100,)


In [9]:
# Keep the first five points and their labels as a small working subset.
X, Y = x[:5], y[:5]

print(X[1])

[-9.15155186 -4.81286449]


In [11]:
def fit_distribution(data):
    """Fit a normal distribution to a univariate data sample.

    The sample mean and standard deviation (``np.std`` with its default
    arguments) are printed and then used as the location and scale of the
    returned frozen ``scipy.stats.norm`` distribution.

    Parameters
    ----------
    data : array-like
        One-dimensional collection of observations.

    Returns
    -------
    Frozen ``scipy.stats.norm`` distribution parameterised by the estimates.
    """
    location, spread = np.mean(data), np.std(data)
    # Echo the estimated parameters so they can be inspected in the cell output.
    print(location, spread)
    return scipy.stats.norm(loc=location, scale=spread)

In [12]:
# Partition the points by class label with boolean masks on `y`.
Xy0, Xy1 = (x[y == cls] for cls in (0, 1))

In [13]:
# Class priors P(y): the fraction of all samples that falls in each class.
priory0, priory1 = (len(members) / len(x) for members in (Xy0, Xy1))
print(priory0, priory1)

0.5 0.5


In [14]:
# One Gaussian per (feature, class) pair; names read X<feature>y<class>.
# Class y = 0, features x0 and x1 (fitted in that order).
X0y0, X1y0 = (fit_distribution(Xy0[:, col]) for col in (0, 1))

# Class y = 1, features x0 and x1 (fitted in that order).
X0y1, X1y1 = (fit_distribution(Xy1[:, col]) for col in (0, 1))

-1.5632888906409914 0.787444265443213
4.426680361487157 0.958296071258367
-9.681177100524485 0.8943078901048118
-3.9713794295185845 0.9308177595208521


In [15]:
def probability(sample, prior, dist0, dist1):
    """Return the naive Bayes score of a two-feature sample for one class.

    The score is ``prior * dist0.pdf(sample[0]) * dist1.pdf(sample[1])``,
    i.e. the class prior times the per-feature densities. It is not divided
    by the evidence, so it is an unnormalised quantity.

    Parameters
    ----------
    sample : indexable
        Feature values; positions 0 and 1 are read.
    prior : float
        Prior weight of the class.
    dist0, dist1 : objects with a ``pdf`` method
        Per-feature distributions for the class.
    """
    score = prior
    # Multiply in each feature's density, in feature order.
    for position, dist in enumerate((dist0, dist1)):
        score = score * dist.pdf(sample[position])
    return score

In [16]:
# Pick the point at index 4 together with its true label to classify by hand.
sample = x[4]
label = y[4]

print(sample, label)

[-7.4693868  -4.20198333] 1


In [17]:
# Score the sample under each class: prior times the per-feature densities.
py0 = probability(sample, priory0, X0y0, X1y0)
py1 = probability(sample, priory1, X0y1, X1y1)
# One score per line, class 0 first.
print(py0, py1, sep='\n')

1.5929308729447612e-31
0.004354081017344896


In [18]:
# NOTE: py0 and py1 are prior * likelihood and are not divided by the evidence,
# so the numbers printed here are unnormalised scores scaled by 100 rather than
# true posterior probabilities.
for fmt, score in (('P(y=0 | %s) = %.3f', py0), ('P(y=1 | %s) = %.3f', py1)):
    print(fmt % (sample, score*100))

P(y=0 | [-7.4693868  -4.20198333]) = 0.000
P(y=1 | [-7.4693868  -4.20198333]) = 0.435


Working with Iris data

In [19]:
# Peek at the first three rows: four feature values followed by the class label.
iris_nparr[ : 3]

array([[5.1, 3.5, 1.4, 0.2, 0. ],
       [4.9, 3. , 1.4, 0.2, 0. ],
       [4.7, 3.2, 1.3, 0.2, 0. ]])

In [20]:
# Split the Iris rows by the class label stored in the last column.
# NOTE: this rebinds Xy0 and Xy1, which held the blob classes earlier on.
Xy0, Xy1, Xy2 = (iris_nparr[iris_nparr[:, -1] == cls] for cls in (0, 1, 2))

In [21]:
# Class priors: share of all rows that belongs to each class.
priory0, priory1, priory2 = (
    len(rows) / len(iris_nparr) for rows in (Xy0, Xy1, Xy2)
)

print(priory0, priory1, priory2)

0.3333333333333333 0.3333333333333333 0.3333333333333333


In [22]:
def fit_distribution(data):
    """Return a frozen normal distribution fitted to a univariate sample.

    The mean and the standard deviation (``np.std`` with its default
    arguments) of ``data`` become the location and scale of the returned
    ``scipy.stats.norm`` object. This redefinition replaces the earlier
    function of the same name and, unlike it, prints nothing.
    """
    return scipy.stats.norm(loc=np.mean(data), scale=np.std(data))

In [23]:
# One Gaussian per (class, feature) pair over the four feature columns;
# names read Xy<class>_x<feature>.
Xy0_x0, Xy0_x1, Xy0_x2, Xy0_x3 = (fit_distribution(Xy0[:, col]) for col in range(4))
Xy1_x0, Xy1_x1, Xy1_x2, Xy1_x3 = (fit_distribution(Xy1[:, col]) for col in range(4))
Xy2_x0, Xy2_x1, Xy2_x2, Xy2_x3 = (fit_distribution(Xy2[:, col]) for col in range(4))

In [24]:
def probability(sample, prior, dist0, dist1, dist2, dist3):
    return(prior * dist0.pdf(sample[0]) * dist1.pdf(sample[1]) * dist2.pdf(sample[2]) * dist3.pdf(sample[3]))

In [25]:
# Use row 34 as the test sample; its last entry is the true class label.
sample = iris_nparr[34]
label = sample[-1]

print(sample, label)

[4.9 3.1 1.5 0.2 0. ] 0.0


In [26]:
# Score the sample under each of the three classes.
py0 = probability(sample, priory0, Xy0_x0, Xy0_x1, Xy0_x2, Xy0_x3)
py1 = probability(sample, priory1, Xy1_x0, Xy1_x1, Xy1_x2, Xy1_x3)
py2 = probability(sample, priory2, Xy2_x0, Xy2_x1, Xy2_x2, Xy2_x3)
# One score per line, class 0 first.
print(py0, py1, py2, sep='\n')

2.0746839772043923
6.324462919703278e-17
1.3119213728578478e-24


In [27]:
# The values printed here are 'scores', not probabilities: the quantity is not
# normalised (a simplification often made when implementing naive Bayes), which
# is why a value can exceed 100 after scaling.
iris_features = sample[:4]
for fmt, score in (
    ('P(y=0 | %s) = %.3f', py0),
    ('P(y=1 | %s) = %.3f', py1),
    ('P(y=2 | %s) = %.3f', py2),
):
    print(fmt % (iris_features, score*100))

P(y=0 | [4.9 3.1 1.5 0.2]) = 207.468
P(y=1 | [4.9 3.1 1.5 0.2]) = 0.000
P(y=2 | [4.9 3.1 1.5 0.2]) = 0.000
