In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_blobs
from scipy.stats import norm

In [None]:
X, y = make_blobs(n_samples=1000, n_features=2, centers=2, random_state=1)

In [None]:
print(X[:5])
print(y[:5])

[[ -3.05837272   4.48825769]
 [ -8.60973869  -3.72714879]
 [  1.37129721   5.23107449]
 [ -9.33917563  -2.9544469 ]
 [-11.57178593  -3.85275513]]
[0 1 0 1 1]


In [None]:
class GaussianNaiveBayes:
  def __init__(self, X, y):
    self.X = X
    self.y = y

  def splitData(self):
    X_train, X_test, y_train, y_test = train_test_split(self.X, self.y,
                                                        test_size = 0.3,
                                                        random_state =1)
    return X_train, X_test, y_train, y_test

  def distribution(self, data):
    mean = np.mean(data)
    std = np.std(data)
    dist = norm(mean, std)
    return dist

  def probability(self, x, prior, dist1, dist2):
    return prior * dist1.pdf(x[0]) * dist2.pdf(x[1])

  def fit(self):
    self.X_train, self.X_test, self.y_train, self.y_test = self.splitData()
    # seperate data by class
    X0_train = self.X_train[self.y_train == 0]
    X1_train = self.X_train[self.y_train == 1]

    self.prior0 = len(X0_train)/len(self.X_train)
    self.prior1 = len(X1_train)/len(self.X_train)

    self.dist_X00 = self.distribution(X0_train[0]) # Dim 0 class 0
    self.dist_X10 = self.distribution(X0_train[1]) # Dim 1 class 0

    self.dist_X01 = self.distribution(X1_train[0]) # Dim 0 class 1
    self.dist_X11 = self.distribution(X1_train[1]) # Dim 1 class 1

  def predict(self):
    for sample, target in zip(self.X_test, self.y_test):  # iterate over 2 list with same indices
      py0 = self.probability(sample, self.prior0, self.dist_X00, self.dist_X10)
      py1 = self.probability(sample, self.prior1, self.dist_X01, self.dist_X11)

      print("P(y=0|%s) = %.3f" % (sample, py0 * 100))
      print("P(y=1|%s) = %.3f" % (sample, py1 * 100))
      print("The model predicted class {} and the actual class is {}".
            format(np.argmax([py0, py1]), target))

In [None]:
clf = GaussianNaiveBayes(X, y)

In [None]:
clf.fit()

In [None]:
clf.predict()

P(y=0|[0.03424921 4.60335965]) = 0.596
P(y=1|[0.03424921 4.60335965]) = 0.000
The model predicted class 0 and the actual class is 0
P(y=0|[-1.93540597  5.63544545]) = 0.324
P(y=1|[-1.93540597  5.63544545]) = 0.000
The model predicted class 0 and the actual class is 0
P(y=0|[-1.53940095  5.02369298]) = 0.441
P(y=1|[-1.53940095  5.02369298]) = 0.000
The model predicted class 0 and the actual class is 0
P(y=0|[-9.57470929 -4.08759711]) = 0.001
P(y=1|[-9.57470929 -4.08759711]) = 0.244
The model predicted class 1 and the actual class is 1
P(y=0|[-10.57107323  -4.07059622]) = 0.000
P(y=1|[-10.57107323  -4.07059622]) = 0.151
The model predicted class 1 and the actual class is 1
P(y=0|[0.8687658  4.15785509]) = 0.679
P(y=1|[0.8687658  4.15785509]) = 0.000
The model predicted class 0 and the actual class is 0
P(y=0|[-2.18191637  4.03723397]) = 0.511
P(y=1|[-2.18191637  4.03723397]) = 0.001
The model predicted class 0 and the actual class is 0
P(y=0|[-8.36577665 -4.35513738]) = 0.001
P(y=1|[-8.3

KeyboardInterrupt: ignored

In [33]:
df = pd.read_excel("Dry_Bean_Dataset.xlsx")
df.head()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.27275,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,SEKER
2,29380,624.11,212.82613,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,SEKER
4,30140,620.134,201.847882,190.279279,1.060798,0.33368,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,SEKER


In [34]:
df.corr()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4
Area,1.0,0.966722,0.931834,0.951602,0.241735,0.267481,0.999939,0.984968,0.054345,-0.196585,-0.35753,-0.268067,-0.847958,-0.639291,-0.272145,-0.355721
Perimeter,0.966722,1.0,0.977338,0.913179,0.385276,0.391066,0.967689,0.99138,-0.02116,-0.30397,-0.547647,-0.406857,-0.864623,-0.767592,-0.408435,-0.42931
MajorAxisLength,0.931834,0.977338,1.0,0.826052,0.550335,0.541972,0.932607,0.961733,-0.078062,-0.284302,-0.596358,-0.568377,-0.773609,-0.859238,-0.568185,-0.482527
MinorAxisLength,0.951602,0.913179,0.826052,1.0,-0.009161,0.019574,0.951339,0.948539,0.145957,-0.155831,-0.210344,-0.015066,-0.947204,-0.471347,-0.019326,-0.263749
AspectRation,0.241735,0.385276,0.550335,-0.009161,1.0,0.924293,0.243301,0.303647,-0.370184,-0.267754,-0.766979,-0.987687,0.024593,-0.837841,-0.978592,-0.449264
Eccentricity,0.267481,0.391066,0.541972,0.019574,0.924293,1.0,0.269255,0.318667,-0.319362,-0.297592,-0.722272,-0.970313,0.01992,-0.860141,-0.981058,-0.449354
ConvexArea,0.999939,0.967689,0.932607,0.951339,0.243301,0.269255,1.0,0.985226,0.052564,-0.206191,-0.362083,-0.269922,-0.84795,-0.640862,-0.274024,-0.362049
EquivDiameter,0.984968,0.99138,0.961733,0.948539,0.303647,0.318667,0.985226,1.0,0.028383,-0.231648,-0.435945,-0.32765,-0.892741,-0.713069,-0.330389,-0.392512
Extent,0.054345,-0.02116,-0.078062,0.145957,-0.370184,-0.319362,0.052564,0.028383,1.0,0.191389,0.344411,0.354212,-0.141616,0.237956,0.347624,0.148502
Solidity,-0.196585,-0.30397,-0.284302,-0.155831,-0.267754,-0.297592,-0.206191,-0.231648,0.191389,1.0,0.60715,0.303766,0.153388,0.343559,0.307662,0.702163


In [35]:
df = df.drop(columns = ['ConvexArea', 'EquivDiameter', 'ShapeFactor3', 'Perimeter', 'MinorAxisLength', 'AspectRation', 'MajorAxisLength'])

In [36]:
df

Unnamed: 0,Area,Eccentricity,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor4,Class
0,28395,0.549812,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.998724,SEKER
1,28734,0.411785,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.998430,SEKER
2,29380,0.562727,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.999066,SEKER
3,30008,0.498616,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.994199,SEKER
4,30140,0.333680,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.999166,SEKER
...,...,...,...,...,...,...,...,...,...,...
13606,42097,0.765002,0.714574,0.990331,0.916603,0.801865,0.006858,0.001749,0.998385,DERMASON
13607,42101,0.735702,0.799943,0.990752,0.922015,0.822252,0.006688,0.001886,0.998219,DERMASON
13608,42139,0.734065,0.729932,0.989899,0.918424,0.822730,0.006681,0.001888,0.996767,DERMASON
13609,42147,0.741055,0.705389,0.987813,0.907906,0.817457,0.006724,0.001852,0.995222,DERMASON


In [37]:
df.corr()

Unnamed: 0,Area,Eccentricity,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor4
Area,1.0,0.267481,0.054345,-0.196585,-0.35753,-0.268067,-0.847958,-0.639291,-0.355721
Eccentricity,0.267481,1.0,-0.319362,-0.297592,-0.722272,-0.970313,0.01992,-0.860141,-0.449354
Extent,0.054345,-0.319362,1.0,0.191389,0.344411,0.354212,-0.141616,0.237956,0.148502
Solidity,-0.196585,-0.297592,0.191389,1.0,0.60715,0.303766,0.153388,0.343559,0.702163
roundness,-0.35753,-0.722272,0.344411,0.60715,1.0,0.768086,0.230273,0.782824,0.472149
Compactness,-0.268067,-0.970313,0.354212,0.303766,0.768086,1.0,-0.009394,0.868939,0.484436
ShapeFactor1,-0.847958,0.01992,-0.141616,0.153388,0.230273,-0.009394,1.0,0.469197,0.248619
ShapeFactor2,-0.639291,-0.860141,0.237956,0.343559,0.782824,0.868939,0.469197,1.0,0.529932
ShapeFactor4,-0.355721,-0.449354,0.148502,0.702163,0.472149,0.484436,0.248619,0.529932,1.0


In [38]:
def centralize(X):
    return (X - X.mean)

In [84]:
X = df.iloc[:,:-1]
X

Unnamed: 0,Area,Eccentricity,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor4
0,28395,0.549812,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.998724
1,28734,0.411785,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.998430
2,29380,0.562727,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.999066
3,30008,0.498616,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.994199
4,30140,0.333680,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.999166
...,...,...,...,...,...,...,...,...,...
13606,42097,0.765002,0.714574,0.990331,0.916603,0.801865,0.006858,0.001749,0.998385
13607,42101,0.735702,0.799943,0.990752,0.922015,0.822252,0.006688,0.001886,0.998219
13608,42139,0.734065,0.729932,0.989899,0.918424,0.822730,0.006681,0.001888,0.996767
13609,42147,0.741055,0.705389,0.987813,0.907906,0.817457,0.006724,0.001852,0.995222


In [105]:
t = np.mean(X.values, axis = 1)

In [108]:
def distribution(data):
    mean = np.mean(data)
    std = np.std(data)
    dist = norm(mean, std)
    return dist

In [109]:
distribution(X.values)

<scipy.stats._distn_infrastructure.rv_frozen at 0x956fcaa2c0>