In [53]:
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer as KBD
from sklearn.model_selection import train_test_split as TTS
from sklearn.base import BaseEstimator as BE, ClassifierMixin as CM
from collections import defaultdict, Counter
from sklearn.naive_bayes import GaussianNB as GNB
import pandas as pd

In [2]:
#1.
#repo UVI | dataset: wine

wineRaw = np.genfromtxt("wine.data",delimiter=',')

X = wineRaw[:,1:] #args 178x13
y = wineRaw[:,0] #class 178X1

In [3]:
#2.
NBINS =3
est = KBD(n_bins=NBINS , encode='ordinal', strategy='kmeans')
Xt =est.fit_transform(X)
print(X)
print(Xt)

[[1.423e+01 1.710e+00 2.430e+00 ... 1.040e+00 3.920e+00 1.065e+03]
 [1.320e+01 1.780e+00 2.140e+00 ... 1.050e+00 3.400e+00 1.050e+03]
 [1.316e+01 2.360e+00 2.670e+00 ... 1.030e+00 3.170e+00 1.185e+03]
 ...
 [1.327e+01 4.280e+00 2.260e+00 ... 5.900e-01 1.560e+00 8.350e+02]
 [1.317e+01 2.590e+00 2.370e+00 ... 6.000e-01 1.620e+00 8.400e+02]
 [1.413e+01 4.100e+00 2.740e+00 ... 6.100e-01 1.600e+00 5.600e+02]]
[[2. 0. 1. ... 1. 2. 1.]
 [1. 0. 1. ... 1. 2. 1.]
 [1. 1. 2. ... 1. 2. 2.]
 ...
 [1. 2. 1. ... 0. 0. 1.]
 [1. 1. 1. ... 0. 0. 1.]
 [2. 2. 2. ... 0. 0. 0.]]


In [4]:
#3.
X_train, X_test, y_train, y_test = TTS( Xt, y, test_size=0.3, random_state=42)

In [5]:
#4.
class NBC_discrete(BE, CM):
    def __init__(self,laPlace_switch:bool =False, nBuckets:int =3):
        self.laPlace = laPlace_switch
        self.nBuckets = nBuckets
        
    def __stackSeparators(self):
        for key in self.separator.keys():
            self.separator[key]=np.vstack( self.separator[key] ) 
    def __separateByClass(self): 
        self.separator=defaultdict(list)
        for v, k in zip(self.X,self.y):
            self.separator[int(k)].append(np.array(v,dtype=int))
        self.__stackSeparators()

    def __aPrioriClass(self): # rozkład a priori klas P(Y =y)
        self.aPriori ={}
        for key in self.separator.keys():
            self.aPriori[key] = len(self.separator[key])/self.size[0]
            
    def __sizeOfClasses(self):
        self.sizeOfSeparator ={}
        for key in self.separator.keys():
            self.sizeOfSeparator[key] = len(self.separator[key])
      
    def __fillValues(self, counter):
        for attribute in range(0, self.nBuckets):
            if attribute not in counter.keys():
                counter[attribute] = 0.0
        
    def __conditionalDistribution(self): 
        self.__numOfAtributes = int(self.size[1])
        minY = int(min(self.y))
        maxY = int(max(self.y)+1)
        self.conditionalDistDict = [list(range(0,self.__numOfAtributes))for x in range(minY,maxY) ]
        for key in self.separator.keys():
            dictKey = key - 1
            for attribute in range(0, self.__numOfAtributes):
                c = Counter(self.separator[key][:,attribute])
                self.__fillValues(c)
                self.conditionalDistDict[dictKey][attribute] ={}
                for value in c.keys():
                    if not self.laPlace: # zastosowanie przełącznika poprawki LaPlace'a
                        self.conditionalDistDict[dictKey][attribute][value] = c[value]/self.sizeOfSeparator[key]
                    else:
                        self.conditionalDistDict[dictKey][attribute][value] =  (c[value]+1)/(self.sizeOfSeparator[key]+ len(c.keys()))
    
    def __setProbabilities(self):
        self.__separateByClass()
        self.__aPrioriClass()
        self.__sizeOfClasses()
        self.__conditionalDistribution()
        
    def __calculateKeyLikelihood(self, X_row):
        yLikelihood = np.zeros(len(self.separator.keys()))
        for key in self.separator.keys():
            keyProbability =1
            dictKey = key -1
            for attribute,value in enumerate(X_row):
                tempProb = self.conditionalDistDict[dictKey][attribute][value] 
                keyProbability *=tempProb
            yLikelihood[dictKey]=keyProbability*self.aPriori[key]
        return yLikelihood

    def fit(self,X,y):
        self.size = np.shape(X)
        self.X =X
        self.y = y
        self.__setProbabilities()
        
    def predict(self,X):
        yPredicted = []
        for X_row in X:
            yLikelihood = self.__calculateKeyLikelihood( X_row)
            yPredicted.append(np.argmax(yLikelihood, axis=0) +1) # +1 bo zakres etykiet 1-3 
        return np.transpose(yPredicted)
    
    def predict_proba(self,X):
        yPredicted = []
        for X_row in X:
            yLikelihood = self.__calculateKeyLikelihood(X_row)
            arg = np.argmax(yLikelihood, axis=0)
            yProbability = yLikelihood[arg]/ np.sum(yLikelihood)
            yPredicted.append(yProbability)
        return np.transpose(yPredicted)

In [6]:
def accuracy_score(y, yPredicted):
    return (np.sum(y ==yPredicted)/np.shape(y)[0])*100


In [7]:
def printResults(yPredicted, yPredictedProbability, y, stringSet:str= None, LaPlaceEnabled:bool = False):
    stringLaPlace = "Enabled" if LaPlaceEnabled else "Disabled"
    #print(yPredicted)
    #print(y)
    print("--------------------------------------------------------------------------------------")
    print("{} set ".format(stringSet)) #Train | Test
    print("LaPlace's correction {}".format(stringLaPlace))
    print("Pobabilities of predicted y:")
    print(yPredictedProbability)
    print("Accuracy of {} set: {}%\n\n".format(stringSet, str(accuracy_score(y, yPredicted))) )

In [8]:
# eksperyment bez poprawki LaPlace'a
LaPlaceEnabled = False

discNBC = NBC_discrete(LaPlaceEnabled,NBINS)
discNBC.fit(X_train, y_train)


#-------------------------------------------TEST SET-------------------------------------------
yPredicted = discNBC.predict(X_test)
yPredictedProbability = discNBC.predict_proba(X_test)
printResults(yPredicted, yPredictedProbability, y_test, stringSet= "Test", LaPlaceEnabled= LaPlaceEnabled)
#----------------------------------------TRAIN SET----------------------------------------------
yPredicted = discNBC.predict(X_train)
yPredictedProbability = discNBC.predict_proba(X_train)
printResults(yPredicted, yPredictedProbability, y_train, stringSet= "Train", LaPlaceEnabled= LaPlaceEnabled)

--------------------------------------------------------------------------------------
Test set 
LaPlace's correction Disabled
Pobabilities of predicted y:
[0.99999238 0.99972734 1.         1.         1.         1.
 1.         0.99999519 1.         1.         0.97564852        nan
 0.98273277 1.         1.         1.         1.         1.
 0.9999657  1.         1.         0.71854861 1.         1.
 1.         1.         1.         1.         1.         1.
 0.99927018 1.         0.99956488 1.         1.         0.99999878
        nan 0.99991524 1.                nan 0.82688209 1.
 0.99709185 0.71821064 0.999892   1.         1.         1.
 1.         1.         1.         0.99999736 1.         1.        ]
Accuracy of Test set: 88.88888888888889%


--------------------------------------------------------------------------------------
Train set 
LaPlace's correction Disabled
Pobabilities of predicted y:
[0.99999454 1.         1.         0.99987244 1.         0.99975443
 0.99997998 1.       

  yProbability = yLikelihood[arg]/ np.sum(yLikelihood)


In [9]:
# eksperyment z poprawką LaPlace'a

LaPlaceEnabled = True
discNBC = NBC_discrete(LaPlaceEnabled,NBINS)
discNBC.fit(X_train, y_train)


#-------------------------------------------TEST SET-------------------------------------------
yPredicted = discNBC.predict(X_test)
yPredictedProbability = discNBC.predict_proba(X_test)
printResults(yPredicted, yPredictedProbability, y_test, stringSet= "Test", LaPlaceEnabled= LaPlaceEnabled)
#----------------------------------------TRAIN SET----------------------------------------------
yPredicted = discNBC.predict(X_train)
yPredictedProbability = discNBC.predict_proba(X_train)
printResults(yPredicted, yPredictedProbability, y_train, stringSet= "Train", LaPlaceEnabled= LaPlaceEnabled)

--------------------------------------------------------------------------------------
Test set 
LaPlace's correction Enabled
Pobabilities of predicted y:
[0.99998346 0.99946744 0.99783193 0.99997557 0.99983459 0.99998135
 0.99999807 0.99998757 0.99864373 0.99568188 0.96468826 0.99865066
 0.97749187 0.90810973 0.99999543 0.99997406 0.9999761  0.99999772
 0.99991895 0.99997146 0.99999038 0.84293532 0.94381794 0.99999585
 0.99999589 0.99999187 0.99998266 0.98964081 0.99999694 0.99997791
 0.99856294 0.99997855 0.99934842 0.99998036 0.99993589 0.99999644
 0.9921068  0.99981739 0.96062072 0.99629711 0.80593894 0.99999438
 0.99365824 0.4849364  0.99974731 0.99987317 0.99997713 0.99963595
 0.99996661 0.99997074 0.99824706 0.9999927  0.99979755 0.9999709 ]
Accuracy of Test set: 96.29629629629629%


--------------------------------------------------------------------------------------
Train set 
LaPlace's correction Enabled
Pobabilities of predicted y:
[0.99998404 0.99999325 0.99912668 0.999750

In [44]:

#############################################################################################################
#-------------------------------------------continuous NBC---------------------------------------------------
#wersja bezpieczna numerycznie 
#4.
class NBC_continuous(BE, CM):


        
    def __stackSeparators(self):
        for key in self.separator.keys():
            self.separator[key]=np.vstack( self.separator[key] ) 
    def __separateByClass(self): 
        self.separator=defaultdict(list)
        for v, k in zip(self.X,self.y):
            self.separator[int(k)].append(np.array(v))
        self.__stackSeparators()
        
    def __aPrioriClass(self): # rozkład a priori klas P(Y =y)
        self.aPriori ={}
        for key in self.separator.keys():
            self.aPriori[key] = len(self.separator[key])/self.size[0]
            
    def __calculateAvg(self):
        self.__numOfAtributes = int(self.size[1])
        self.__minY = int(min(self.y))
        self.__maxY = int(max(self.y)+1)
        self.avgDict = [list(range(0,self.__numOfAtributes))for x in range(self.__minY,self.__maxY) ]
        for key in self.separator.keys():
            dictKey = key - 1
            for attribute in range(0, self.__numOfAtributes):
                #3print(self.separator[key][:,attribute])
                self.avgDict[dictKey][attribute] = np.mean(self.separator[key][:,attribute],axis =0)
                #print(self.avgDict[dictKey][attribute])
            
    def __calculateStd(self):

        self.stdDict = [list(range(0,self.__numOfAtributes))for x in range(self.__minY,self.__maxY) ]
        for key in self.separator.keys():
            dictKey = key - 1
            for attribute in range(0, self.__numOfAtributes):
                self.stdDict[dictKey][attribute] = np.std(self.separator[key][:,attribute],axis =0, ddof = 1)
      #      print(self.stdDict[dictKey])
        

    def __setAvgStd(self):
        self.__separateByClass()
        self.__aPrioriClass()
        self.__calculateAvg()
        self.__calculateStd()
        
    def __calculateKeyLikelihood(self, X_row):
        yLikelihood = np.zeros(len(self.separator.keys()))
        for key in self.separator.keys():
            keyProbability =0
            dictKey = key -1
            for attribute,value in enumerate(X_row):
                tempDiv =  (value - self.avgDict[dictKey][attribute])**2/(2* self.stdDict[dictKey][attribute]**2)
                tempLog = -np.log(self.stdDict[dictKey][attribute])
                tempProb = tempLog - tempDiv
                keyProbability +=tempProb
            yLikelihood[dictKey]=keyProbability+np.log(self.aPriori[key])
        return yLikelihood

    def fit(self,X,y):
        self.size = np.shape(X)
        self.X =X
        self.y = y
        self.__setAvgStd()
        
    def predict(self,X):
        yPredicted = []
        for X_row in X:
            yLikelihood = self.__calculateKeyLikelihood( X_row)
            yPredicted.append(np.argmax(yLikelihood, axis=0) +1) # +1 bo zakres etykiet 1-3 
        return np.transpose(yPredicted)
    
    def predict_proba(self,X):
        yPredicted = []
        for X_row in X:
            yLikelihood = self.__calculateKeyLikelihood(X_row)
            arg = np.argmax(yLikelihood, axis=0)
            yProbability = yLikelihood[arg]/ np.sum(yLikelihood)
            yPredicted.append(yProbability)
        return np.transpose(yPredicted)

In [37]:
X_train, X_test, y_train, y_test = TTS( X, y, test_size=0.3, random_state=42)

In [49]:
def printResults2(yPredicted, yPredictedProbability, y, stringSet:str= None):

    print("--------------------------------------------------------------------------------------")
    print("{} set ".format(stringSet)) #Train | Test
    print("Pobabilities of predicted y:")
    print(yPredictedProbability)
    print(y)
    print(yPredicted)
    print("Accuracy of {} set: {}%\n\n".format(stringSet, str(accuracy_score(y, yPredicted))) )

In [51]:


countNBC = NBC_continuous()
countNBC.fit(X_train, y_train)


#-------------------------------------------TEST SET-------------------------------------------
yPredicted = countNBC.predict(X_test)
yPredictedProbability = countNBC.predict_proba(X_test)
printResults2(yPredicted, yPredictedProbability, y_test, stringSet= "Test")
#----------------------------------------TRAIN SET----------------------------------------------
yPredicted = countNBC.predict(X_train)
yPredictedProbability = countNBC.predict_proba(X_train)
printResults2(yPredicted, yPredictedProbability, y_train, stringSet= "Train")

--------------------------------------------------------------------------------------
Test set 
Pobabilities of predicted y:
[0.04283157 0.0824521  0.1000743  0.03989753 0.0890889  0.02913399
 0.09690261 0.00508274 0.07287265 0.13190378 0.05609093 0.10622755
 0.11607671 0.15755595 0.03446339 0.08605365 0.08672935 0.08409101
 0.02231542 0.06999089 0.05239642 0.10950214 0.15368021 0.08041195
 0.04109986 0.01359296 0.07173194 0.10685188 0.09611647 0.02885655
 0.03083698 0.0745417  0.05036207 0.02357521 0.02204026 0.03440896
 0.20434186 0.06678657 0.08651896 0.14557153 0.0846591  0.07014358
 0.06951436 0.1994218  0.08972745 0.02691068 0.0771411  0.11334629
 0.05601735 0.02785052 0.1272068  0.01803826 0.02521796 0.05567291]
[1. 1. 3. 1. 2. 1. 2. 3. 2. 3. 1. 3. 1. 2. 1. 2. 2. 2. 1. 2. 1. 2. 2. 3.
 3. 3. 2. 2. 2. 1. 1. 2. 3. 1. 1. 1. 3. 3. 2. 3. 1. 2. 2. 2. 3. 1. 2. 2.
 3. 1. 2. 1. 1. 3.]
[1 1 3 1 2 1 2 3 2 3 1 3 1 2 1 2 2 2 1 2 1 2 2 3 3 3 2 2 2 1 1 2 3 1 1 1 3
 3 2 3 1 2 2 2 3 1 2 2 3 1 2 

In [57]:
#Gaussian NBC
countNBC = GNB()
countNBC.fit(X_train, y_train)


#-------------------------------------------TEST SET-------------------------------------------
yPredicted = countNBC.predict(X_test)
yPredictedProbability = countNBC.predict_proba(X_test)
printResults2(yPredicted, yPredictedProbability, y_test, stringSet= "Test")
#----------------------------------------TRAIN SET----------------------------------------------
yPredicted = countNBC.predict(X_train)
yPredictedProbability = countNBC.predict_proba(X_train)
printResults2(yPredicted, yPredictedProbability, y_train, stringSet= "Train")

--------------------------------------------------------------------------------------
Test set 
Pobabilities of predicted y:
[[9.99994880e-01 5.11985627e-06 6.42760330e-33]
 [9.99999563e-01 4.37161281e-07 1.89052628e-26]
 [8.30191002e-18 1.79036458e-03 9.98209635e-01]
 [1.00000000e+00 4.99340759e-10 3.17066405e-41]
 [6.56464948e-07 9.99999344e-01 7.04895557e-23]
 [1.00000000e+00 1.34177096e-12 1.39939533e-35]
 [2.00966282e-10 1.00000000e+00 1.65899118e-14]
 [3.46879621e-20 2.84799447e-12 1.00000000e+00]
 [1.48349034e-04 9.99851651e-01 5.78491291e-34]
 [6.28418391e-15 3.28566984e-04 9.99671433e-01]
 [9.94593314e-01 5.40668636e-03 5.78156665e-34]
 [4.19230151e-18 1.37204597e-12 1.00000000e+00]
 [9.91430689e-01 8.56931112e-03 8.10476270e-21]
 [2.86352527e-15 9.75916343e-01 2.40836573e-02]
 [1.00000000e+00 1.64055127e-13 5.07621055e-36]
 [2.26900821e-07 9.99999773e-01 4.93022081e-16]
 [9.57901494e-11 1.00000000e+00 1.51234803e-13]
 [5.74326465e-12 1.00000000e+00 9.41060938e-15]
 [1.000000