 # Example of a bayesian classifier
 
 
 First is just reading data

In [1]:
from numpy.random import randn
from numpy import array, concatenate, zeros, ones
from sklearn.model_selection import train_test_split

data = []
labels = []
with open("wineX.txt") as data_file:
    for line in data_file.read().split("\n"):
        l = []
        for item in line.split(","):
            if item != "":
                l.append(float(item))
            
        if len(l) > 5:
            data.append(l)
        

with open("wineY.txt") as label_file:
    tmp = label_file.readlines()
    
    for label in tmp:
        labels.append(int(label[0]))
    
    
traind, testd, trainl, testl = train_test_split(data, labels, test_size=0.60)




## Single Bayesian Classifier

In [2]:
from naive_bayes import classifier
from time import time
from numpy import arange, array

traind = array(traind)
trainl = array(trainl)

testd = array(testd)
testl = array(testl)

c = classifier()

timestamp = time()
c.train(traind, trainl)
print("Time to train: {}".format(time() - timestamp))

timestamp = time()
correct = 0
for index, t in enumerate(testd):
    correct += (c.predict(t) == testl[index])

print("Correct rate: {}".format(correct / len(testd)))
print("Avarage predict rate: {}".format((time() - timestamp) / len(testd)))

Time to train: 0.0008151531219482422
Correct rate: 0.9626168224299065
Avarage predict rate: 9.776498669775847e-05


## Example of Adaboosting on Bayesian Classifier

Ada boost is a ensemble learning method.
Given X weak classifiers we can combine them into a strong classifier
by weigthing them to predict on features they're good at.

In [3]:
from numpy import array, ones, log
from time import time
from naive_bayes import classifier
from math import pow

traind = array(traind)
trainl = array(trainl)

testd = array(testd)
testl = array(testl)

# Uniformly distrubte weights on all test data to start with
weigths = ones(trainl.shape) / len(trainl)

# List of weights describing voting strenght for each classifier
alphas = []

# List of classifiers
classifiers = []

# Number of classifiers we want
X = 10

timestamp = time()
for ci in range(X):
    
    # Train a Classifier
    c = classifier().train(traind, trainl, weigths)
    
    """
    Calculate the error of the classifier on the training data.
    
    This can be done in multiple ways, one example is the
    sum of all the weights of the incorrectly classified samples.
    """ 
    error = 0
    
    # Remember what it classified correctly.
    classifications = []
    for i, t in enumerate(traind):
        prediction = c.predict(t) != trainl[i]
        error += weigths[i] * prediction
        classifications.append(prediction)
    
    """
    Apply some non-linearity to it to give it a low value
    if the error is low.
    """ 
    if error == 0:
        NLE = 1e-10
    else:
        NLE = error / (1 - error)
    
    """
    Update weights in such a way that the 
    classifier is encouraged to learn the features
    this classifier failed at. 
    
    Either increase the weights of the wrongly classified
    or reduce the weigths of the correctly classified.
    """
    
    
    uw = zeros(weigths.shape)
    # This reduces weigths of correctly classified
    for i, w in enumerate(weigths):
        uw[i] = w * pow(NLE, 1 - classifications[i])
    
    weigths = uw
    """
    The alpha should be inversly proportionate to the error.
    
    High if there is few errors, low otherwise.
    """ 
    # Log to normalize it
    alphas.append(log(1 / NLE))

    # Normalize weigths 
    weigths = weigths / sum(weigths)
    
    classifiers.append(c)

print("Time to train: {}".format(time() - timestamp))


Time to train: 0.07660698890686035


# Classification using ada Boost

    Classification is done by voting and can be implemented in
    a multitude of ways. The principle is to let each classifier
    cast its vote and choose whichever class gets the most votes.


In [4]:
timestamp = time()
correct = 0
for index, t in enumerate(testd):
    votes = {}
    for idx, classifier in enumerate(classifiers):
        vote = classifier.predict(t)
        
        if vote in votes:
            votes[vote] += alphas[idx]
        else:
            votes[vote] = 0
    
    mxv = 0
    key = None
    for k, v in votes.items():
        if v > mxv:
            mxv = v
            key = k
      
    correct += (key == testl[index])
    
print("Correct rate: {}".format(correct / len(testd)))
print("Avarage predict rate: {}".format((time() - timestamp) / len(testd)))
    
        
    

Correct rate: 0.9626168224299065
Avarage predict rate: 0.0010540864177953418
