# Naive Bayes Classifier

In [58]:
import csv
import random
import math
import numpy as np

In [59]:
def loadCsv(filename):
    """Read a tab-separated text file into a NumPy array.

    Args:
        filename: Source handed straight to ``np.genfromtxt``.

    Returns:
        The array built by ``np.genfromtxt`` using a tab as the delimiter.
    """
    table = np.genfromtxt(fname=filename, delimiter='\t')
    return table

In [60]:
def splitDataset(dataset, splitRatio):
    """Randomly partition the rows of ``dataset`` into training and test sets.

    Args:
        dataset: 2-D NumPy array whose rows are the samples.
        splitRatio: Fraction of the rows that goes to the training set; the
            resulting row count is truncated to an integer.

    Returns:
        A ``(training, test)`` tuple of arrays that together hold every row
        of ``dataset`` exactly once, in shuffled order.
    """
    row_count = dataset.shape[0]
    cut = int(row_count * splitRatio)

    # One shuffled ordering of all row indices; the first `cut` entries select
    # the training rows, the remainder the test rows.
    shuffled = np.random.permutation(row_count)

    training = dataset[shuffled[:cut], :]
    test = dataset[shuffled[cut:], :]
    return training, test

In [61]:
def separateByClass(dataset):
    """Group the rows of ``dataset`` by the class label in its last column.

    Any number of distinct labels is supported.

    Args:
        dataset: 2-D NumPy array; the last column holds the class label.

    Returns:
        A dict mapping each distinct label to an array of shape
        ``(1, n_rows, n_columns)`` holding the rows that carry that label
        (label column included).
    """
    labels = dataset[:, -1]
    separated = {}
    for classValue in set(labels):
        # np.where returns a 1-tuple of row indices; indexing with that tuple
        # adds a leading axis of length 1, which summarize() relies on.
        separated[classValue] = dataset[np.where(labels == classValue), :]
    return separated

In [62]:
# Exploratory cell: load the data set and inspect it before building the model.
points = loadCsv('Skin_NonSkin.txt')
print(points)

# Keep only the rows whose last column (the class label) equals 1.
# np.where returns a tuple, so the result carries an extra leading axis.
dataset= points[np.where(points[:, -1] == 1), :]
# Bare expression: the shape is neither printed nor stored here.
dataset.shape
print(points)
# Because of the extra leading axis, axis=1 runs over the selected rows, so
# this prints one mean per column (label column included).
print(dataset.mean(axis=1))
# Return value is not assigned; it appears as the cell's recorded output.
separateByClass(points)

[[  74.   85.  123.    1.]
 [  73.   84.  122.    1.]
 [  72.   83.  121.    1.]
 ..., 
 [ 163.  162.  112.    2.]
 [ 163.  162.  112.    2.]
 [ 255.  255.  255.    2.]]
[[  74.   85.  123.    1.]
 [  73.   84.  122.    1.]
 [  72.   83.  121.    1.]
 ..., 
 [ 163.  162.  112.    2.]
 [ 163.  162.  112.    2.]
 [ 255.  255.  255.    2.]]
[[ 113.86987554  146.60111288  203.9919385     1.        ]]
---------


{1.0: array([[[  74.,   85.,  123.,    1.],
         [  73.,   84.,  122.,    1.],
         [  72.,   83.,  121.,    1.],
         ..., 
         [  95.,  132.,  182.,    1.],
         [  92.,  132.,  181.,    1.],
         [  94.,  131.,  181.,    1.]]]),
 2.0: array([[[ 198.,  198.,  158.,    2.],
         [ 198.,  198.,  158.,    2.],
         [ 198.,  198.,  158.,    2.],
         ..., 
         [ 163.,  162.,  112.,    2.],
         [ 163.,  162.,  112.,    2.],
         [ 255.,  255.,  255.,    2.]]])}

In [63]:
def summarize(dataset):
    """Compute per-feature mean and sample standard deviation for one class.

    Args:
        dataset: Rows of a single class, with the class label in the last
            column. Accepted either as the ``(1, n_rows, n_columns)`` array
            produced by ``separateByClass`` or as a plain 2-D
            ``(n_rows, n_columns)`` array.

    Returns:
        A ``(means, stds)`` tuple of 1-D arrays with one entry per feature
        column. The label column is left out because it is not a feature.
        Standard deviations use ``ddof=1`` (sample standard deviation).
    """
    samples = np.asarray(dataset)
    if samples.ndim == 3:
        # Drop the leading axis that tuple-based row indexing introduces.
        samples = samples[0]

    # Every column except the trailing label column is a feature.
    features = samples[:, :-1]

    means = features.mean(axis=0)
    stds = features.std(axis=0, ddof=1)
    return means, stds

In [64]:
def summarizeByClass(dataset):
    """Build the per-class model: feature statistics for every class label.

    Args:
        dataset: Array passed on to ``separateByClass``.

    Returns:
        A dict mapping each class label to the ``(means, stds)`` tuple that
        ``summarize`` returns for the rows of that class.
    """
    return {
        label: summarize(rows)
        for label, rows in separateByClass(dataset).items()
    }

In [65]:
def calculateProbability(x, mean, stdev):
    """Return the product of Gaussian densities of ``x`` over all features.

    For each feature the density is
    ``exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2 * pi) * stdev)``;
    the per-feature values are then multiplied together.

    Args:
        x: Feature value(s), scalar or array.
        mean: Mean(s) matching ``x``.
        stdev: Standard deviation(s) matching ``x``.

    Returns:
        A single number: the product of the per-feature densities.
    """
    squared_deviation = (x - mean) ** 2
    exponent = -squared_deviation / (2 * stdev ** 2)
    normaliser = np.sqrt(2 * np.pi) * stdev
    densities = np.exp(exponent) / normaliser
    return np.prod(densities)

In [66]:
def calculateClassProbabilities(summaries, inputVector):
    """Score one sample against every class.

    Args:
        summaries: Dict mapping class label to a ``(means, stds)`` pair.
        inputVector: One sample; its last element (the label) is ignored.

    Returns:
        A dict mapping each class label to the value returned by
        ``calculateProbability`` for the sample's features under that class's
        means and standard deviations. No class prior is multiplied in here.
    """
    return {
        # Index 0 holds the means, index 1 the standard deviations.
        label: calculateProbability(inputVector[:-1], stats[0], stats[1])
        for label, stats in summaries.items()
    }

In [67]:
def predict(summaries, inputVector):
    """Return the class label with the highest score for ``inputVector``.

    Args:
        summaries: Per-class statistics passed to
            ``calculateClassProbabilities``.
        inputVector: One sample, passed through unchanged.

    Returns:
        The label whose score is largest; on ties the label seen first wins.
        ``None`` if there are no classes to choose from.
    """
    winner, top_score = None, -1

    for label, score in calculateClassProbabilities(summaries, inputVector).items():
        # Skip candidates that do not strictly beat the current leader; the
        # very first candidate is always accepted.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = label, score

    return winner

In [68]:
def getPredictions(summaries, testSet):
    """Predict a class label for every sample in ``testSet``.

    Args:
        summaries: Per-class statistics passed to ``predict``.
        testSet: Sequence of samples (rows).

    Returns:
        A list with one predicted label per sample, in the order of
        ``testSet``.
    """
    return [predict(summaries, sample) for sample in testSet]

In [69]:
def getAccuracy(testSet, predictions):
    """Return the percentage of samples whose prediction matches the label.

    Args:
        testSet: Sequence of samples; the last element of each is the true
            label.
        predictions: Predicted labels, indexed in step with ``testSet``.

    Returns:
        Accuracy as a float percentage between 0 and 100.
    """
    hits = sum(
        1
        for position, sample in enumerate(testSet)
        if sample[-1] == predictions[position]
    )
    return (hits / float(len(testSet))) * 100.0

In [73]:
def normilize1(points):
    """Standardise the first column using per-row mean and standard deviation.

    Args:
        points: 2-D NumPy array.

    Returns:
        A 1-D array with one value per row:
        ``(row[0] - mean(row)) / std(row)``.
    """
    # NOTE(review): only column 0 is transformed and the statistics are taken
    # across each row (axis=1), not per column - confirm this is intended.
    first_column = points[:, 0]
    row_means = points.mean(axis=1)
    row_stds = points.std(axis=1)
    return (first_column - row_means) / row_stds

In [74]:
def normilize2(points):
    """Min-max scale the first column using per-row minimum and maximum.

    Args:
        points: 2-D NumPy array.

    Returns:
        A 1-D array with one value per row:
        ``(row[0] - min(row)) / (max(row) - min(row))``.
    """
    # NOTE(review): only column 0 is transformed and the extrema are taken
    # across each row (axis=1), not per column - confirm this is intended.
    row_min = points.min(axis=1)
    row_span = points.max(axis=1) - row_min
    return (points[:, 0] - row_min) / row_span

In [79]:
def main():
    """Train and evaluate the classifier on the skin segmentation data.

    Loads ``Skin_NonSkin.txt``, splits it randomly into training and test
    rows, builds the per-class summaries from the training rows, predicts a
    label for every test row and prints the split sizes and the accuracy.
    """
    source_path = 'Skin_NonSkin.txt'
    train_fraction = 0.67

    # Feature normalisation (normilize2) is currently not applied.
    samples = loadCsv(source_path)
    train_rows, test_rows = splitDataset(samples, train_fraction)

    row_counts = (len(samples), len(train_rows), len(test_rows))
    print('Split {0} rows into train={1} and test={2} rows'.format(*row_counts))

    # Fit on the training rows, then score the held-out rows.
    model = summarizeByClass(train_rows)
    predicted = getPredictions(model, test_rows)
    accuracy = getAccuracy(test_rows, predicted)

    print('Accuracy: {0}%'.format(accuracy))

In [80]:
# Run the whole pipeline: load, split, summarize, predict and report accuracy.
main()

Split 245057 rows into train=164188 and test=80869 rows
---------
Accuracy: 92.59419555082911%
