# Naive Bayes

In [1]:
import numpy as np
import pandas as pd

In [2]:
#opening the file that contains the data set and removing the missing values
file = open("breast-cancer.data", "r")
file_data = file.readlines()
data = []
for line in file_data:
    if '?' in line:    #Excluding the missing values in the data set
        continue
    data.append(line.strip("\n").split(','))

In [3]:
data = np.array(data)
np.random.shuffle(data)
train_data = data[:221]
test_data = data[221:]

In [4]:
data[0]

array(['recurrence-events', '60-69', 'ge40', '40-44', '3-5', 'yes', '3',
       'right', 'left_low', 'no'], dtype='<U20')

In [5]:
def prior_prob(labels):
    '''computes the prior probabilities of the classes'''
    
    unique, count = np.unique(labels, return_counts = True)    #The unique class values and their number of occurences
    sum_of_counts = np.sum(count)       #The sum of all the counts, used to normalise the probabilities
    probs = {}    #The probabilities
    for i in range(len(unique)):
        probs[unique[i]] = count[i]/sum_of_counts
        
    return probs

In [6]:
def class_conditionals(data):
    '''computes the conditional probabilities of seeing the data points under the classes'''
    no_rec_events = []      #contains data points that belongs to the no recurrence events class
    rec_events = []         #contains data points that belongs to the recurrence events class
        
    for i in range(len(data)):     #puts every data point in the correct array
        if data[i, 0] == "no-recurrence-events":
            no_rec_events.append(data[i, 1:])
        else:
            rec_events.append(data[i, 1:])
    
    no_rec_events = np.array(no_rec_events)
    no_rec = []
    for i in range(len(no_rec_events[0])):
        unique, counts = np.unique(no_rec_events[:, i], return_counts=True)
        tmp = {}    #contains the unique values of a column and their probabilities
        for j in range(len(unique)):
            tmp[unique[j]] = counts[j]
        no_rec.append(tmp)
     
    rec_events = np.array(rec_events)
    rec = []
    for i in range(len(rec_events[0])):
        unique, counts = np.unique(rec_events[:, i], return_counts=True)
        tmp = {}   #contains the unique values of a column and their probabilities
        for j in range(len(unique)):
            tmp[unique[j]] = counts[j]
        rec.append(tmp)
    
    return no_rec, len(no_rec_events), rec, len(rec_events)

### To move from non smoothing to smoothing, simply remove the comment sign and the zero in the if statement marked regularization

In [12]:
def infer(rec, rec_count, no_rec, no_rec_count, prior, data_point):
    '''Determines the class which the data point belongs to'''
    rec_product = 1     #the class conditional probability using the independent rule
    no_rec_product = 1
    for i in range(len(data_point)):
        if (data_point[i]) in rec[i]:
            rec_product *= (rec[i][data_point[i]])/rec_count
        if not (data_point[i]) in rec[i]:    #regularization 
            rec_product *= (1)/(rec_count+len(rec[i])+1)
        if (data_point[i]) in no_rec[i]:
            no_rec_product *= (no_rec[i][data_point[i]])/no_rec_count
        if not (data_point[i]) in no_rec[i]:   #regularization
            no_rec_product *= (1)/(no_rec_count+len(no_rec[i])+1)
    
    rec_class=(rec_product*prior["recurrence-events"])/(rec_product*prior["recurrence-events"]+no_rec_product*prior["no-recurrence-events"])
    no_rec_class=(no_rec_product*prior["no-recurrence-events"])/(rec_product*prior["recurrence-events"]+no_rec_product*prior["no-recurrence-events"])
    
    if rec_class > no_rec_class:
        return "recurrence-events"
    else:
        return "no-recurrence-events"

In [8]:
no_rec,no_rec_count, rec, rec_count = class_conditionals(train_data)
prior = prior_prob(train_data[:, 0])

In [13]:
TP = 0
TN = 0
FP = 0
FN = 0
for i in range(len(test_data)):
    out = infer(rec, rec_count, no_rec, no_rec_count, prior, test_data[i, 1:])
    if test_data[i, 0] == out and out=="recurrence-events":
        TP += 1
    elif test_data[i, 0] == out and out=="no-recurrence-events":
        TN += 1
    elif out=="recurrence-events":
        FP += 1
    elif out=="no-recurrence-events":
        FN
accuracy = 'accuracy is: '+str(round(((TP+TN)/(TP+TN+FP+FN))*100))+'%'
d = {'approxiamted reccurence-events': [TP,FP], 'approximated no-reccurence-events': [FN,TN]}
df = pd.DataFrame(data=d,index=['actual reccurence-events','actual no-reccurence-events'])

In [14]:
accuracy

'accuracy is: 86%'

In [15]:
df

Unnamed: 0,approxiamted reccurence-events,approximated no-reccurence-events
actual reccurence-events,9,0
actual no-reccurence-events,7,35
