In [1]:
class NaiveBayesClassifier:

    def __init__(self, X, y):

        '''
        X and y denotes the features and the target labels respectively
        '''
        self.X, self.y = X, y 

        self.N = len(self.X) # Length of the training set

        self.dim = len(self.X[0]) # Dimension of the vector of features

        self.attrs = [[] for _ in range(self.dim)] # Here we'll store the columns of the training set

        self.output_dom = {} # Output classes with the number of ocurrences in the training set. In this case we have only 2 classes

        self.data = [] # To store every row [Xi, yi]


        for i in range(len(self.X)):
            for j in range(self.dim):
                # if we have never seen this value for this attr before, 
                # then we add it to the attrs array in the corresponding position
                if not self.X[i][j] in self.attrs[j]:
                    self.attrs[j].append(self.X[i][j])

            # if we have never seen this output class before,
            # then we add it to the output_dom and count one occurrence for now
            if not self.y[i] in self.output_dom.keys():
                self.output_dom[self.y[i]] = 1
            # otherwise, we increment the occurrence of this output in the training set by 1
            else:
                self.output_dom[self.y[i]] += 1
            # store the row
            self.data.append([self.X[i], self.y[i]])



    def classify(self, entry):

        solve = None # Final result
        max_arg = -1 # partial maximum

        for y in self.output_dom.keys():

            prob = self.output_dom[y]/self.N # P(y)

            for i in range(self.dim):
                cases = [x for x in self.data if x[0][i] == entry[i] and x[1] == y] # all rows with Xi = xi
                n = len(cases)
                prob *= n/self.N # P *= P(Xi = xi)

            # if we have a greater prob for this output than the partial maximum...
            if prob > max_arg:
                max_arg = prob
                solve = y

        return solve


In [2]:
import pandas as pd

data = pd.read_csv('training.csv')

data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,evaluation
0,low,vhigh,5more,more,small,low,unacc
1,high,high,2,2,big,med,unacc
2,low,vhigh,3,2,med,med,unacc
3,vhigh,low,5more,2,big,med,unacc
4,vhigh,vhigh,4,2,big,med,unacc


In [3]:
from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()

for i in data.columns:
    data[i]=le.fit_transform(data[i])

data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,evaluation
0,1,3,3,2,2,1,2
1,0,0,0,0,0,2,2
2,1,3,1,0,1,2,2
3,3,1,3,0,0,2,2
4,3,3,2,0,0,2,2


In [4]:
#y = list(map(lambda v: 'acc' if v == 1 else 'unacc', data['evaluation'].values)) # target values as string

o_data = pd.read_csv('training.csv')
y = o_data['evaluation'].values 
y = y.tolist()
## read the original file and get the result

X = data[["buying","maint","doors","persons","lug_boot","safety"]].values # features values


In [5]:
print(len(y))

# take 800 examples to train and the rest to the validation process
y_train = y[:800]
y_val = y[800:]

X_train = X[:800]
X_val = X[800:]


1330


In [6]:
## Creating the Naive Bayes Classifier instance with the training data

nbc = NaiveBayesClassifier(X_train, y_train)


total_cases = len(y_val) # size of validation set

# Well classified examples and bad classified examples
good = 0
bad = 0

for i in range(total_cases):
    predict = nbc.classify(X_val[i])
#     print(y_val[i] + ' --------------- ' + predict)
    if y_val[i] == predict:
        good += 1
    else:
        bad += 1

print('TOTAL EXAMPLES:', total_cases)
print('RIGHT:', good)
print('WRONG:', bad)
print('ACCURACY:{:.2f}%'.format(100*good/total_cases))


TOTAL EXAMPLES: 530
RIGHT: 394
WRONG: 136
ACCURACY:74.34%


In [7]:
test_data = pd.read_csv('test.csv')
## read the test data

test_data = pd.DataFrame(test_data)

le=LabelEncoder()

for i in test_data.columns:
    test_data[i]=le.fit_transform(test_data[i])
## converted string categories to integers

result = test_data

result = result.values.tolist()

r = []
for i in result:
    r.append(nbc.classify(i))
## add the result to a list
r= pd.DataFrame(r)

In [8]:
result = pd.read_csv('test.csv')

result = pd.DataFrame(result)

result['evaluation'] = r
## add the results to the test set

result

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,evaluation
0,low,med,4,2,big,low,unacc
1,vhigh,med,2,2,med,high,unacc
2,vhigh,high,4,4,big,high,unacc
3,high,vhigh,4,2,small,med,unacc
4,vhigh,high,5more,2,small,high,unacc
...,...,...,...,...,...,...,...
328,high,low,2,more,big,low,unacc
329,high,med,4,2,big,low,unacc
330,high,med,4,4,big,high,unacc
331,high,med,2,4,med,low,unacc
