In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the data set from the CSV file that sits next to the notebook
data = pd.read_csv('data.csv')

# Preview the first five rows (five is the default for head())
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
# Split the dataset into train and test parts.
# The rows are drawn from a seeded generator so that Restart & Run All always
# produces the same partition (and therefore the same accuracy further down).

ratio = 0.8   # share of rows that goes into the training part
seed = 42     # fixed seed: makes the random split reproducible
n_train = int(ratio * len(data))
n_test = len(data) - n_train

# Boolean mask over row positions: True -> training row, False -> test row
indexes = np.zeros(len(data), dtype=bool)
rng = np.random.RandomState(seed)
indexes[rng.choice(len(data), n_train, replace=False)] = True

# .copy() detaches both parts from `data`, so adding columns to them later
# neither touches the original frame nor triggers SettingWithCopyWarning
train_data = data[indexes].copy()
test_data = data[~indexes].copy()

In [4]:
# Name of the label column and the list of all column names of the data set.
# Note: `features` still contains the label column; it is filtered out where
# the list is consumed.
target_column = 'class'
features = list(data.columns)

# Prior probability of each class = its share among the training rows
possible_classes = train_data[target_column].unique()
class_counts = train_data[target_column].value_counts()
p_class = {
    class_name: class_counts[class_name] / len(train_data)
    for class_name in possible_classes
}
p_class

{'acc': 0.22431259044862517,
 'good': 0.041244573082489147,
 'unacc': 0.69536903039073805,
 'vgood': 0.039073806078147609}

In [5]:
# Build the likelihood table. Each key is (class, feature, feature value) and
# each value is the share of training rows of that class that carry the value.

likelyhood_table = {}
for class_name in possible_classes:
    # training rows labelled with the current class
    only_current_class = train_data[train_data[target_column] == class_name]
    class_size = len(only_current_class)

    for feature in features:
        # the label column is not a feature
        if feature == target_column:
            continue

        # values seen anywhere in the training data vs. counts inside the class
        feature_values = train_data[feature].unique()
        counts_in_class = only_current_class[feature].value_counts()

        recount = False
        for feature_value in feature_values:
            if feature_value in counts_in_class:
                value = counts_in_class[feature_value] * 1.0 / class_size
            else:
                # value never occurs together with this class
                value = 0
                recount = True
            likelyhood_table[(class_name, feature, feature_value)] = value

        if not recount:
            continue

        # Avoid zero probabilities: when at least one value was never seen in
        # this class, every estimate for the feature is rescaled as if the
        # class had one extra row, and each unseen value receives that row.
        for feature_value in feature_values:
            key = (class_name, feature, feature_value)
            previous = likelyhood_table[key]
            if previous == 0:
                likelyhood_table[key] = 1 / (class_size + 1)
            else:
                likelyhood_table[key] = (previous * class_size) / (class_size + 1)
likelyhood_table

{('acc', 'buying', 'high'): 0.27741935483870966,
 ('acc', 'buying', 'low'): 0.22580645161290322,
 ('acc', 'buying', 'med'): 0.3032258064516129,
 ('acc', 'buying', 'vhigh'): 0.19354838709677419,
 ('acc', 'doors', '2'): 0.2129032258064516,
 ('acc', 'doors', '3'): 0.26451612903225807,
 ('acc', 'doors', '4'): 0.24516129032258063,
 ('acc', 'doors', '5more'): 0.27741935483870966,
 ('acc', 'lug_boot', 'big'): 0.3774193548387097,
 ('acc', 'lug_boot', 'med'): 0.35161290322580646,
 ('acc', 'lug_boot', 'small'): 0.2709677419354839,
 ('acc', 'maint', 'high'): 0.28387096774193549,
 ('acc', 'maint', 'low'): 0.23225806451612904,
 ('acc', 'maint', 'med'): 0.29999999999999999,
 ('acc', 'maint', 'vhigh'): 0.18387096774193548,
 ('acc', 'persons', '2'): 0.003215434083601286,
 ('acc', 'persons', '4'): 0.50482315112540188,
 ('acc', 'persons', 'more'): 0.49196141479099681,
 ('acc', 'safety', 'high'): 0.52090032154340837,
 ('acc', 'safety', 'low'): 0.003215434083601286,
 ('acc', 'safety', 'med'): 0.4758842443

In [6]:
# Name of the column that receives the predicted class
predicted = 'predicted'

# Columns used for scoring: everything except the label and a prediction
# column left over from a previous run (keeps the cell re-runnable)
scoring_attrs = [attr for attr in test_data.columns
                 if attr != target_column and attr != predicted]

# Predict a class for every test row.
# DataFrame.get_value / set_value no longer exist in current pandas, so cells
# are read with `.at` and the predictions are attached in a single step
# instead of rebuilding the frame once per row while iterating over it.
predictions = []
for row_label in test_data.index:

    # score of every class: prior * product of the per-feature likelihoods
    possible = {}
    for possible_class in possible_classes:
        possible_value = p_class[possible_class]
        for attr in scoring_attrs:
            attr_value = test_data.at[row_label, attr]
            # a (class, feature, value) combination missing from the table
            # contributes 0 and therefore rules the class out for this row
            possible_value *= likelyhood_table.get((possible_class, attr, attr_value), 0)
        possible[possible_class] = possible_value

    # pick the class with the highest score; '-' marks rows for which every
    # class scored 0
    max_value = 0
    result_class = '-'
    for key, value in possible.items():
        if value > max_value:
            max_value = value
            result_class = key

    predictions.append(result_class)

# attach the predictions as a new column; assign() returns a new frame, so
# the frame `test_data` was sliced from is left untouched
test_data = test_data.assign(**{predicted: predictions})

In [7]:
def accuracy(test_data):
    """Return the share of rows whose predicted class equals the true class.

    The true labels are read from the column named by the module-level
    `target_column`, the predictions from the column named by the
    module-level `predicted`.
    """
    is_correct = test_data[target_column] == test_data[predicted]
    return sum(is_correct) / len(test_data)

In [8]:
# Report the accuracy on the test rows, then preview the first predictions
test_accuracy = accuracy(test_data)
print('Accuracy = ', test_accuracy)
test_data.head(10)

Accuracy =  0.893063583815


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class,predicted
1,vhigh,vhigh,2,2,small,med,unacc,unacc
6,vhigh,vhigh,2,2,big,low,unacc,unacc
12,vhigh,vhigh,2,4,med,low,unacc,unacc
13,vhigh,vhigh,2,4,med,med,unacc,unacc
23,vhigh,vhigh,2,more,med,high,unacc,unacc
25,vhigh,vhigh,2,more,big,med,unacc,unacc
34,vhigh,vhigh,3,2,big,med,unacc,unacc
35,vhigh,vhigh,3,2,big,high,unacc,unacc
47,vhigh,vhigh,3,more,small,high,unacc,unacc
50,vhigh,vhigh,3,more,med,high,unacc,unacc
