In [1]:
# Import
import math
import random
from itertools import permutations

In [2]:
# Helper functions
def calculate_prior(prior_dict, data_dict):
    """Average per-instance class weights into a prior distribution.

    Parameters
    ----------
    prior_dict: dictionary mapping each class to its starting total
                (updated in place)
    data_dict: dictionary mapping an instance key to a dictionary of
               class -> weight

    Returns
    -------
    prior_dict: the same dictionary object, now holding, for every class,
                its accumulated weight divided by the number of instances
    """
    # Accumulate the weight every instance assigns to each class
    for weights in data_dict.values():
        for label, weight in weights.items():
            prior_dict[label] += weight

    # Turn the totals into averages over the number of instances
    instance_count = len(data_dict)
    for label in prior_dict:
        prior_dict[label] = prior_dict[label] / instance_count

    return prior_dict

def print_matrix(data, predict_list):
    """Print the confusion matrix and accuracy of a list of predictions.

    Parameters
    ----------
    data: 2-D array of instances; the last column of every row is the
          actual class
    predict_list: list of predicted classes, aligned with the rows of data

    Notes
    -----
    Rows of the matrix are actual classes and columns are predicted classes.
    Classes are listed in the order they first appear in data; a predicted
    class that never occurs as an actual class is appended after them so it
    gets its own row/column instead of raising a KeyError.
    For empty data the accuracy is reported as 0.0.
    """
    true_list = [line[-1] for line in data]

    # Every class that has to appear in the matrix: actual classes first,
    # then any class that only shows up in the predictions
    labels = list(dict.fromkeys(true_list))
    for predicted in predict_list[:len(true_list)]:
        if predicted not in labels:
            labels.append(predicted)

    # class_dict[actual][predicted] -> number of instances
    class_dict = {actual: {predicted: 0 for predicted in labels} for actual in labels}

    # Fill the matrix and count the correct predictions in one pass
    count = 0
    for i, actual in enumerate(true_list):
        predicted = predict_list[i]
        class_dict[actual][predicted] += 1
        if actual == predicted:
            count += 1

    # Print format of confusion matrix (backslash escaped explicitly: '\P'
    # is not a valid escape sequence)
    print('{0:>20s}|'.format('Actual\\Predict'), end='')
    for label in labels:
        print('{0:>20s}|'.format(label), end='')
    print()
    for label in labels:
        print('{0:>20s}|'.format(label), end='')
        for cell in class_dict[label].values():
            print('{0:>20d}|'.format(cell), end='')
        print()

    # Guard the division so an empty dataset does not crash the report
    accuracy = count / len(data) if data else 0.0
    print("\nAccuracy: {0}\n".format(accuracy))

    return

In [15]:
# This function should open a data file in csv, and transform it into a usable format 

# Default datasets to evaluate; the training and evaluation code below reads
# the class label from the last column of every row.
file_list = [ 'car.csv', 'hypothyroid.csv']

def preprocess(filename):
    """Read csv file from input filename

    Parameters
    ----------
    filename: name of csv file

    Returns
    -------
    data: 2-D array of data from csv file; one list of string fields per
          non-blank line, split on commas

    Notes
    -----
    Blank lines (for example a trailing empty line at the end of the file)
    are skipped, so they do not turn into bogus single-field rows.
    Fields are split on every comma; quoted fields are not interpreted.
    """
    data = []

    # The context manager closes the file even if reading fails part-way
    with open(filename, 'r') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue
            data.append(stripped.split(','))

    return data

In [16]:
# This function should build a supervised NB model
def train_supervised(data):
    """Build a supervised Naive Bayes model

    Parameters
    ----------
    data: 2-D array of data from csv file; the last column of every row is
          the class, all other columns are attributes

    Returns
    -------
    prior_dict: dictionary of prior probability (class -> share of instances)
    poste_dict: 3-D dictionary indexed as poste_dict[attribute index][class]
                [attribute value], holding the share of that value among the
                observed values of the attribute within the class (called
                "posterior probability" throughout this file)

    Notes
    -----
    Values marked with ? are treated as missing and are left out of the
    counts, consistent with predict_supervised and the unsupervised trainer,
    which both skip them.
    Empty data returns two empty dictionaries.
    """
    # Prior probability
    prior_dict = {}
    # Posterior probability
    poste_dict = {}

    if not data:
        return prior_dict, poste_dict

    # Count the instances of every class (last column) ...
    for line in data:
        clas = line[-1]
        prior_dict[clas] = prior_dict.get(clas, 0) + 1

    # ... and divide by the number of instances to get the prior probability
    instance_count = len(data)
    for clas in prior_dict:
        prior_dict[clas] = prior_dict[clas] / instance_count

    # Calculate posterior probability, one attribute column at a time
    for att in range(len(data[0]) - 1):
        # Every class gets an entry, even if all of its values are missing
        clas_dict = {clas: {} for clas in prior_dict}

        for line in data:
            value = line[att]
            # Ignore missing value marked with ?
            if value == '?':
                continue
            counts = clas_dict[line[-1]]
            counts[value] = counts.get(value, 0) + 1

        # Normalise the counts of every class by its number of observed values
        for counts in clas_dict.values():
            observed = sum(counts.values())
            for value in counts:
                counts[value] = counts[value] / observed

        poste_dict[att] = clas_dict

    return prior_dict, poste_dict

In [17]:
# This function should predict the class for a set of instances, based on a trained model 
def _log_probability(probability):
    """Return log(probability), or -inf for a probability that is not positive."""
    if probability > 0:
        return math.log(probability)
    return float('-inf')


def predict_supervised(data, prior_dict, poste_dict):
    """Predict the class based on a trained model

    Parameters
    ----------
    data: 2-D array of instances; the last column of every row is skipped,
          all other columns are used as attributes
    prior_dict: dictionary of prior probability
    poste_dict: 3-D dictionary of posterior probability, indexed as
                poste_dict[attribute index][class][attribute value]

    Returns
    -------
    predict_list: list of predicted classes

    Notes
    -----
    Scores are summed in log space instead of multiplying raw probabilities,
    so rows with many attributes no longer underflow to 0.0 for every class
    (which made the first class win regardless of the evidence).
    """
    predict_list = []
    # Number of instances being predicted; used to scale the epsilon below
    sum_value = len(data)

    for line in data:
        predict_dict = {}
        for key, value in prior_dict.items():
            # Initial score is the log of the prior probability
            score = _log_probability(value)
            for index in range(len(line) - 1):
                att = line[index]
                # Ignore missing value marked with ?
                if att == '?':
                    continue
                # Check attribute have corresponding posterior probability
                if att in poste_dict[index][key]:
                    score += _log_probability(poste_dict[index][key][att])
                else:
                    # Epsilon smoothing if no value exists
                    score += _log_probability(0.01 / sum_value)
            predict_dict[key] = score
        # Append the class with the highest score; ties go to the class
        # listed first in prior_dict
        predict_list.append(max(predict_dict, key=predict_dict.get))

    return predict_list

In [18]:
# This function should evaluate a set of predictions, in a supervised context
def evaluate_supervised(data, filename, predict_list):
    """Report how well the supervised NB predictions match the data

    Parameters
    ----------
    data: 2-D array of data from csv file
    filename: name of the dataset file; the part before the first '.' is
              shown in the banner
    predict_list: list of predicted classes
    """
    dataset_name = filename.split('.')[0]
    banner = '{0:*^105}'.format('supervised' + ' ' + dataset_name)
    print(banner)

    # Confusion matrix and accuracy
    print_matrix(data, predict_list)

In [19]:
# Main function for supervised NB
def supervised(file_list):
    """Run the supervised NB pipeline on every dataset in file_list.

    Each file is loaded, a model is trained on it, the same instances are
    predicted with that model, and the result is printed.
    """
    for dataset in file_list:
        rows = preprocess(dataset)
        priors, posteriors = train_supervised(rows)
        predictions = predict_supervised(rows, priors, posteriors)
        evaluate_supervised(rows, dataset, predictions)
        
# Run the supervised pipeline on every dataset in file_list; this prints a
# confusion matrix and the accuracy for each one. Note that supervised()
# predicts the same instances it trained on.
supervised(file_list)

*********************************************supervised car**********************************************
      Actual\Predict|               unacc|                 acc|               vgood|                good|
               unacc|                1161|                  47|                   0|                   2|
                 acc|                  85|                 289|                   0|                  10|
               vgood|                   0|                  26|                  39|                   0|
                good|                   0|                  46|                   2|                  21|

Accuracy: 0.8738425925925926

*****************************************supervised hypothyroid******************************************
      Actual\Predict|         hypothyroid|            negative|
         hypothyroid|                   0|                 151|
            negative|                   0|                3012|

Accuracy: 0.9522605121719886



In [20]:
# This function should build an unsupervised NB model 
def train_unsupervised(data, iterations=5, seed=None):
    """Build an unsupervised Naive Bayes model

    Every instance starts with a random, normalised weight for each class.
    A model is estimated from those soft assignments, the class
    probabilities of every instance are computed from that model, and the
    probabilities are then refined by repeated calls to
    iterate_unsupervised.

    Parameters
    ----------
    data: 2-D array of data from csv file; the last column is only used to
          collect the class names, never to assign an instance to a class
    iterations: number of refinement passes through iterate_unsupervised
                (default 5, the value that used to be hard-coded)
    seed: optional seed for the random initial weights; with the default
          None the module-level random generator is used, exactly as before

    Returns
    -------
    prior_dict: dictionary of prior probability
    poste_dict: 3-D dictionary of posterior probability, indexed as
                poste_dict[attribute index][class][attribute value]
    prob_dict: 2-D dictionary of calculated probability, indexed as
               prob_dict[instance index][class]

    Notes
    -----
    prior_dict and poste_dict describe the model estimated from the random
    initial weights; only prob_dict is updated by the refinement passes,
    because iterate_unsupervised returns nothing but the probabilities.
    Empty data returns three empty dictionaries.
    """
    if not data:
        return {}, {}, {}

    # A private generator keeps seeded runs reproducible without touching
    # the global random state
    rng = random if seed is None else random.Random(seed)

    class_dict = dict([(line[-1], 0) for line in data])

    # Calculate random values: one weight per class for every instance
    random_dict = {}
    for index in range(len(data)):
        ins_dict = {}
        sum_random = 0
        for key in class_dict:
            random_value = rng.random()
            ins_dict[key] = random_value
            sum_random += random_value

        # Normalise random values into sum of 1
        for key in ins_dict:
            ins_dict[key] = ins_dict[key] / sum_random
        random_dict[index] = ins_dict

    # Prior probability: average weight of every class
    prior_dict = calculate_prior(class_dict.copy(), random_dict)

    # Posterior probability
    poste_dict = {}
    for att in range(len(data[0]) - 1):
        res_dict = {key: {} for key in class_dict}

        # Add the random distributions together per attribute value
        for i, line in enumerate(data):
            value = line[att]
            # Ignore missing value marked with ?
            if value == '?':
                continue
            for key in class_dict:
                bucket = res_dict[key]
                if value in bucket:
                    bucket[value] += random_dict[i][key]
                else:
                    bucket[value] = random_dict[i][key]

        # Normalise the random distribution within every class
        for bucket in res_dict.values():
            sum_value = sum(bucket.values())
            for value in bucket:
                bucket[value] = bucket[value] / sum_value

        poste_dict[att] = res_dict

    # Calculate the probability of predictions
    prob_dict = {}
    for ins, line in enumerate(data):
        value_dict = {}

        for cla in class_dict:
            score = prior_dict[cla]
            for att in range(len(line) - 1):
                # Ignore missing value marked with ?
                if line[att] != '?':
                    score *= poste_dict[att][cla][line[att]]
            value_dict[cla] = score

        # Normalise the probability of predictions
        sum_value = sum(value_dict.values())
        for key in value_dict:
            value_dict[key] = value_dict[key] / sum_value
        prob_dict[ins] = value_dict

    # Iterate prior and posterior probability
    for _ in range(iterations):
        prob_dict = iterate_unsupervised(data, prob_dict)

    return prior_dict, poste_dict, prob_dict

In [21]:
# Iterate unsupervised function
def iterate_unsupervised(data, prob_dict):
    """Run one refinement pass of the unsupervised model

    The current class probabilities of every instance are used as soft
    counts to re-estimate the prior and posterior probabilities, and new
    class probabilities are then computed from that re-estimated model.

    Parameters
    ----------
    data: 2-D array of data from csv file
    prob_dict: 2-D dictionary of calculated probability, indexed as
               prob_dict[instance index][class]

    Returns
    -------
    prob_dict: 2-D dictionary of new calculated probability
    """
    # Prior probability: average probability of every class over all instances
    priors = calculate_prior({row[-1]: 0 for row in data}, prob_dict)

    # Posterior probability, one attribute column at a time
    posteriors = {}
    for column in range(len(data[0]) - 1):
        weights = {label: {} for label in priors}

        for row_index, row in enumerate(data):
            observed = row[column]
            # Ignore missing value marked with ?
            if observed == '?':
                continue
            soft_counts = prob_dict[row_index]
            for label in priors:
                bucket = weights[label]
                if observed in bucket:
                    bucket[observed] += soft_counts[label]
                else:
                    bucket[observed] = soft_counts[label]

        # Normalise the accumulated weights within every class
        for bucket in weights.values():
            total = sum(bucket.values())
            for observed in bucket:
                bucket[observed] = bucket[observed] / total

        posteriors[column] = weights

    # Calculate the probability of predictions with the refreshed model
    refreshed = {}
    for row_index, row in enumerate(data):
        scores = {}
        for label, prior in priors.items():
            score = prior
            for column in range(len(row) - 1):
                # Ignore missing value marked with ?
                if row[column] != '?':
                    score *= posteriors[column][label][row[column]]
            scores[label] = score

        # Normalise the probability of predictions
        total = sum(scores.values())
        for label in scores:
            scores[label] = scores[label] / total
        refreshed[row_index] = scores

    return refreshed

In [22]:
# This function should predict the class distribution for a set of instances, based on a trained model
def predict_unsupervised(data, prob_dict):
    """Pick the most probable class of every instance

    Parameters
    ----------
    data: 2-D array of data from csv file
    prob_dict: 2-D dictionary of calculated probability

    Returns
    -------
    result: list holding, for each instance, the class with the highest
            probability in prob_dict
    classifier: list holding, for each entry of result, the position of that
                class in the list of distinct classes found in the last
                column of data (the order of that list follows set
                iteration order)
    """
    # Most probable class of every instance
    result = [max(prob_dict[i], key=prob_dict[i].get) for i in range(len(data))]

    # Position of every distinct class taken from the last column of data
    positions = {}
    for position, label in enumerate(list({row[-1] for row in data})):
        positions[label] = position

    # Translate the chosen classes into those positions
    classifier = [positions[label] for label in result if label in positions]

    return result, classifier

In [23]:
# This function should evaluate a set of predictions, in an unsupervised manner
def evaluate_unsupervised(data, filename, result, classifier):
    """Evaluate the predictions for unsupervised NB

    The class names attached to the clusters are arbitrary, so every cluster
    is relabelled with the predicted class name that agrees most often with
    the actual classes of its instances; the relabelled predictions are then
    reported.

    Parameters
    ----------
    data: 2-D array of data from csv file
    filename: name of file
    result: list of the most probable class of each instance
    classifier: list of cluster indices, aligned with result and data

    Notes
    -----
    As before, the same class name may be given to more than one cluster and
    only class names that occur in result are candidates. Because the
    clusters can therefore be relabelled independently, the best relabelling
    is a per-cluster majority vote; this replaces the former enumeration of
    permutations(class_list * class_num, class_num), which grows too fast to
    run for more than a few classes and raised IndexError when a cluster
    index was not smaller than the number of distinct predicted classes.
    Ties are resolved in favour of the class name that appears first in
    result.
    """
    # Actual class in data
    actual_list = [line[-1] for line in data]

    # Candidate class names, in order of first appearance in the predictions
    candidates = list(dict.fromkeys(result))

    # votes[cluster][actual class] -> number of instances
    votes = {}
    for cluster, actual in zip(classifier, actual_list):
        cluster_votes = votes.setdefault(cluster, {})
        cluster_votes[actual] = cluster_votes.get(actual, 0) + 1

    # Swapping method: give each cluster the candidate name that matches the
    # most actual classes among its instances
    mapping = {}
    for cluster, cluster_votes in votes.items():
        mapping[cluster] = max(candidates, key=lambda label: cluster_votes.get(label, 0))

    predict_list = [mapping[cluster] for cluster in classifier]

    print('{0:*^105}'.format("unsupervised" + ' ' + filename.split('.')[0]))
    # Print confusion matrix
    print_matrix(data, predict_list)

    return

In [24]:
# Main function for unsupervised NB
def unsupervised(file_list):
    """Run the unsupervised NB pipeline on every dataset in file_list.

    Each file is loaded, an unsupervised model is trained on it, the most
    probable class of every instance is picked, and the result is printed.
    """
    for dataset in file_list:
        rows = preprocess(dataset)
        priors, posteriors, probabilities = train_unsupervised(rows)
        labels, cluster_ids = predict_unsupervised(rows, probabilities)
        evaluate_unsupervised(rows, dataset, labels, cluster_ids)

# Run the unsupervised pipeline on every dataset in file_list; this prints a
# confusion matrix and the accuracy for each one.
unsupervised(file_list)

********************************************unsupervised car*********************************************
      Actual\Predict|               unacc|                 acc|               vgood|                good|
               unacc|                1210|                   0|                   0|                   0|
                 acc|                 384|                   0|                   0|                   0|
               vgood|                  65|                   0|                   0|                   0|
                good|                  69|                   0|                   0|                   0|

Accuracy: 0.7002314814814815

****************************************unsupervised hypothyroid*****************************************
      Actual\Predict|         hypothyroid|            negative|
         hypothyroid|                   0|                 151|
            negative|                   0|                3012|

Accuracy: 0.9522605121719886



In [25]:

# Additional datasets: run both pipelines on breast-cancer, hypothyroid and mushroom.
file_lst = ['breast-cancer.csv', 'hypothyroid.csv', 'mushroom.csv']
supervised(file_lst)
unsupervised(file_lst)

# Best result observed for unsupervised mushroom, kept here for reference.
# The accuracy is unstable between runs (range: 0.51~0.89), presumably because
# train_unsupervised starts from random weights and no seed is set.
# ******************************************unsupervised mushroom******************************************
#       Actual\Predict|                   p|                   e|
#                    p|                3089|                 827|
#                    e|                   1|                4207|
#
# Accuracy: 0.8980797636632201

****************************************supervised breast-cancer*****************************************
      Actual\Predict|   recurrence-events|no-recurrence-events|
   recurrence-events|                  46|                  39|
no-recurrence-events|                  31|                 170|

Accuracy: 0.7552447552447552

*****************************************supervised hypothyroid******************************************
      Actual\Predict|         hypothyroid|            negative|
         hypothyroid|                   0|                 151|
            negative|                   0|                3012|

Accuracy: 0.9522605121719886

*******************************************supervised mushroom*******************************************
      Actual\Predict|                   p|                   e|
                   p|                3867|                  49|
                   e|                  20|                4188|

Accuracy: 0.991506646971935

**************

In [26]:

# Re-run the supervised evaluation on the default datasets in file_list.
supervised(file_list)

*********************************************supervised car**********************************************
      Actual\Predict|               unacc|                 acc|               vgood|                good|
               unacc|                1161|                  47|                   0|                   2|
                 acc|                  85|                 289|                   0|                  10|
               vgood|                   0|                  26|                  39|                   0|
                good|                   0|                  46|                   2|                  21|

Accuracy: 0.8738425925925926

*****************************************supervised hypothyroid******************************************
      Actual\Predict|         hypothyroid|            negative|
         hypothyroid|                   0|                 151|
            negative|                   0|                3012|

Accuracy: 0.9522605121719886

