In [9]:
# COMP30027 Machine learning Project 1: Gaining Information about Naive Bayes
# Author: Jordan Ung <jordanu@student.unimelb.edu.au> [729938]
# Last Modified: 04.04.19

#### GENERAL GUIDE
# Read In And Inspect The Data
# Check for missing value - (1) delete rows with missing values, (2) Impute the missing values with dataset
# Check for anomalously extreme values

# TODO

# Work on Question 1, and Question 4
# Implement a Cross Evaluation Evaluator

In [10]:
# Import dependencies
import math

In [11]:
# Preprocess takes the name of a file and returns a list of instances
# within that file, with each instance containing a list of attributes
def preprocess(file_name):
    dataset = []
    with open(file_name, 'r') as file:
        # Add each instance to a list to be used later
        for line in file.readlines():
            dataset.append(line.strip().split(','))
            
    # Group all the instances with the same class together
    dataset = sorted(dataset, key=lambda x: x[-1])
    return dataset

all_files = ['anneal.csv', 'breast-cancer.csv', 'car.csv', 'cmc.csv', 'hepatitis.csv', 'hypothyroid.csv', 'mushroom.csv', 'nursery.csv', 'primary-tumor.csv']
new_file = "_unit_test.csv"

In [12]:
# Train takes a list of instances and returns a 3-tuple containing:
# A dictionary of the class distribution of all classes in the dataset
# A list of dictionaries, tallying each attribute value for every attribute
# A dictionary (for each class) of lists of dictionaries tallying 
# attribute values for every attribute of a particular class
def train(instance_list, missing_value):
    data_info = ({}, [], {})

    current_class = instance_list[0][-1]
    data_info[0][current_class] = 0
    data_info[2][current_class] = []
    class_index = 0
    
    # Add attribute lists to store the unique attribute values in
    for i in range(len(instance_list[0]) - 1):
        data_info[1].append({})
        data_info[2][current_class].append({})
    
    # Tally each value in each attribute for each class
    for data in instance_list:
#         print(data)
        attribute_num = 0

        # New class has been detected
        if data[-1] != current_class:

            # Add data structure to support new class
            current_class = data[-1]
            data_info[0][current_class] = 0
            data_info[2][current_class] = []
            class_index += 1
            
            # Add dictionary for each attribute
            for i in range(len(data) - 1):
                data_info[2][current_class].append({})
        
        # Input each instance's attribute into the appropriate dictionary
        for attribute in data[:-1]:
            
            if attribute in data_info[1][attribute_num]:
                data_info[1][attribute_num][attribute] += 1
            else:
                data_info[1][attribute_num][attribute] = 1
                
            if attribute in data_info[2][current_class][attribute_num]:
                data_info[2][current_class][attribute_num][attribute] += 1
            else:
                data_info[2][current_class][attribute_num][attribute] = 1
            attribute_num += 1
        
        data_info[0][current_class] += 1
        
    return data_info # Return the 3-tuple


# train(preprocess(missing_value_files[0]), "?")
# train(preprocess(new_file), "?")

# train(preprocess(missing_value_files[-1]), "?")

In [13]:
# Predict function takes two arguments, a learner model and a
# dataset and attempts to predict the class of a certain instance
def predict(model, instances):
    predicted_classes = []
    possible_classes = list(model[0].keys())
    
    # Find class prediction for each instance
    for data in instances:
        probability_of_class = 1.0
        class_probabilities = []
        
        # Calculate probability of the instance belonging to a particular class
        for class_name in possible_classes:
            attribute_list = model[2][class_name]
            
            # Multiply all of the attribute probabilities
            for attribute in range(len(attribute_list)):
                if data[attribute] in attribute_list[attribute]:
                    probability_of_class *= (attribute_list[attribute][data[attribute]] + 1) / (len(model[1][attribute].keys()) + model[0][class_name])
                else:
                    probability_of_class /= (len(model[1][attribute].keys()) + model[0][class_name])
            probability_of_class *= model[0][class_name] / sum(model[0].values())
            class_probabilities.append(probability_of_class)
            probability_of_class = 1.0
        
        class_index = 0
        highest_probability = 0
        # Predict the class with the highest probability
        for i in range(len(class_probabilities)):
            if class_probabilities[i] > highest_probability:
                highest_probability = class_probabilities[i]
                class_index = i
        predicted_classes.append(possible_classes[class_index])
        
    return predicted_classes # Return a list of predicted classes

In [20]:
# Evaluates the performance of the predictor model
# The metric/s evaluated are as follows: Accuracy
def evaluate(predictions, dataset):
    tries = 0
    correct = 0
    
    for i in range(len(predictions)):
        if predictions[i] == dataset[i][-1]:
            correct += 1
        tries += 1
        
    print("Correct:", correct, "out of", tries)
    print("Accuracy Rate (%): ", round(correct / tries * 100, 2))
    print("-----------------------------------")
    return


for i in all_files:
    print(i)
    evaluate(predict(train(preprocess(i), "?"), preprocess(i)), preprocess(i))


anneal.csv
Correct: 828 out of 898
Accuracy Rate (%):  92.2
-----------------------------------
breast-cancer.csv
Correct: 216 out of 286
Accuracy Rate (%):  75.52
-----------------------------------
car.csv
Correct: 1506 out of 1728
Accuracy Rate (%):  87.15
-----------------------------------
cmc.csv
Correct: 745 out of 1473
Accuracy Rate (%):  50.58
-----------------------------------
hepatitis.csv
Correct: 130 out of 155
Accuracy Rate (%):  83.87
-----------------------------------
hypothyroid.csv
Correct: 3011 out of 3163
Accuracy Rate (%):  95.19
-----------------------------------
mushroom.csv
Correct: 7772 out of 8124
Accuracy Rate (%):  95.67
-----------------------------------
nursery.csv
Correct: 11703 out of 12960
Accuracy Rate (%):  90.3
-----------------------------------
primary-tumor.csv
Correct: 192 out of 339
Accuracy Rate (%):  56.64
-----------------------------------


In [27]:
# Calculate Information Gain of an attribute given the root node
# In other words, which attribute is best to split the instances
def info_gain(model):
    info_gain_values = []
    print(model)
    
    # Calculate Entropy of Root Node, a.k.a class distribution entropy
    root_entropy = 0
    for class_name in model[0]:
        pr_attribute = model[0][class_name] * 1.0 / sum(model[0].values())
        root_entropy -= pr_attribute * math.log2(pr_attribute)
    
    # Traverse each attribute in model
    for attribute_index in range(len(model[1])):
        mean_info_list = []
        mean_info = 0
        
        # Calculate entropy of each unique attribute value
        for attribute in model[1][attribute_index]:
            print("Attribute:", attribute)
            entropy = 0
            attribute_freq = []
            # Append each class' attribute's frequency
            for class_index in model[0].keys():
                if attribute in model[2][class_index][attribute_index]:
                    attribute_freq.append(model[2][class_index][attribute_index][attribute])
            
            # Calculate entropy and add to mean_info
            if len(attribute_freq) == 1:
                continue
            else:
                for element in attribute_freq:
                    probability = element * 1.0 / sum(attribute_freq)
                    entropy -= probability * math.log2(probability)
            
            # Calculate Mean Info of an attribute
            mean_info = entropy * (model[1][attribute_index][attribute] / sum(model[1][attribute_index].values()))
            mean_info_list.append(mean_info)
            
        # Calculate the IG for an attribute with respect to the root node
        for value in mean_info_list:
            info_gain_values.append(root_entropy - value)
    return info_gain_values

info_gain(train(preprocess(new_file), "?"))
# for i in all_files:
#     print(i)
#     info_gain(train(preprocess(i), "?"))

({'cold': 2, 'flu': 3}, [{'no': 1, 'mild': 2, 'severe': 2}, {'severe': 2, 'no': 1, 'mild': 2}, {'normal': 4, 'high': 1}, {'yes': 4, 'no': 1}], {'cold': [{'no': 1, 'mild': 1}, {'severe': 1, 'no': 1}, {'normal': 2}, {'yes': 1, 'no': 1}], 'flu': [{'severe': 2, 'mild': 1}, {'mild': 2, 'severe': 1}, {'high': 1, 'normal': 2}, {'yes': 3}]})
0.9709505944546686
Attribute: no
Attribute: mild
Attribute: severe
{'no': 1, 'mild': 2, 'severe': 2}
Attribute: severe
Attribute: no
Attribute: mild
{'severe': 2, 'no': 1, 'mild': 2}
Attribute: normal
Attribute: high
{'normal': 4, 'high': 1}
Attribute: yes
Attribute: no
{'yes': 4, 'no': 1}
[0.5709505944546686, 0.5709505944546686, 0.17095059445466854, 0.3219280948873623]


[0.5709505944546686,
 0.5709505944546686,
 0.17095059445466854,
 0.3219280948873623]

In [28]:
'''
Question 1: The Naive Bayes classifiers can be seen to vary, in terms of their effectiveness on 
the given datasets (e.g. in terms of Accuracy). Consider the Information Gain of each attribute, 
relative to the class distribution - does this help to explain the classifiers' behaviour? 
Identify any results that are particularly surprising, and explain why they occur.


'''

"\nQuestion 1: The Naive Bayes classifiers can be seen to vary, in terms of their effectiveness on \nthe given datasets (e.g. in terms of Accuracy). Consider the Information Gain of each attribute, \nrelative to the class distribution - does this help to explain the classifiers' behaviour? \nIdentify any results that are particularly surprising, and explain why they occur.\n\n\n"

In [33]:
2/5*math.log2(5/2) + 3/5*math.log2(5/3)

0.9709505944546687