In [6]:
# COMP30027 Machine learning Project 1: Gaining Information about Naive Bayes
# Author: Jordan Ung <jordanu@student.unimelb.edu.au> [729938]
# Last Modified: 04.04.19

#### GENERAL GUIDE
# Read In And Inspect The Data
# Check for missing value - (1) delete rows with missing values, (2) Impute the missing values with dataset
# Check for anomalously extreme values

# TODO

# Work on Question 1, and Question 4
# Implement a Cross Evaluation Evaluator

In [7]:
# Import dependencies
import math
import random

In [8]:
# Preprocess takes the name of a file and returns a list of instances
# within that file, with each instance containing a list of attributes
def preprocess(file_name):
    dataset = []
    with open(file_name, 'r') as file:
        # Add each instance to a list to be used later
        for line in file.readlines():
            dataset.append(line.strip().split(','))
            
    # Group all the instances with the same class together
    dataset = sorted(dataset, key=lambda x: x[-1])
    return dataset

all_files = ['anneal.csv', 'breast-cancer.csv', 'car.csv', 'cmc.csv', 'hepatitis.csv', 'hypothyroid.csv', 'mushroom.csv', 'nursery.csv', 'primary-tumor.csv']
new_file = "_unit_test.csv"

In [9]:
# Train takes a list of instances and returns a 3-tuple containing:
# A dictionary of the class distribution of all classes in the dataset
# A list of dictionaries, tallying each attribute value for every attribute
# A dictionary (for each class) of lists of dictionaries tallying 
# attribute values for every attribute of a particular class
def train(instance_list):
    data_info = ({}, [], {})

    current_class = instance_list[0][-1]
    data_info[0][current_class] = 0
    data_info[2][current_class] = []
    class_index = 0
    
    # Add attribute lists to store the unique attribute values in
    for i in range(len(instance_list[0]) - 1):
        data_info[1].append({})
        data_info[2][current_class].append({})
    
    # Tally each value in each attribute for each class
    for data in instance_list:
#         print(data)
        attribute_num = 0

        # New class has been detected
        if data[-1] != current_class:

            # Add data structure to support new class
            current_class = data[-1]
            data_info[0][current_class] = 0
            data_info[2][current_class] = []
            class_index += 1
            
            # Add dictionary for each attribute
            for i in range(len(data) - 1):
                data_info[2][current_class].append({})
        
        # Input each instance's attribute into the appropriate dictionary
        for attribute in data[:-1]:
            
            if attribute in data_info[1][attribute_num]:
                data_info[1][attribute_num][attribute] += 1
            else:
                data_info[1][attribute_num][attribute] = 1
                
            if attribute in data_info[2][current_class][attribute_num]:
                data_info[2][current_class][attribute_num][attribute] += 1
            else:
                data_info[2][current_class][attribute_num][attribute] = 1
            attribute_num += 1
        
        data_info[0][current_class] += 1
        
    return data_info # Return the 3-tuple


# train(preprocess(missing_value_files[0]), "?")
# train(preprocess(new_file), "?")

# train(preprocess(missing_value_files[-1]), "?")

In [10]:
# Predict function takes two arguments, a learner model and a
# dataset and attempts to predict the class of a certain instance
def predict(model, instances):
    predicted_classes = []
    possible_classes = list(model[0].keys())
    
    # Find class prediction for each instance
    for data in instances:
        probability_of_class = 1.0
        class_probabilities = []
        
        # Calculate probability of the instance belonging to a particular class
        for class_name in possible_classes:
            attribute_list = model[2][class_name]
            
            # Multiply all of the attribute probabilities
            for attribute in range(len(attribute_list)):
                if data[attribute] in attribute_list[attribute]:
                    probability_of_class *= (attribute_list[attribute][data[attribute]] + 1) / (len(model[1][attribute].keys()) + model[0][class_name])
                else:
                    probability_of_class /= (len(model[1][attribute].keys()) + model[0][class_name])
            probability_of_class *= model[0][class_name] / sum(model[0].values())
            class_probabilities.append(probability_of_class)
            probability_of_class = 1.0
        
        class_index = 0
        highest_probability = 0
        # Predict the class with the highest probability
        for i in range(len(class_probabilities)):
            if class_probabilities[i] > highest_probability:
                highest_probability = class_probabilities[i]
                class_index = i
        predicted_classes.append(possible_classes[class_index])
        
    return predicted_classes # Return a list of predicted classes

In [11]:
# Evaluates the performance of the predictor model
# The metric/s evaluated are as follows: Accuracy
def evaluate(predictions, dataset):
    tries = 0
    correct = 0
    
    for i in range(len(predictions)):
        if predictions[i] == dataset[i][-1]:
            correct += 1
        tries += 1
        
    print("Correct:", correct, "out of", tries)
    print("Accuracy Rate (%): ", round(correct / tries * 100, 2))
    print("-----------------------------------")
    return (correct / tries)


for i in all_files:
    print(i)
    evaluate(predict(train(preprocess(i)), preprocess(i)), preprocess(i))


anneal.csv
Correct: 828 out of 898
Accuracy Rate (%):  92.2
-----------------------------------
breast-cancer.csv
Correct: 216 out of 286
Accuracy Rate (%):  75.52
-----------------------------------
car.csv
Correct: 1506 out of 1728
Accuracy Rate (%):  87.15
-----------------------------------
cmc.csv
Correct: 745 out of 1473
Accuracy Rate (%):  50.58
-----------------------------------
hepatitis.csv
Correct: 130 out of 155
Accuracy Rate (%):  83.87
-----------------------------------
hypothyroid.csv
Correct: 3011 out of 3163
Accuracy Rate (%):  95.19
-----------------------------------
mushroom.csv
Correct: 7772 out of 8124
Accuracy Rate (%):  95.67
-----------------------------------
nursery.csv
Correct: 11703 out of 12960
Accuracy Rate (%):  90.3
-----------------------------------
primary-tumor.csv
Correct: 192 out of 339
Accuracy Rate (%):  56.64
-----------------------------------


In [12]:
# Calculate Information Gain of an attribute given the root node
# In other words, which attribute is best to split the instances
def info_gain(model):
    info_gain_values = []
    print(model)
    
    # Calculate Entropy of Root Node, a.k.a class distribution entropy
    root_entropy = 0
    for class_name in model[0]:
        pr_attribute = model[0][class_name] * 1.0 / sum(model[0].values())
        root_entropy -= pr_attribute * math.log2(pr_attribute)
    
    # Traverse each attribute in model
    for attribute_index in range(len(model[1])):
        mean_info_list = []
        mean_info = 0
        
        # Calculate entropy of each unique attribute value
        for attribute in model[1][attribute_index]:
            entropy = 0
            attribute_freq = []
            # Append each class' attribute's frequency
            for class_index in model[0].keys():
                if attribute in model[2][class_index][attribute_index]:
                    attribute_freq.append(model[2][class_index][attribute_index][attribute])
            
            # Calculate entropy and add to mean_info
            if len(attribute_freq) == 1:
                continue
            else:
                for element in attribute_freq:
                    probability = element * 1.0 / sum(attribute_freq)
                    entropy -= probability * math.log2(probability)
            
            # Calculate Mean Info of an attribute
            mean_info = entropy * (model[1][attribute_index][attribute] / sum(model[1][attribute_index].values()))
            mean_info_list.append(mean_info)
            
        # Calculate the IG for an attribute with respect to the root node
        for value in mean_info_list:
            info_gain_values.append(root_entropy - value)
    return info_gain_values

In [24]:
'''
Question 1: The Naive Bayes classifiers can be seen to vary, in terms of their effectiveness on 
the given datasets (e.g. in terms of Accuracy). Consider the Information Gain of each attribute, 
relative to the class distribution - does this help to explain the classifiers' behaviour? 
Identify any results that are particularly surprising, and explain why they occur.

The two files below, mushroom.csv and cmc.csv are the two datasets with the largest difference
in accuracies of classes predicted by the classifier, with mushroom.csv having a 95.67% accuracy
while only a 50.58% accuracy from cmc.csv. By initially looking at the information gained from each
attribute in both datasets, cmc.csv seems to have a much higher Information Gain per attribute on
average, but that is not factored in the amount of classes, which favours highly-branching class
distributions. By looking at some of the attributes - for example {'Islam': 1253, 'Non-Islam': 220},
it's surprising that this dataset has such a low accuracy. Though when looking at the class distributions
as a whole, one class has substancially less instances of that class, such that predicting that class relative
to the others is much more harder, leading to a more inaccuracte result. That is also why mushroom.csv has such
a good accuracy - there are enough instances of each class that the NB implementation is able to reliably
build a model that reflects the nature of, in this case both instances. So while there are highly branching
attributes within cmc.csv, they actually inflate the Information Gain and don't help build a reliable
predictor model. '''
print(info_gain(train(preprocess("mushroom.csv"))))
evaluate(predict(train(preprocess("mushroom.csv")), preprocess("mushroom.csv")), preprocess("mushroom.csv"))
print(info_gain(train(preprocess("cmc.csv"))))
evaluate(predict(train(preprocess("cmc.csv")), preprocess("cmc.csv")), preprocess("cmc.csv"))

'''
Question 4: Evaluating the model on the same data that we use to train the model is considered to be a major
mistake in Machine Learning. Implement a hold–out or cross–validation evaluation strategy.
How does your estimate of effectiveness change, compared to testing on the training data?
Explain why. (The result might surprise you!)
'''

def crosseval(file, m):
    dataset = preprocess(file)
    accuracy = 0
    # Split instances into m partitions to be cross evaluated
    data_fragments = []
    size_of_partitions = math.floor(len(dataset) / m)
    remainder = len(dataset) % m
    
    for i in range(m):
        data_fragments.append([])
    
    # Randomise the ordering of the dataset and partition
    list_num = 0
    for data_index in range(len(dataset)):
        other_index = random.randint(data_index,len(dataset) - 1)
        dataset[data_index], dataset[other_index] = dataset[other_index], dataset[data_index]

        data_fragments[list_num % m].append(dataset[data_index])
        list_num += 1
    
    # Perform Cross Validation on each of the m folds
    for fragment_index in range(m):
        new_dataset = []
        test_data = data_fragments[fragment_index]
        
        # Prepare all the training data
        for index in range(m):
            if index != fragment_index:
                new_dataset += data_fragments[index]
        
        print("Partition:", fragment_index)
        accuracy += evaluate(predict(train(new_dataset), test_data), test_data)
    print("Overall Accuracy for", file, "is (%):", (accuracy / m * 100))
    return

# crosseval("mushroom.csv", 10)


({'e': 4208, 'p': 3916}, [{'x': 3656, 'b': 452, 's': 32, 'f': 3152, 'k': 828, 'c': 4}, {'s': 2556, 'y': 3244, 'f': 2320, 'g': 4}, {'y': 1072, 'w': 1040, 'g': 1840, 'n': 2284, 'e': 1500, 'b': 168, 'u': 16, 'c': 44, 'p': 144, 'r': 16}, {'t': 3376, 'f': 4748}, {'a': 400, 'l': 400, 'n': 3528, 'p': 256, 'f': 2160, 'c': 192, 'y': 576, 's': 576, 'm': 36}, {'f': 7914, 'a': 210}, {'c': 6812, 'w': 1312}, {'b': 5612, 'n': 2512}, {'k': 408, 'n': 1048, 'g': 752, 'w': 1202, 'p': 1492, 'h': 732, 'u': 492, 'e': 96, 'y': 86, 'o': 64, 'b': 1728, 'r': 24}, {'e': 3516, 't': 4608}, {'c': 556, 'e': 1120, 'b': 3776, 'r': 192, '?': 2480}, {'s': 5176, 'f': 552, 'k': 2372, 'y': 24}, {'s': 4936, 'f': 600, 'y': 284, 'k': 2304}, {'w': 4464, 'g': 576, 'p': 1872, 'e': 96, 'o': 192, 'n': 448, 'b': 432, 'c': 36, 'y': 8}, {'w': 4384, 'p': 1872, 'g': 576, 'n': 512, 'e': 96, 'o': 192, 'b': 432, 'y': 24, 'c': 36}, {'p': 8124}, {'w': 7924, 'n': 96, 'o': 96, 'y': 8}, {'o': 7488, 't': 600, 'n': 36}, {'p': 3968, 'e': 2776, 'f

In [19]:
m = []
t = [10,5]
s = [2, 3]

print(m+t+s)

[10, 5, 2, 3]


In [2]:
a = [12, 22, 32, 42]


[12, 22, 32, 42]
[12, 32, 22, 42]
