In [2]:
# COMP30027 Machine learning Project 1: Gaining Information about Naive Bayes
# Author: Jordan Ung <jordanu@student.unimelb.edu.au> [729938]
# Last Modified: 03.04.19

#### GENERAL GUIDE
# Read In And Inspect The Data
# Check for missing value - (1) delete rows with missing values, (2) Impute the missing values with dataset
# Check for anomalously extreme values

# TODO
# Fix Info_Gain(), there should not be a case where IG < 0
# Note, entropy with n unique values has max value of log2(n)

# Work on Question 1, and Question 4
# Implement a Cross Evaluation Evaluator

In [3]:
# Import dependencies
import math

In [4]:
# Preprocess takes the name of a file and returns a list of instances
# within that file, with each instance containing a list of attributes
def preprocess(file_name):
    dataset = []
    with open(file_name, 'r') as file:
        # Add each instance to a list to be used later
        for line in file.readlines():
            dataset.append(line.strip().split(','))
            
    # Group all the instances with the same class together
    dataset = sorted(dataset, key=lambda x: x[-1])
    return dataset

data_file = 'anneal.csv'
new_file = "_unit_test.csv"
missing_value_files = ['breast-cancer.csv', 'hepatitis.csv', 'hypothyroid.csv', 'mushroom.csv', 'primary-tumor.csv']

In [5]:
# Train takes a list of instances and returns a 3-tuple containing:
# A dictionary of the class distribution of all classes in the dataset
# A list of dictionaries, tallying each attribute value for every attribute
# A dictionary (for each class) of lists of dictionaries tallying 
# attribute values for every attribute of a particular class
def train(instance_list, missing_value):
    data_info = ({}, [], {})

    current_class = instance_list[0][-1]
    data_info[0][current_class] = 0
    data_info[2][current_class] = []
    class_index = 0
    
    # Add attribute lists to store the unique attribute values in
    for i in range(len(instance_list[0]) - 1):
        data_info[1].append({})
        data_info[2][current_class].append({})
    
    # Tally each value in each attribute for each class
    for data in instance_list:
#         print(data)
        attribute_num = 0

        # New class has been detected
        if data[-1] != current_class:

            # Add data structure to support new class
            current_class = data[-1]
            data_info[0][current_class] = 0
            data_info[2][current_class] = []
            class_index += 1
            
            # Add dictionary for each attribute
            for i in range(len(data) - 1):
                data_info[2][current_class].append({})
        
        # Input each instance's attribute into the appropriate dictionary
        for attribute in data[:-1]:
            
            if attribute in data_info[1][attribute_num]:
                data_info[1][attribute_num][attribute] += 1
            else:
                data_info[1][attribute_num][attribute] = 1
                
            if attribute in data_info[2][current_class][attribute_num]:
                data_info[2][current_class][attribute_num][attribute] += 1
            else:
                data_info[2][current_class][attribute_num][attribute] = 1
            attribute_num += 1
        
        data_info[0][current_class] += 1
        
    return data_info # Return the 3-tuple


# train(preprocess(missing_value_files[0]), "?")
# train(preprocess(new_file), "?")

# train(preprocess(missing_value_files[-1]), "?")

In [6]:
# Predict function takes two arguments, a learner model and a
# dataset and attempts to predict the class of a certain instance
def predict(model, instances):
    predicted_classes = []
    possible_classes = list(model[0].keys())
    
    # Find class prediction for each instance
    for data in instances:
        probability_of_class = 1.0
        class_probabilities = []
        
        # Calculate probability of the instance belonging to a particular class
        for class_name in possible_classes:
            attribute_list = model[2][class_name]
            
            # Multiply all of the attribute probabilities
            for attribute in range(len(attribute_list)):
                if data[attribute] in attribute_list[attribute]:
                    probability_of_class *= (attribute_list[attribute][data[attribute]] + 1) / (len(model[1][attribute].keys()) + model[0][class_name])
                else:
                    probability_of_class /= (len(model[1][attribute].keys()) + model[0][class_name])
            probability_of_class *= model[0][class_name] / sum(model[0].values())
            class_probabilities.append(probability_of_class)
            probability_of_class = 1.0
        
        class_index = 0
        highest_probability = 0
        # Predict the class with the highest probability
        for i in range(len(class_probabilities)):
            if class_probabilities[i] > highest_probability:
                highest_probability = class_probabilities[i]
                class_index = i
        predicted_classes.append(possible_classes[class_index])
        
    return predicted_classes # Return a list of predicted classes

In [7]:
# Evaluates the performance of the predictor model
# The metric/s evaluated are as follows: Accuracy
def evaluate(predictions, dataset):
    tries = 0
    correct = 0
    
    for i in range(len(predictions)):
        if predictions[i] == dataset[i][-1]:
            correct += 1
        tries += 1
        
    print("Correct:", correct, "out of", tries)
    print("Accuracy Rate (%): ", correct / tries * 100)
    print("-----------------------------------")
    return


# for i in missing_value_files:
#     print(i)
#     evaluate(predict(train(preprocess(i), "?"), preprocess(i)), preprocess(i))


In [24]:
# This function should calculate the Information Gain of an attribute or a set of attribute, with respect to the class
def info_gain(model):
    mean_info_attributes = []
    
    # Calculate Entropy of Root Node, a.k.a class distribution entropy
    print(model)
    root_entropy = 0
    for class_name in model[0]:
        pr_attribute = model[0][class_name] * 1.0 / sum(model[0].values())
        root_entropy -= pr_attribute * math.log2(pr_attribute)
    print("Root Entropy:", root_entropy)
    
    # Traverse each attribute in model
    i = 0
    for attribute_index in range(len(model[1])):
        mean_info = 0
        
        
        # Calculate entropy of each attribute's value for each class
        for attribute in model[1][attribute_index]:
            print(model[1][attribute_index])
            print("Rand", i)
            entropy = 0
            attribute_freq = []
            # Append each class' attribute's frequency
            for class_index in model[0].keys():
                if attribute in model[2][class_index][attribute_index]:
                    attribute_freq.append(model[2][class_index][attribute_index][attribute])
            
            # Calculate entropy and add to mean_info
            if len(attribute_freq) == 1:
                continue
            else:
                for element in attribute_freq:
                    probability = element * 1.0 / sum(attribute_freq)
                    entropy -= probability * math.log2(probability)
                    
            mean_info = entropy * model[1][attribute_index][attribute] / sum(model[1][attribute_index].values())

            print("Attribute Frequencies:", attribute_freq)
            print("Attribute:", attribute, "with entropy:", entropy)
            print("Mean Info:", mean_info)
        i += 1
            
            # Add the entropy * probability to the mean_info
        
        # Find Entropy of each attribute's value

    
    # Calculate Information Gain of an attribute given the root node
    # In other words, which attribute is best to split the instances
    return

info_gain(train(preprocess(new_file), "?"))


({'cold': 2, 'flu': 3}, [{'no': 1, 'mild': 2, 'severe': 2}, {'severe': 2, 'no': 1, 'mild': 2}, {'normal': 4, 'high': 1}, {'yes': 4, 'no': 1}], {'cold': [{'no': 1, 'mild': 1}, {'severe': 1, 'no': 1}, {'normal': 2}, {'yes': 1, 'no': 1}], 'flu': [{'severe': 2, 'mild': 1}, {'mild': 2, 'severe': 1}, {'high': 1, 'normal': 2}, {'yes': 3}]})
Root Entropy: 0.9709505944546686
{'no': 1, 'mild': 2, 'severe': 2}
Rand 0
{'no': 1, 'mild': 2, 'severe': 2}
Rand 0
Attribute Frequencies: [1, 1]
Attribute: mild with entropy: 1.0
Mean Info: 0.4
{'no': 1, 'mild': 2, 'severe': 2}
Rand 0
{'severe': 2, 'no': 1, 'mild': 2}
Rand 1
Attribute Frequencies: [1, 1]
Attribute: severe with entropy: 1.0
Mean Info: 0.4
{'severe': 2, 'no': 1, 'mild': 2}
Rand 1
{'severe': 2, 'no': 1, 'mild': 2}
Rand 1
{'normal': 4, 'high': 1}
Rand 2
Attribute Frequencies: [2, 2]
Attribute: normal with entropy: 1.0
Mean Info: 0.8
{'normal': 4, 'high': 1}
Rand 2
{'yes': 4, 'no': 1}
Rand 3
Attribute Frequencies: [1, 3]
Attribute: yes with ent

In [8]:
# # This function should calculate the Information Gain of an attribute or a set of attribute, with respect to the class
# def info_gain(model):
#     attribute_info_gain = []
#     # Calculate the Information Gain of each attribute
#     for attribute_num in range(len(model[2][0])):
#         attribute_totals = []
#         total_value = {}
        
#         # Find all the unique values for a given attribute
#         for class_index in range(len(model[0])):
#             data = model[2][class_index][attribute_num]
            
#             # Adds the class' attribute dictionary to the cumulative dictionary
#             total_value = {x: total_value.get(x, 0) + data.get(x, 0) for x in set(total_value).union(data)}

#         # Delete the 'missing values' totals
#         if "?" in total_value:
#             del total_value["?"]
#         unique_values = set(total_value.keys())
            
#         mean_info = 0
# #         print("---------------")
# #         print(total_value)
# #         print(sum(total_value.values()))
# #         print("---------------")
        
#         attribute_entropy = 0
#         for i in total_value:
#             attribute_probability = total_value[i] * 1.0 / sum(total_value.values())
#             attribute_entropy -= attribute_probability * math.log2(attribute_probability)
        
#         # Calculate entropy for all different values within an attribute
#         for value in unique_values:
#             entropy = 0
#             for class_index in range(len(model[0])):
#                 data = model[2][class_index][attribute_num]
# #                 print(data)
#                 if value in data:
#                     print("---", data[value], "---", total_value[value])
#                     data_probability = data[value] / total_value[value]
#                     entropy -= (data_probability * math.log2(data_probability))
#                     print("Entropy ---", entropy, "---")
#             print("Final Entropy:", entropy)
#             mean_info += total_value[value] * 1.0 / sum(total_value.values()) * entropy
            
# #         print("Mean Info", mean_info)
# #         print("Attribute Entropy", attribute_entropy)
# # #         print("---------------")
#         print("ATTR", attribute_entropy - mean_info)
#         # At this point: Found the Mean Info for an attribute
        
#         # Find entropy of the attribute itself
        

#     return attribute_info_gain
# info_gain(train(preprocess(missing_value_files[0]), "?"))