
### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2020 Semester 1

## Assignment 1: Naive Bayes Classifiers

###### Submission deadline: 7 pm, Monday 20 Apr 2020

**Student Name(s):**    `Pengyu Mu, Ziyuan Xiao`

**Student ID(s):**     `890756, 940448`


This iPython notebook is a template which you will use for your Assignment 1 submission.

Marking will be applied on the four functions that are defined in this notebook, and to your responses to the questions at the end of this notebook (Submitted in a separate PDF file).

**NOTE: YOU SHOULD ADD YOUR RESULTS, DIAGRAMS AND IMAGES FROM YOUR OBSERVATIONS IN THIS FILE TO YOUR REPORT (the PDF file).**

You may change the prototypes of these functions, and you may write other functions, according to your requirements. We would appreciate it if the required functions were prominent/easy to find.

**Adding proper comments to your code is MANDATORY. **

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from copy import copy, deepcopy

In [2]:
#  Read all datasets 
adult = pd.read_csv("datasets/adult.data", header = None)
bank = pd.read_csv("datasets/bank.data", header = None)
breast_cancer_wisconsin = pd.read_csv("datasets/breast-cancer-wisconsin.data", header = None)
car = pd.read_csv("datasets/car.data", header = None)
lymphography = pd.read_csv("datasets/lymphography.data", header = None)
mushroom = pd.read_csv("datasets/mushroom.data", header = None)
nursery = pd.read_csv("datasets/nursery.data", header = None)
somerville = pd.read_csv("datasets/somerville.data", header = None)
university = pd.read_csv("datasets/university.data", header = None)
wdbc = pd.read_csv("datasets/wdbc.data", header = None)
wine = pd.read_csv("datasets/wine.data", header = None)

In [3]:
#   Helper function that delete all missing values
def delete_missing_value(raw_dataset, missing_values):
    rows = set(raw_dataset[raw_dataset.values == missing_values].index)
    data = raw_dataset.drop(index = rows)
    return data

In [4]:
"""
preprocess dataset incluse delete all missing values and delete ID column from dataset
"""

def preprocess(df, missing_values, id_column = None):
    train = delete_missing_value(df, missing_values)
    if id_column != None:
        train = train.drop(columns = [id_column])
    return train

In [5]:
'''
    Global Variables that help code run take adult dataset as an example
    index column that need to be remove
    Assign None to id_column means no id column
    Edit if change dataset
'''
id_column = None
# Missing Value that should be delete
missing_values = '?'
'''
preprocessing data 
1. delete missing value 

2. delete id column
'''
train_data = preprocess(adult, missing_values,id_column)

'''
    datasets_types can be 
    NOMINAL: NOMINAL ATTRIBUTES DATASETS, 
    NUMERIC: NUMERIC ATTRIBUTES DATASETS, 
    ORDINAL: ORDINAL ATTRIBUTES DATASETS, 
    MIX: DATASETS WITH A MIX OF ATTRIBUTE TYPES
'''

# Enter type of datasets

datasets_types = "MIX"

'''
    0 represents nomianl, 1 represents ordinal, 2 represents numeric
    Only works when dataset_types is MIX
'''
feature_types = [2, 0, 2, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 0]
'''
    row of class index
    for example: adult Class: last column (15) 
    class_column should be 14
'''
class_column = 14

In [6]:
#value caculate from global variable 

#use epsilon smooth
EPSILON = 1/(2*train_data.shape[0])

# Seperate the label and data
label = train_data[class_column]
train_set = train_data.drop(columns=class_column)
#list of all features columns name
feature_columns = list(train_set.columns)

# Show some useful information about the data
print("Data Shape =====>", train_data.shape)
print("======================================================")
print("EPSILON ========>", EPSILON)
print("======================================================")
print("Some Data Examples:")
display(train_data.head(3))
print("======================================================")
print("Data Type:")
display(train_data.dtypes)

Data Shape =====> (30162, 15)
Some Data Examples:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


Data Type:


0      int64
1     object
2      int64
3     object
4      int64
5     object
6     object
7     object
8     object
9     object
10     int64
11     int64
12     int64
13    object
14    object
dtype: object

In [7]:
#calculate prior
def find_nominal_prob(class_column):
    prior = {}
    total_number = len(class_column)
    classes = class_column.value_counts()
    for key in classes.keys():
        prior[key] = np.log2(classes[key]/total_number)
    return prior

In [8]:
# Calculate prior
print(find_nominal_prob(train_data[class_column]))

{'<=50K': -0.4129662865089241, '>50K': -2.0062315395856487}


In [9]:
def find_nominal_llh(attributes, llh_dict, data, feature_row):
    #calculate nominal llh save llh as a 2d dictionary
    all_feature = data[feature_row].value_counts()
    for attribute in attributes:
        feature_dict = {feature_row:{}}
        subset = data[data[class_column] == attribute][feature_row]
        feature_dict[feature_row] = find_nominal_prob(subset)
        for check_feature in all_feature.keys():
            if check_feature not in feature_dict[feature_row].keys():
                feature_dict[feature_row][check_feature] = np.log2(EPSILON)
        if(llh_dict[attribute]):
            llh_dict[attribute].update(feature_dict)
        else:
            llh_dict[attribute] = feature_dict
            
    return llh_dict

In [10]:
def find_numerical_llh(attributes, llh_dict, data, feature):
#  Save mean and Std as llh
    for attribute in attributes:
        feature_dict = {feature:{}}
        subset = subset = data[data[class_column] == attribute][feature]
        mean = np.mean(subset)
        std = np.std(subset)
        feature_dict[feature]["mean"] = mean
        feature_dict[feature]["std"] = std
        if(llh_dict[attribute]):
            llh_dict[attribute].update(feature_dict)
        else:
            llh_dict[attribute] = feature_dict
    return llh_dict

In [11]:
#  This function should calculat prior probabilities and likelihoods from the training data
#  and using them to build a naive Bayes model

def train(data):
    prior = find_nominal_prob(data[class_column])
    llh_dict = dict.fromkeys(prior)
    for i in feature_columns:
        if(datasets_types == "NUMERIC" or (datasets_types == "MIX" and feature_types[i] == 2)):
            find_numerical_llh(prior, llh_dict, data, i)
        else:
            find_nominal_llh(prior, llh_dict, data, i)
            
    return prior, llh_dict

prior, llh_dict = train(train_data)


In [12]:
'''
This function should predict classes for new items in
a test dataset(for the purposes of this assignment, you
can re-use the training data as a test set)
'''

def predict(instance, prior, llh_dict):
    all_prob = {}
    for attribute in prior.keys():
        all_prob[attribute] = prior[attribute]
        for i in feature_columns:
            prob = 0
            if(datasets_types == "NUMERIC" or (datasets_types == "MIX" and feature_types[i] == 2)):
                mean = max(llh_dict[attribute][i]["mean"], 1 * 10 ** -8)
                std = max(llh_dict[attribute][i]["std"], 1 * 10 ** -8)
                prob = stats.norm.pdf(x=instance[i], loc=mean, scale=std)
                if prob > 0.0:
                    prob = np.log2(prob)
                else:
                    prob = np.log2(1 * 10 ** -8)
            else:
                prob = llh_dict[attribute][i][instance[i]]
            all_prob[attribute] += prob
            
    max_prob = -10000
    max_key = ""
    
    for prob in all_prob.keys():
        if(all_prob[prob] > max_prob):
            max_prob = all_prob[prob]
            max_key = prob
    return max_key

In [13]:
# An example of the prediction working
predict(train_data.iloc[0], prior, llh_dict)

'<=50K'

In [14]:
''' 
This function should evaliate the prediction performance by comparing your
model’s class outputs to ground truth labels
'''
def evaluate(data, prior, llh_dict, interesting_class = None):
    predict_list = []
    correct = 0
    total = data.shape[0]
    result = {}
    TP = 0
    FN = 0
    FP = 0
    TN = 0
    # make confusion matrix
    for index, row in data.iterrows():
        actual_class = row[class_column]
        predict_class = predict(row, prior, llh_dict)
        if (actual_class == predict_class):
            correct += 1
            if interesting_class:
                if(interesting_class == row[class_column]):
                    TP += 1
                elif(interesting_class != row[class_column]):
                    TN += 1
        else:
            if interesting_class:
                if(interesting_class == row[class_column]):
                    FP += 1
                elif(interesting_class != row[class_column]):
                    FN += 1
                    
    correct_rate = correct/total
    result["correct_rate"] = correct_rate
    if (interesting_class):
        precision = TP/(TP + FP)
        recall = TP/(TP + FN)
        result["precision"] = precision
        result["recall"] = recall
    return result

In [15]:
''' 
A helper fuction for later use
This function could quickly calculate the result
For this assignment dafult train_data is test_data 
'''
def calc_outcome(train_data, test_data = None, interesting_class = None):
    data_train_prior, data_llh_dict = train(train_data)
    if test_data is None:
        test_data = train_data
    data_eval = evaluate(test_data, data_train_prior,data_llh_dict, interesting_class)
    return data_eval

In [16]:
# An example of using calc_outcome set interesting class as "<=50K"
print("======================================================")
print("Overall Accuracy is: ")
print(calc_outcome(train_data, interesting_class = "<=50K"))

Overall Accuracy is: 
{'correct_rate': 0.821928254094556, 'precision': 0.9315794120243666, 'recall': 0.8467001003009027}


## Questions 


If you are in a group of 1, you will respond to question (1), and **one** other of your choosing (two responses in total).

If you are in a group of 2, you will respond to question (1) and question (2), and **two** others of your choosing (four responses in total). 

A response to a question should take about 100–250 words, and make reference to the data wherever possible.

#### NOTE: you may develope codes or functions in respond to the question, but your formal answer should be added to a separate file.

### Q1
Try discretising the numeric attributes in these datasets and treating them as discrete variables in the na¨ıve Bayes classifier. You can use a discretisation method of your choice and group the numeric values into any number of levels (but around 3 to 5 levels would probably be a good starting point). Does discretizing the variables improve classification performance, compared to the Gaussian na¨ıve Bayes approach? Why or why not?

In [17]:
'''
Use adult dataset for this question
index column that need to be remove
Assign None to id_column means no id column
'''
id_column = None

#  Enter dataset here
missing_values = '?'
train_data = preprocess(adult, missing_values,id_column)
'''
    datasets_types can be 
    NOMINAL: NOMINAL ATTRIBUTES DATASETS, 
    NUMERIC: NUMERIC ATTRIBUTES DATASETS, 
    ORDINAL: ORDINAL ATTRIBUTES DATASETS, 
    MIX: DATASETS WITH A MIX OF ATTRIBUTE TYPES
'''

# Enter type of datasets
datasets_types = "MIX"

# 0 represents nomianl, 1 represents ordinal, 2 represents numeric
# Only works when dataset_types is MIX
feature_types = [2, 0, 2, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 0]
class_column = 14

EPSILON = 1/(2*train_data.shape[0])

# divide class label and data
label = train_data[class_column]
train_set = train_data.drop(columns=class_column)

feature_columns = list(train_set.columns)


def discretise(data,levels,by):
    '''Given a data set and its corresponding data type,
    All its features must be discretized into discrete features，
    Takes an integer, then determine the total number of discrete feature levels'''

    copy_data = data.copy()
    for i in feature_columns:
        '''Find out all the Numeric Data, and then form a new DataFrame
           Note: 2 means numeric data type'''
        if datasets_types == "NUMERIC" or (datasets_types == "MIX" and feature_types[i] == 2): 
            feature = data[i]
            copy_feature = feature.copy()
            
            # Equal Width
            if by =='width':
                '''
                After finding maximum and minimux,
                calculate the width of each level
                '''
                maximum = np.max(feature)
                minimum = np.min(feature)
                width = (maximum-minimum)/levels
                # Assign the corresponding level label to the corresponding instance 
                for j in range(levels):
                    copy_feature[feature<=(maximum-width*j)] = ('category'+str(levels-j))
           
            # Equal Frequency
            else:
                # Calculate the frequency of each level
                frequency = feature.shape[0]/levels 
                for m in range(levels):
                    # find out the lower bound of index
                    lower = int(frequency*m)
                    # find out the upper bound of index
                    upper = int(frequency*(m+1))
                    
                    # Assign the corresponding level label to the corresponding instance 
                    copy_feature[feature.sort_values()[lower:upper].index] = ('category'+ str(levels-m))
            copy_data[i] = copy_feature
    return copy_data

In [18]:
width_data_level_5 = discretise(train_data,levels=5,by='width')
freq_data_level_5 = discretise(train_data,levels=5,by='freq')

width_data_level_10 = discretise(train_data,levels=10,by='width')
freq_data_level_10 = discretise(train_data,levels=10,by='freq')
print("======================================================")
print("Equal Width:")
display(width_data_level_5.head(3))
print("======================================================")
print("Equal Frequency:")
display(freq_data_level_5.head(3))

Equal Width:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,category2,State-gov,category1,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,category1,category1,category2,United-States,<=50K
1,category3,Self-emp-not-inc,category1,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,category1,category1,category1,United-States,<=50K
2,category2,Private,category1,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,category1,category1,category2,United-States,<=50K


Equal Frequency:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,category3,State-gov,category5,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,category1,category5,category4,United-States,<=50K
1,category2,Self-emp-not-inc,category5,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,category2,category2,category5,United-States,<=50K
2,category3,Private,category2,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,category2,category2,category2,United-States,<=50K


In [19]:
datasets_types = "NOMINAL"
print("Use width discretise with 5 levels")
print(calc_outcome(width_data_level_5))
print("Use width discretise with 10 levels")
print(calc_outcome(width_data_level_10))

Use width discretise with 5 levels
{'correct_rate': 0.8130760559644586}
Use width discretise with 10 levels
{'correct_rate': 0.8181818181818182}


In [20]:
datasets_types = "NOMINAL"
print("Use frequency discretise with 5 levels")
print(calc_outcome(freq_data_level_5))
print("Use frequency discretise with 10 levels")
print(calc_outcome(freq_data_level_10))

Use frequency discretise with 5 levels
{'correct_rate': 0.810257940454877}
Use frequency discretise with 10 levels
{'correct_rate': 0.8137059876666004}


### Q2
Implement a baseline model (e.g., random or 0R) and compare the performance of the na¨ıve Bayes classifier to this baseline on multiple datasets. Discuss why the baseline performance varies across datasets, and to what extent the na¨ıve Bayes classifier improves on the baseline performance.

In [21]:
def zero_r_evaluate(data,label_index):
    temp_data = data.copy()
    # Find out the most class label
    zero_r_result = temp_data[label_index].value_counts().idxmax()
    #Convert to List data type to avoid error
    series_of_class = temp_data[label_index]
    list_of_class = series_of_class.values.tolist()
    
    correct = 0
    total = data.shape[0]

    for i in range(len(list_of_class)):
        #Calculate how many labels of the most type
        if zero_r_result == list_of_class[i]:
            correct += 1
    correct_rate = correct / total
    
    return correct_rate

In [22]:
print("Some 0R baseline model for varius dataset:")

print("======================================================")

print("0R baseline model performance for brest dataset:")
brest_0R_data = preprocess(breast_cancer_wisconsin,"?",0) 
print(zero_r_evaluate(brest_0R_data,10))
print("======================================================")

print("0R baseline model performance for mushroom dataset:")
mushroom_0R_data = preprocess(mushroom,"?") 
print(zero_r_evaluate(mushroom_0R_data,0))
print("======================================================")

print("0R baseline model performance for lymphography dataset:")
lymphography_0R_data = preprocess(lymphography,None) 
print(zero_r_evaluate(lymphography_0R_data,0))
print("======================================================")

print("0R baseline model performance for wdbc dataset:")
wdbc_0R_data = preprocess(wdbc,"?",0) 
print(zero_r_evaluate(wdbc_0R_data,1))
print("======================================================")

print("0R baseline model performance for wine dataset:")
wine_0R_data = preprocess(wine,None) 
print(zero_r_evaluate(wine_0R_data,0))
print("======================================================")

print("0R baseline model performance for car dataset:")
car_0R_data = preprocess(car,None) 
print(zero_r_evaluate(car_0R_data,6))
print("======================================================")

print("0R baseline model performance for nursery dataset:")
nursery_0R_data = preprocess(nursery,None) 
print(zero_r_evaluate(nursery_0R_data,8))
print("======================================================")

print("0R baseline model performance for somerville dataset:")
somerville_0R_data = preprocess(somerville,None) 
print(zero_r_evaluate(somerville_0R_data,0))
print("======================================================")

print("0R baseline model performance for adult dataset:")
adult_0R_data = preprocess(adult,'?') 
print(zero_r_evaluate(adult_0R_data,14))
print("======================================================")

print("0R baseline model performance for bank dataset:")
bank_0R_data = preprocess(bank,None) 
print(zero_r_evaluate(bank_0R_data,14))
print("======================================================")

Some 0R baseline model for varius dataset:
0R baseline model performance for brest dataset:
0.6500732064421669
0R baseline model performance for mushroom dataset:
0.6180014174344437
0R baseline model performance for lymphography dataset:
0.5472972972972973
0R baseline model performance for wdbc dataset:
0.6274165202108963
0R baseline model performance for wine dataset:
0.398876404494382
0R baseline model performance for car dataset:
0.6998264893001735
0R baseline model performance for nursery dataset:
0.3333333333333333
0R baseline model performance for somerville dataset:
0.5384615384615384
0R baseline model performance for adult dataset:
0.7510775147536636
0R baseline model performance for bank dataset:
0.8830151954170445


### Q3
Since it’s difficult to model the probabilities of ordinal data, ordinal attributes are often treated as either nominal variables or numeric variables. Compare these strategies on the ordinal datasets provided. Deterimine which approach gives higher classification accuracy and discuss why.

In [23]:
'''
learn from :https://stackoverflow.com/questions/21818886/changing-ordinal-character-data-to-numeric-data-with-pandas
use dicttionary to convert ordinal value to numeric value
'''
car_origin = pd.read_csv("datasets/car.data", header = None)
car = car_origin.copy()
buying_conv_dict= {'low':0,'med':1,'high':2,'vhigh':3}
car[0]=car[0].apply(buying_conv_dict.get)
maint_conv_dict = {'low':0,'med':1,'high':2,'vhigh':3}
car[1]=car[1].apply(maint_conv_dict.get)
doors_conv_dict = {'2':1, '3':2, '4':3, '5more': 4}
car[2]=car[2].apply(doors_conv_dict.get)
persons_conv_dict = {'2':0, '4':1, 'more':2}
car[3]=car[3].apply(persons_conv_dict.get)
lug_boot_conv_dict = {'small':0,'med':1, 'big':2}
car[4]=car[4].apply(lug_boot_conv_dict.get)
safety_conv_dict = {'low':0 , 'med':1, 'high':2}
car[5]=car[5].apply(safety_conv_dict.get)
class_conv_dict = {'unacc':0 , 'acc':1, 'good':2, 'vgood':3}
car[6]=car[6].apply(class_conv_dict.get)

#use nominal to calculate
datasets_types = "NOMINAL"
class_column = 6
label = car_origin.iloc[:,class_column]
train_set = car_origin.drop(columns=class_column)
feature_columns = list(train_set.columns)
print("when datasets_types is NOMINAL: ")
print(calc_outcome(car_origin))

#use numercial to calculate 
datasets_types = "NUMERIC"
class_column = 6
print("when datasets_types is NUMERIC: ")
print(calc_outcome(car))


when datasets_types is NOMINAL: 
{'correct_rate': 0.8739155581260845}
when datasets_types is NUMERIC: 
{'correct_rate': 0.6350491613649508}


In [24]:
nursery.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority
5,usual,proper,complete,1,convenient,convenient,slightly_prob,not_recom,not_recom
6,usual,proper,complete,1,convenient,convenient,problematic,recommended,priority
7,usual,proper,complete,1,convenient,convenient,problematic,priority,priority
8,usual,proper,complete,1,convenient,convenient,problematic,not_recom,not_recom
9,usual,proper,complete,1,convenient,inconv,nonprob,recommended,very_recom


In [25]:
nursery_origin = pd.read_csv("datasets/nursery.data", header = None)

nursery_numeric = nursery_origin.copy()

parents_dict = {'usual':0, 'pretentious':1, 'great_pret':2}
nursery_numeric[0]=nursery_numeric[0].apply(parents_dict.get)
has_nurs_dict ={'proper':0, 'less_proper':1, 'improper':2, 'critical':3, 'very_crit':4}
nursery_numeric[1]=nursery_numeric[1].apply(has_nurs_dict.get)
form_dict = {'complete':0, 'completed':1, 'incomplete':2, 'foster':3}
nursery_numeric[2]=nursery_numeric[2].apply(form_dict.get)
children_dict ={'1':0, '2':1, '3':2, 'more':3}
nursery_numeric[3]=nursery_numeric[3].apply(children_dict.get)
housing_dict = {'convenient':0, 'less_conv':1, 'critical':2}
nursery_numeric[4]=nursery_numeric[4].apply(housing_dict.get)
finance_dict = {'convenient':0, 'inconv':1}
nursery_numeric[5]=nursery_numeric[5].apply(finance_dict.get)
social_dict = {'nonprob':0, 'slightly_prob':1, 'problematic':2}
nursery_numeric[6]=nursery_numeric[6].apply(social_dict.get)
health_dict = {'recommended':0, 'priority':1, 'not_recom':2}
nursery_numeric[7]=nursery_numeric[7].apply(health_dict.get)
class_dict = {'not_recom':0, 'recommend':1, 'very_recom':2, 'priority':3, 'spec_prior':4}
nursery_numeric[8]=nursery_numeric[8].apply(class_dict.get)

#nominal 
datasets_types = "NOMINAL"
class_column = 8
label = nursery_origin.iloc[:,class_column]
train_set = nursery_origin.drop(columns=class_column)
feature_columns = list(train_set.columns)
print("when datasets_types is NOMINAL: ")
print(calc_outcome(nursery_origin))

#numercial 
datasets_types = "NUMERIC"
class_column = 8
print("when datasets_types is NUMERIC: ")
print(calc_outcome(nursery_numeric))

when datasets_types is NOMINAL: 
{'correct_rate': 0.9030864197530865}
when datasets_types is NUMERIC: 
{'correct_rate': 0.6327932098765432}


### Q4
Evaluating the model on the same data that we use to train the model is considered to be a major mistake in Machine Learning. Implement a hold–out or cross–validation evaluation strategy (you should implement this yourself and do not simply call existing implementations from `scikit-learn`). How does your estimate of effectiveness change, compared to testing on the training data? Explain why. (The result might surprise you!)

In [26]:
#hold_out
def separate_data(data):
    X_train = data.sample(frac = 0.8, random_state = 123)
    X_test  = data.drop(index = X_train.index)
    return X_train,X_test

Use different datasets to test

In [38]:
datasets_types = "NOMINAL"
id_column = 0
class_column = 10
missing_values = '?'
train_data = preprocess(breast_cancer_wisconsin, missing_values,id_column)
# Seperate the label and data
label = train_data[class_column]
train_set = train_data.drop(columns=class_column)
feature_columns = list(train_set.columns)
EPSILON = 1/(2*train_data.shape[0])

brest_train,brest_test = separate_data(train_data)
print("Accuracy for 80% brest dataset(after hold out)")
print(calc_outcome(brest_train))
print("Accuracy for 20% brest test dataset(after hold out)")
print(calc_outcome(brest_train, test_data = brest_test))
print("======================================================")

Accuracy for 80% brest dataset(after hold out)
{'correct_rate': 0.978021978021978}
Accuracy for 20% brest test dataset(after hold out)
{'correct_rate': 0.9708029197080292}


In [39]:
datasets_types = "NOMINAL"
id_column = None
class_column = 0
missing_values = '?'
train_data = preprocess(mushroom, missing_values,id_column)
# Seperate the label and data
label = train_data[class_column]
train_set = train_data.drop(columns=class_column)
feature_columns = list(train_set.columns)
EPSILON = 1/(2*train_data.shape[0])

mushroom_train,mushroom_test = separate_data(train_data,)
print("Accuracy for 80% mushroom dataset (after hold out):")
print(calc_outcome(mushroom_train))
print("Accuracy for 20% mushroom test dataset(after hold out)")
print(calc_outcome(mushroom_train, test_data = mushroom_test))
print("======================================================")

Accuracy for 80% mushroom dataset (after hold out):
{'correct_rate': 0.9920265780730897}
Accuracy for 20% mushroom test dataset(after hold out)
{'correct_rate': 0.9911426040744021}


In [40]:
datasets_types = "NOMINAL"
id_column = None
class_column = 0
missing_values = None
train_data = preprocess(lymphography, missing_values,id_column)
# Seperate the label and data
label = train_data[class_column]
train_set = train_data.drop(columns=class_column)
feature_columns = list(train_set.columns)
EPSILON = 1/(2*train_data.shape[0])

lymphography_train,lymphography_test = separate_data(train_data)
print("Accuracy for 80% lymphography dataset (after hold out):")
print(calc_outcome(lymphography_train))
print("Accuracy for 20% lymphography test dataset (after hold out):")
print(calc_outcome(lymphography_train, test_data = lymphography_test))
print("======================================================")

Accuracy for 80% lymphography dataset (after hold out):
{'correct_rate': 0.8898305084745762}
Accuracy for 20% lymphography test dataset (after hold out):
{'correct_rate': 0.8333333333333334}


In [41]:
datasets_types = "NUMERIC"
id_column = 0
class_column = 1
missing_values = None
train_data = preprocess(wdbc, missing_values,id_column)
# Seperate the label and data
label = train_data[class_column]
train_set = train_data.drop(columns=class_column)
feature_columns = list(train_set.columns)
EPSILON = 1/(2*train_data.shape[0])

wdbc_train,wdbc_test = separate_data(train_data)
print("Accuracy for 80% wdbc dataset (after hold out):")
print(calc_outcome(wdbc_train))
print("Accuracy for 20% wdbc test dataset (after hold out):")
print(calc_outcome(wdbc_train, test_data = wdbc_test))
print("======================================================")

Accuracy for 80% wdbc dataset (after hold out):
{'correct_rate': 0.9384615384615385}
Accuracy for 20% wdbc test dataset (after hold out):
{'correct_rate': 0.9210526315789473}


In [42]:
datasets_types = "NUMERIC"
id_column = None
class_column = 0
missing_values = None
train_data = preprocess(wine, missing_values,id_column)
# Seperate the label and data
label = train_data[class_column]
train_set = train_data.drop(columns=class_column)
feature_columns = list(train_set.columns)
EPSILON = 1/(2*train_data.shape[0])

wine_train,wine_test = separate_data(train_data)
print("Accuracy for 80% wine dataset (after hold out):")
print(calc_outcome(wine_train))
print("Accuracy for 20% wine test (after hold out):")
print(calc_outcome(wine_train, test_data = wine_test))
print("======================================================")

Accuracy for 80% wine dataset (after hold out):
{'correct_rate': 0.9929577464788732}
Accuracy for 20% wine test (after hold out):
{'correct_rate': 0.9722222222222222}


In [43]:
datasets_types = "ORDINAL"
id_column = None
class_column = 6
missing_values = None
train_data = preprocess(car, missing_values,id_column)
# Seperate the label and data
label = train_data[class_column]
train_set = train_data.drop(columns=class_column)
feature_columns = list(train_set.columns)
EPSILON = 1/(2*train_data.shape[0])

car_train,car_test = separate_data(train_data)
print("Accuracy for 80% car dataset(after hold out)")
print(calc_outcome(car_train))
print("Accuracy for 20% car test dataset(after hold out)")
print(calc_outcome(car_train, test_data = car_test))
print("======================================================")

Accuracy for 80% car dataset(after hold out)
{'correct_rate': 0.8705712219812003}
Accuracy for 20% car test dataset(after hold out)
{'correct_rate': 0.8583815028901735}


In [44]:
datasets_types = "ORDINAL"
id_column = None
class_column = 8
missing_values = None
train_data = preprocess(nursery, missing_values,id_column)
# Seperate the label and data
label = train_data[class_column]
train_set = train_data.drop(columns=class_column)
feature_columns = list(train_set.columns)
EPSILON = 1/(2*train_data.shape[0])

nursery_train,nursery_test = separate_data(train_data)
print("Accuracy for 80% nursery dataset(after hold out)")
print(calc_outcome(nursery_train))
print("Accuracy for 20% nursery test dataset(after hold out)")
print(calc_outcome(nursery_train, test_data = nursery_test))
print("======================================================")

Accuracy for 80% nursery dataset(after hold out)
{'correct_rate': 0.9037422839506173}
Accuracy for 20% nursery test dataset(after hold out)
{'correct_rate': 0.8985339506172839}


In [45]:
datasets_types = "ORDINAL"
id_column = None
class_column = 0
missing_values = None
train_data = preprocess(somerville, missing_values,id_column)
# Seperate the label and data
label = train_data[class_column]
train_set = train_data.drop(columns=class_column)
feature_columns = list(train_set.columns)
EPSILON = 1/(2*train_data.shape[0])

somerville_train,somerville_test = separate_data(train_data)
print("Accuracy for 80% somerville dataset(after hold out)")
print(calc_outcome(somerville_train))
print("Accuracy for 20% somerville test dataset(after hold out)")
print(calc_outcome(somerville_train, test_data = somerville_test))
print("======================================================")

Accuracy for 80% somerville dataset(after hold out)
{'correct_rate': 0.7017543859649122}
Accuracy for 20% somerville test dataset(after hold out)
{'correct_rate': 0.5172413793103449}


In [None]:
datasets_types = "MIX"
id_column = None
class_column = 14
missing_values = '?'
train_data = preprocess(adult, missing_values,id_column)
feature_types = [2, 0, 2, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 0]
# Seperate the label and data
label = train_data[class_column]
train_set = train_data.drop(columns=class_column)
feature_columns = list(train_set.columns)
EPSILON = 1/(2*train_data.shape[0])

adult_train,adult_test = separate_data(train_data)
print("Accuracy for 80% adult dataset(after hold out)")
print(calc_outcome(adult_train))
print("Accuracy for 20% adult test dataset(after hold out)")
print(calc_outcome(adult_train, test_data = adult_test))
print("======================================================")

Accuracy for 80% adult dataset(after hold out)


In [None]:
datasets_types = "MIX"
id_column = None
class_column = 14
missing_values = None
train_data = preprocess(bank, missing_values,id_column)
feature_types = [2, 0, 0, 1, 0, 2, 0, 0, 0, 2, 2, 2, 2, 0]
# Seperate the label and data
label = train_data[class_column]
train_set = train_data.drop(columns=class_column)
feature_columns = list(train_set.columns)
EPSILON = 1/(2*train_data.shape[0])

bank_train,bank_test = separate_data(train_data)
print("Accuracy for 80% bank dataset(after hold out)")
print(calc_outcome(bank_train))
print("Accuracy for 20% bank test dataset(after hold out)")
print(calc_outcome(bank_train, test_data = bank_test))
print("======================================================")