
### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2020 Semester 1

## Assignment 1: Naive Bayes Classifiers

###### Submission deadline: 7 pm, Monday 20 Apr 2020

**Student Name(s):**    `Pengyu Mu, Ziyuan Xiao`

**Student ID(s):**     `890756, 940448`


This iPython notebook is a template which you will use for your Assignment 1 submission.

Marking will be applied on the four functions that are defined in this notebook, and to your responses to the questions at the end of this notebook (Submitted in a separate PDF file).

**NOTE: YOU SHOULD ADD YOUR RESULTS, DIAGRAMS AND IMAGES FROM YOUR OBSERVATIONS IN THIS FILE TO YOUR REPORT (the PDF file).**

You may change the prototypes of these functions, and you may write other functions, according to your requirements. We would appreciate it if the required functions were prominent/easy to find.

**Adding proper comments to your code is MANDATORY.**

In [1]:
import pandas as pd
import numpy as np
import math
from scipy import stats
from copy import copy, deepcopy

In [2]:
# Load every dataset used in this notebook.
# The files are read with header=None, so the columns are labelled with
# integers (0, 1, 2, ...) instead of names.
adult = pd.read_csv("datasets/adult.data", header = None)
bank = pd.read_csv("datasets/bank.data", header = None)
breast_cancer_wisconsin = pd.read_csv("datasets/breast-cancer-wisconsin.data", header = None)
car = pd.read_csv("datasets/car.data", header = None)
lymphography = pd.read_csv("datasets/lymphography.data", header = None)
mushroom = pd.read_csv("datasets/mushroom.data", header = None)
nursery = pd.read_csv("datasets/nursery.data", header = None)
somerville = pd.read_csv("datasets/somerville.data", header = None)
university = pd.read_csv("datasets/university.data", header = None)
wdbc = pd.read_csv("datasets/wdbc.data", header = None)
wine = pd.read_csv("datasets/wine.data", header = None)

In [3]:
def delete_missing_value(raw_dataset, missing_values):
    """Return a copy of ``raw_dataset`` without the rows that contain the missing marker.

    Args:
        raw_dataset: DataFrame to clean; it is not modified.
        missing_values: Marker that denotes a missing entry (e.g. ``"?"``), or
            ``None`` when the dataset has no such marker.

    Returns:
        A new DataFrame holding only the rows in which no cell equals
        ``missing_values``.
    """
    # No marker configured: nothing to filter, so skip the cell-by-cell
    # comparison against None altogether.
    if missing_values is None:
        return raw_dataset.copy()
    # One boolean per row (True when any cell equals the marker) instead of
    # indexing the frame with a 2-D boolean array.
    has_missing = raw_dataset.eq(missing_values).any(axis=1)
    return raw_dataset.loc[~has_missing].copy()

In [4]:
def preprocess(df, missing_values, id_column = None):
    """Prepare a raw dataset for later training and testing.

    Rows that contain the missing-value marker are removed and, when an
    identifier column is given, that column is dropped so it cannot be used
    as a feature.

    Args:
        df: Raw DataFrame as read from file.
        missing_values: Marker that denotes a missing entry, or ``None`` when
            the dataset has none.
        id_column: Label of an identifier column to drop, or ``None`` to keep
            every column.

    Returns:
        The cleaned DataFrame.
    """
    train = delete_missing_value(df, missing_values)
    # Use the argument rather than the notebook-level ``index_column`` global,
    # so the result does not depend on which cell was executed last.
    if id_column is not None:
        train = train.drop(columns = [id_column])
    return train

In [5]:
# Notebook-wide configuration read by train(), predict() and evaluate()

# Column holding a per-row identifier that should be removed before training.
# Passing None to preprocess() means no column is removed.
# NOTE(review): id_column is set to 0, but only index_column (None) is passed
# to preprocess() below, so column 0 stays in the data as a feature -- confirm
# this is intended.
id_column = 0
index_column = None
# Choose the dataset here; missing_values is the marker used for missing
# entries (None = no marker)
missing_values = None
train_data = preprocess(wdbc, missing_values,index_column)

'''
    datasets_types can be 
    NOMINAL: NOMINAL ATTRIBUTES DATASETS, 
    NUMERIC: NUMERIC ATTRIBUTES DATASETS, 
    ORDINAL: ORDINAL ATTRIBUTES DATASETS, 
    MIX: DATASETS WITH A MIX OF ATTRIBUTE TYPES
'''
# Type of the dataset selected above

datasets_types = "NUMERIC"

# 0 represents nominal, 1 represents ordinal, 2 represents numeric
# Only consulted when datasets_types equals "MIX"
feature_types = [2, 0, 2, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 0]
# Label of the column that holds the class
class_column = 1

# Probability given to an attribute value that never occurs together with a
# class (find_nominal_llh stores its log2)
EPSILON = 1/(2*train_data.shape[0])

# Separate the class labels from the feature columns
label = train_data.iloc[:,class_column]
train_set = train_data.drop(columns=class_column)

# Column labels that train() and predict() iterate over
feature_columns = list(train_set.columns)

# Show some useful information about the data
print("Data Shape =====>", train_data.shape)
print("======================================================")
print("EPSILON ========>", EPSILON)
print("======================================================")
print("Some Data Examples:")
display(train_data.head(3))
print("======================================================")
print("Data Type:")
display(train_data.dtypes)

Data Shape =====> (569, 32)
Some Data Examples:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


Data Type:


0       int64
1      object
2     float64
3     float64
4     float64
5     float64
6     float64
7     float64
8     float64
9     float64
10    float64
11    float64
12    float64
13    float64
14    float64
15    float64
16    float64
17    float64
18    float64
19    float64
20    float64
21    float64
22    float64
23    float64
24    float64
25    float64
26    float64
27    float64
28    float64
29    float64
30    float64
31    float64
dtype: object

In [6]:
def find_nominal_prob(class_column):
    """Return the log2 relative frequency of every distinct value in a column.

    Applied to the class column this yields the log2 priors; applied to the
    values of one feature within one class it yields log2 likelihoods.

    Args:
        class_column: pandas Series of discrete values.

    Returns:
        Dict mapping each distinct value to ``log2(count / len(class_column))``,
        ordered as ``value_counts()`` orders the values.
    """
    n_rows = len(class_column)
    return {
        value: math.log2(count / n_rows)
        for value, count in class_column.value_counts().items()
    }

In [7]:
# Show the log2 class priors computed from the class column of train_data
print(find_nominal_prob(train_data[class_column]))

{'B': -0.6725045782774821, 'M': -1.424364387743383}


In [8]:
def find_nominal_llh(attributes, llh_dict, data, feature_row):
    """Add the per-class log2 likelihoods of one nominal feature to ``llh_dict``.

    For every class in ``attributes`` the log2 relative frequency of each
    value of ``feature_row`` among that class's rows is stored under
    ``llh_dict[class][feature_row]``. Values that occur in ``data`` but never
    together with the class receive ``log2(EPSILON)``.

    Args:
        attributes: Iterable of class labels.
        llh_dict: Dict keyed by class label; updated in place.
        data: DataFrame holding the feature and the notebook-level
            ``class_column``.
        feature_row: Label of the feature column.

    Returns:
        The same ``llh_dict`` object.
    """
    observed_values = data[feature_row].value_counts().keys()
    for class_value in attributes:
        in_class = data[class_column] == class_value
        log_probs = find_nominal_prob(data[in_class][feature_row])
        for value in observed_values:
            if value not in log_probs:
                # Epsilon smoothing for a value never seen with this class
                log_probs[value] = np.log2(EPSILON)
        per_feature = llh_dict[class_value]
        if per_feature:
            per_feature[feature_row] = log_probs
        else:
            # First feature stored for this class (entry is still None/empty)
            llh_dict[class_value] = {feature_row: log_probs}

    return llh_dict

In [9]:
def find_numerical_llh(attributes, llh_dict, data, feature):
    """Store the per-class mean and standard deviation of one numeric feature.

    The two values are saved under ``llh_dict[class][feature]`` with the keys
    ``"mean"`` and ``"std"``; predict() turns them into a Gaussian likelihood.

    Args:
        attributes: Iterable of class labels.
        llh_dict: Dict keyed by class label; updated in place.
        data: DataFrame holding the feature and the notebook-level
            ``class_column``.
        feature: Label of the feature column.

    Returns:
        The same ``llh_dict`` object.
    """
    for class_value in attributes:
        values = data[data[class_column] == class_value][feature]
        gaussian_params = {"mean": np.mean(values), "std": np.std(values)}
        per_feature = llh_dict[class_value]
        if per_feature:
            per_feature[feature] = gaussian_params
        else:
            # First feature stored for this class (entry is still None/empty)
            llh_dict[class_value] = {feature: gaussian_params}
    return llh_dict

In [10]:
def train(data):
    """Build the naive Bayes model: class priors plus per-feature likelihoods.

    The class column, the feature columns and their types are taken from the
    notebook-level ``class_column``, ``feature_columns``, ``datasets_types``
    and ``feature_types``.

    Args:
        data: Training DataFrame.

    Returns:
        Tuple ``(prior, llh_dict)`` where ``prior`` maps class -> log2 prior
        and ``llh_dict`` maps class -> feature -> likelihood parameters.
    """
    prior = find_nominal_prob(data[class_column])
    llh_dict = dict.fromkeys(prior)
    for column in feature_columns:
        # A feature is Gaussian when the whole dataset is numeric, or when a
        # mixed dataset marks this column with type 2.
        is_numeric = datasets_types == "NUMERIC" or (
            datasets_types == "MIX" and feature_types[column] == 2
        )
        fit_feature = find_numerical_llh if is_numeric else find_nominal_llh
        fit_feature(prior, llh_dict, data, column)

    return prior, llh_dict

# Fit the model on train_data; the results are reused by the cells below
prior, llh_dict = train(train_data)


In [11]:
def predict(instance, prior, llh_dict):
    """Predict the class of one instance with the trained naive Bayes model.

    The score of a class is its log2 prior plus the sum of the log2
    likelihoods of the instance's feature values. Which features are visited
    and whether each is Gaussian or nominal comes from the notebook-level
    ``feature_columns``, ``datasets_types`` and ``feature_types``.

    Args:
        instance: Row (e.g. a pandas Series) indexable by every feature label.
        prior: Dict mapping class -> log2 prior, as returned by ``train``.
        llh_dict: Dict mapping class -> feature -> likelihood parameters, as
            returned by ``train``.

    Returns:
        The class with the highest score (the first one on ties), or ``""``
        when ``prior`` is empty.
    """
    min_std = 1 * 10 ** -8  # keeps the Gaussian scale strictly positive
    min_pdf = 1 * 10 ** -8  # floor used when the density underflows to 0

    all_prob = {}
    for attribute, log_prior in prior.items():
        score = log_prior
        for i in feature_columns:
            if(datasets_types == "NUMERIC" or (datasets_types == "MIX" and feature_types[i] == 2)):
                params = llh_dict[attribute][i]
                # Only the standard deviation is clamped. Clamping the mean as
                # well would replace every negative class mean by ~0.
                std = max(params["std"], min_std)
                density = stats.norm.pdf(x=instance[i], loc=params["mean"], scale=std)
                if density > 0.0:
                    score += np.log2(density)
                else:
                    score += np.log2(min_pdf)
            else:
                value_probs = llh_dict[attribute][i]
                value = instance[i]
                if value in value_probs:
                    score += value_probs[value]
                else:
                    # A value absent from the training data gets the same
                    # epsilon log-probability that training gives to values
                    # unseen with a class, instead of raising KeyError.
                    score += np.log2(EPSILON)
        all_prob[attribute] = score

    # Start from -inf rather than a fixed sentinel so that very small scores
    # (many features) still produce a prediction.
    max_key = ""
    max_prob = float("-inf")
    for candidate, score in all_prob.items():
        if score > max_prob:
            max_prob = score
            max_key = candidate
    return max_key

In [12]:
# Example: predict the class of the first row of train_data
predict(train_data.iloc[0], prior, llh_dict)

'M'

In [13]:
def evaluate(data, prior, llh_dict, interesting_class = None):
    """Evaluate the model by comparing its predictions to the true labels.

    Every row of ``data`` is classified with ``predict`` and compared with
    the value in the notebook-level ``class_column``. A summary line
    ``correct <n_correct> <n_total>`` is printed.

    Args:
        data: DataFrame holding the features and the true class labels.
        prior: Dict mapping class -> log2 prior, as returned by ``train``.
        llh_dict: Likelihood parameters, as returned by ``train``.
        interesting_class: Optional class treated as the positive class. When
            it is not ``None``, precision and recall for that class are
            reported too.

    Returns:
        Dict with ``"correct_rate"`` and, when ``interesting_class`` is given,
        ``"precision"`` and ``"recall"``. A ratio whose denominator is zero
        is reported as 0.0.
    """
    total = data.shape[0]
    correct = 0
    TP = 0
    FP = 0
    FN = 0
    # Compare against None explicitly: a class label such as 0 is falsy but
    # is still a valid positive class.
    report_positive = interesting_class is not None

    for _, row in data.iterrows():
        actual_class = row[class_column]
        predict_class = predict(row, prior, llh_dict)
        if actual_class == predict_class:
            correct += 1
        if report_positive:
            actual_positive = actual_class == interesting_class
            predicted_positive = predict_class == interesting_class
            if actual_positive and predicted_positive:
                TP += 1
            elif predicted_positive:
                # Predicted the positive class for a row of another class
                FP += 1
            elif actual_positive:
                # Missed a row that really belongs to the positive class
                FN += 1
            # Otherwise neither label is the positive class: a true negative,
            # which precision and recall do not use.

    result = {"correct_rate": correct/total if total else 0.0}

    if report_positive:
        result["precision"] = TP/(TP + FP) if (TP + FP) else 0.0
        result["recall"] = TP/(TP + FN) if (TP + FN) else 0.0
    print("correct",correct,total)
    return result
# %time
# Report the accuracy on train_data, i.e. the same data the model was fitted on
print("======================================================")
print("Overall Accuracy is: ")
print(evaluate(train_data, prior, llh_dict))

Overall Accuracy is: 
correct 533 569
{'correct_rate': 0.9367311072056239}


In [14]:
# A helper function for later use
def calc_outcome(data):
    """Train on ``data`` and evaluate the resulting model on the same ``data``.

    Args:
        data: DataFrame used both for training and for evaluation.

    Returns:
        The result dict produced by ``evaluate``.
    """
    class_priors, likelihoods = train(data)
    return evaluate(data, class_priors, likelihoods)

In [15]:
# Example: train and evaluate on train_data in a single call
calc_outcome(train_data)

correct 533 569


{'correct_rate': 0.9367311072056239}

## Questions 


If you are in a group of 1, you will respond to question (1), and **one** other of your choosing (two responses in total).

If you are in a group of 2, you will respond to question (1) and question (2), and **two** others of your choosing (four responses in total). 

A response to a question should take about 100–250 words, and make reference to the data wherever possible.

#### NOTE: you may develop code or functions in response to the questions, but your formal answers should be added to a separate file.

### Q1
Try discretising the numeric attributes in these datasets and treating them as discrete variables in the naïve Bayes classifier. You can use a discretisation method of your choice and group the numeric values into any number of levels (but around 3 to 5 levels would probably be a good starting point). Does discretising the variables improve classification performance, compared to the Gaussian naïve Bayes approach? Why or why not?

In [23]:
# The wine dataset is used for this question

# Column holding a per-row identifier that should be removed before training.
# NOTE(review): id_column is assigned here, but preprocess() below is called
# with index_column -- confirm which of the two names is meant.
id_column = None

# Choose the dataset here; missing_values is the marker used for missing
# entries (None = no marker)
missing_values = None
train_data = preprocess(wine, missing_values,index_column)

'''
    datasets_types can be 
    NOMINAL: NOMINAL ATTRIBUTES DATASETS, 
    NUMERIC: NUMERIC ATTRIBUTES DATASETS, 
    ORDINAL: ORDINAL ATTRIBUTES DATASETS, 
    MIX: DATASETS WITH A MIX OF ATTRIBUTE TYPES
'''
# Type of the dataset selected above

datasets_types = "NUMERIC"

# 0 represents nominal, 1 represents ordinal, 2 represents numeric
# Only consulted when datasets_types is "MIX"
feature_types = []
# Label of the column that holds the class
class_column = 0

# Probability given to an attribute value that never occurs together with a
# class (find_nominal_llh stores its log2)
EPSILON = 1/(2*train_data.shape[0])

# Separate the class labels from the feature columns
label = train_data.iloc[:,class_column]
train_set = train_data.drop(columns=class_column)

# Column labels that train(), predict() and discretise() iterate over
feature_columns = list(train_set.columns)


def discretise(data, levels, by, columns=None):
    """Return a copy of ``data`` whose numeric features are binned into levels.

    Args:
        data: DataFrame to discretise; it is not modified.
        levels: Number of bins per feature (positive integer).
        by: ``'width'`` for equal-width bins; any other value selects
            equal-frequency bins.
        columns: Optional labels of the columns to discretise. When omitted,
            the notebook-level ``feature_columns`` are used, restricted to the
            numeric ones according to ``datasets_types`` / ``feature_types``.

    Returns:
        A new DataFrame in which the binned values of every selected column
        are replaced by the strings ``'level1'`` ... ``'level<levels>'``.
        With equal width ``'level1'`` is the lowest bin; with equal frequency
        the numbering runs the other way (``'level<levels>'`` holds the
        smallest values). The labels are only used as nominal categories.
    """
    if columns is None:
        # Fall back to the notebook configuration (type 2 marks a numeric feature)
        columns = [
            col for col in feature_columns
            if datasets_types == "NUMERIC"
            or (datasets_types == "MIX" and feature_types[col] == 2)
        ]

    copy_data = data.copy()
    for col in columns:
        # Select by label: labels and positions differ once a column was dropped
        feature = data[col]
        # Object dtype so that string labels can be stored next to numbers
        binned = feature.astype(object)

        # Equal Width
        if by == 'width':
            maximum = np.max(feature)
            minimum = np.min(feature)
            width = (maximum - minimum) / levels  # width of each level
            # Walk the thresholds from the top down: each pass relabels every
            # value at or below the threshold with the next lower level.
            for j in range(levels):
                binned[feature <= (maximum - width * j)] = 'level' + str(levels - j)

        # Equal Frequency
        else:
            ordered_index = feature.sort_values().index  # sort once, not per level
            n_rows = len(ordered_index)
            for m in range(levels):
                # Integer arithmetic keeps the bounds exact, so the last bin
                # always ends at n_rows.
                lower = n_rows * m // levels
                upper = n_rows * (m + 1) // levels
                binned.loc[ordered_index[lower:upper]] = 'level' + str(levels - m)

        copy_data[col] = binned
    return copy_data

In [24]:
# Discretise train_data twice -- 3 equal-width levels and 10 equal-frequency
# levels -- and display both results
width_data = discretise(train_data,levels=3,by='width')
freq_data = discretise(train_data,levels=10,by='freq')
print("======================================================")
print("Equal Width:")
display(width_data)
print("======================================================")
print("Equal Frequency:")
display(freq_data)

Equal Width:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,level3,level1,level2,level1,level2,level2,level2,level1,level2,level2,level2,level3,level2
1,1,level2,level1,level2,level1,level1,level2,level2,level1,level1,level1,level2,level3,level2
2,1,level2,level1,level3,level2,level2,level2,level2,level1,level3,level2,level2,level3,level2
3,1,level3,level1,level2,level1,level2,level3,level2,level1,level2,level2,level1,level3,level3
4,1,level2,level2,level3,level2,level2,level2,level2,level2,level2,level1,level2,level2,level1
5,1,level3,level1,level2,level1,level2,level3,level2,level2,level2,level2,level2,level2,level3
6,1,level3,level1,level2,level1,level1,level2,level2,level1,level2,level2,level2,level3,level3
7,1,level3,level1,level3,level2,level2,level2,level2,level2,level1,level1,level2,level3,level3
8,1,level3,level1,level2,level1,level1,level2,level2,level1,level2,level2,level2,level2,level2
9,1,level3,level1,level2,level1,level1,level3,level2,level1,level2,level2,level2,level3,level2


Equal Frequency:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,level1,level7,level4,level10,level1,level3,level2,level7,level2,level4,level5,level1,level2
1,1,level5,level6,level9,level10,level5,level4,level3,level9,level8,level6,level4,level2,level2
2,1,level5,level4,level2,level6,level5,level3,level1,level6,level1,level4,level5,level3,level2
3,1,level1,level5,level4,level8,level2,level1,level1,level9,level2,level2,level7,level2,level1
4,1,level5,level4,level1,level4,level1,level3,level3,level4,level4,level6,level5,level4,level5
5,1,level1,level6,level4,level10,level2,level1,level1,level6,level3,level3,level4,level5,level1
6,1,level1,level5,level4,level10,level6,level5,level4,level7,level3,level5,level5,level1,level1
7,1,level2,level4,level2,level8,level1,level4,level4,level6,level8,level5,level4,level1,level1
8,1,level1,level8,level8,level10,level6,level3,level2,level7,level3,level5,level4,level5,level3
9,1,level2,level9,level7,level9,level5,level2,level2,level9,level4,level2,level5,level1,level3


In [25]:
# Peek at the equal-width result, then count the rows per level in column 4
# of the equal-frequency result
display(width_data.head(5))
freq_data[4].value_counts()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,level3,level1,level2,level1,level2,level2,level2,level1,level2,level2,level2,level3,level2
1,1,level2,level1,level2,level1,level1,level2,level2,level1,level1,level1,level2,level3,level2
2,1,level2,level1,level3,level2,level2,level2,level2,level1,level3,level2,level2,level3,level2
3,1,level3,level1,level2,level1,level2,level3,level2,level1,level2,level2,level1,level3,level3
4,1,level2,level2,level3,level2,level2,level2,level2,level2,level2,level1,level2,level2,level1


level4     18
level9     18
level6     18
level1     18
level8     18
level3     18
level2     18
level7     18
level10    17
level5     17
Name: 4, dtype: int64

In [26]:
# The discretised columns hold level labels, so every feature is treated as
# nominal from here on
datasets_types = "NOMINAL"
calc_outcome(width_data)

correct 172 178


{'correct_rate': 0.9662921348314607}

In [27]:
# Train and evaluate on the equal-frequency discretisation
calc_outcome(freq_data)

correct 176 178


{'correct_rate': 0.9887640449438202}

### Q2
Implement a baseline model (e.g., random or 0R) and compare the performance of the naïve Bayes classifier to this baseline on multiple datasets. Discuss why the baseline performance varies across datasets, and to what extent the naïve Bayes classifier improves on the baseline performance.

In [28]:
def zero_r_evaluate(data,label_index):
    """Return the accuracy of the 0R baseline on ``data``.

    0R always predicts the most frequent class, so its accuracy is the share
    of rows whose label equals that class.

    Args:
        data: DataFrame containing the class column.
        label_index: Label of the class column.

    Returns:
        Fraction of rows labelled with the majority class.
    """
    labels = data[label_index]
    majority_class = labels.value_counts().idxmax()
    hits = sum(1 for value in labels.values.tolist() if majority_class == value)
    return hits / data.shape[0]

In [29]:
# 0R baseline accuracy on several datasets. Each call passes the dataset's
# missing-value marker to preprocess() and the class column label to
# zero_r_evaluate().
print("Some 0R baseline model for varius dataset:")

print()

print("0R baseline model performance for ADULT dataset:")
adult_0R_data = preprocess(adult,"?") 
print(zero_r_evaluate(adult_0R_data,14))
print("======================================================")

# NOTE(review): label index 14 is the same as in the adult call above --
# confirm that column 14 really is the class column of the mushroom data.
print("0R baseline model performance for MUSHROOM dataset:")
mushroom_0R_data = preprocess(mushroom,"?") 
print(zero_r_evaluate(mushroom_0R_data,14))
print("======================================================")

print("0R baseline model performance for BANK dataset:")
bank_0R_data = preprocess(bank,None) 
print(zero_r_evaluate(bank_0R_data,14))
print("======================================================")

print("0R baseline model performance for WINE dataset:")
wine_0R_data = preprocess(wine,None) 
print(zero_r_evaluate(wine_0R_data,0))
print("======================================================")

Some 0R baseline model for varius dataset:

0R baseline model performance for ADULT dataset:
0.7510775147536636
0R baseline model performance for MUSHROOM dataset:
0.5556343019135365
0R baseline model performance for BANK dataset:
0.8830151954170445
0R baseline model performance for WINE dataset:
0.398876404494382


### Q3
Since it’s difficult to model the probabilities of ordinal data, ordinal attributes are often treated as either nominal variables or numeric variables. Compare these strategies on the ordinal datasets provided. Determine which approach gives higher classification accuracy and discuss why.

In [30]:
# The car dataset is used for this question; show its first rows
car.head(10)

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc
7,vhigh,vhigh,2,2,big,med,unacc
8,vhigh,vhigh,2,2,big,high,unacc
9,vhigh,vhigh,2,4,small,low,unacc


In [31]:
# Map every ordinal category of the car data to an integer rank, keeping the
# untouched frame in car_origin. Approach taken from:
# https://stackoverflow.com/questions/21818886/changing-ordinal-character-data-to-numeric-data-with-pandas
# dict.get returns None for a category that is missing from a mapping.
car_origin = pd.read_csv("datasets/car.data", header = None)
car = car_origin.copy()
buying_conv_dict= {'low':0,'med':1,'high':2,'vhigh':3}
car[0]=car[0].apply(buying_conv_dict.get)
maint_conv_dict = {'low':0,'med':1,'high':2,'vhigh':3}
car[1]=car[1].apply(maint_conv_dict.get)
doors_conv_dict = {'2':1, '3':2, '4':3, '5more': 4}
car[2]=car[2].apply(doors_conv_dict.get)
persons_conv_dict = {'2':0, '4':1, 'more':2}
car[3]=car[3].apply(persons_conv_dict.get)
lug_boot_conv_dict = {'small':0,'med':1, 'big':2}
car[4]=car[4].apply(lug_boot_conv_dict.get)
safety_conv_dict = {'low':0 , 'med':1, 'high':2}
car[5]=car[5].apply(safety_conv_dict.get)
class_conv_dict = {'unacc':0 , 'acc':1, 'good':2, 'vgood':3}
car[6]=car[6].apply(class_conv_dict.get)

# Strategy 1: treat the ordinal attributes as nominal (original category labels)
datasets_types = "NOMINAL"
class_column = 6
label = car_origin.iloc[:,class_column]
train_set = car_origin.drop(columns=class_column)
feature_columns = list(train_set.columns)
prior, llh_dict = train(car_origin)
print("when datasets_types is NOMINAL: ")
print(evaluate(car_origin, prior, llh_dict))

# Strategy 2: treat the ordinal attributes as numeric (integer ranks);
# feature_columns from the nominal run is reused
datasets_types = "NUMERIC"
class_column = 6
prior, llh_dict = train(car)
print("when datasets_types is NUMERIC: ")

print(evaluate(car, prior, llh_dict))


when datasets_types is NOMINAL: 
correct 1511 1729
{'correct_rate': 0.8739155581260845}
when datasets_types is NUMERIC: 
correct 1098 1729
{'correct_rate': 0.6350491613649508}


In [None]:
# Show the first rows of the nursery dataset
nursery.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority
5,usual,proper,complete,1,convenient,convenient,slightly_prob,not_recom,not_recom
6,usual,proper,complete,1,convenient,convenient,problematic,recommended,priority
7,usual,proper,complete,1,convenient,convenient,problematic,priority,priority
8,usual,proper,complete,1,convenient,convenient,problematic,not_recom,not_recom
9,usual,proper,complete,1,convenient,inconv,nonprob,recommended,very_recom


In [None]:
# Keep the untouched nursery data in nursery_origin and build an
# integer-coded copy in nursery_numeric
nursery_origin = pd.read_csv("datasets/nursery.data", header = None)

nursery_numeric = nursery_origin.copy()

# Map every category to an integer rank, one mapping per column.
# dict.get returns None for a category that is missing from a mapping.
parents_dict = {'usual':0, 'pretentious':1, 'great_pret':2}
nursery_numeric[0]=nursery_numeric[0].apply(parents_dict.get)
has_nurs_dict ={'proper':0, 'less_proper':1, 'improper':2, 'critical':3, 'very_crit':4}
nursery_numeric[1]=nursery_numeric[1].apply(has_nurs_dict.get)
form_dict = {'complete':0, 'completed':1, 'incomplete':2, 'foster':3}
nursery_numeric[2]=nursery_numeric[2].apply(form_dict.get)
children_dict ={'1':0, '2':1, '3':2, 'more':3}
nursery_numeric[3]=nursery_numeric[3].apply(children_dict.get)
housing_dict = {'convenient':0, 'less_conv':1, 'critical':2}
nursery_numeric[4]=nursery_numeric[4].apply(housing_dict.get)
finance_dict = {'convenient':0, 'inconv':1}
nursery_numeric[5]=nursery_numeric[5].apply(finance_dict.get)
social_dict = {'nonprob':0, 'slightly_prob':1, 'problematic':2}
nursery_numeric[6]=nursery_numeric[6].apply(social_dict.get)
health_dict = {'recommended':0, 'priority':1, 'not_recom':2}
nursery_numeric[7]=nursery_numeric[7].apply(health_dict.get)
class_dict = {'not_recom':0, 'recommend':1, 'very_recom':2, 'priority':3, 'spec_prior':4}
nursery_numeric[8]=nursery_numeric[8].apply(class_dict.get)

# Strategy 1: treat the ordinal attributes as nominal (original category labels)
datasets_types = "NOMINAL"
class_column = 8
label = nursery_origin.iloc[:,class_column]
train_set = nursery_origin.drop(columns=class_column)
feature_columns = list(train_set.columns)
prior, llh_dict = train(nursery_origin)
print("when datasets_types is NOMINAL: ")

print(evaluate(nursery_origin, prior, llh_dict))

# Strategy 2: treat the ordinal attributes as numeric (integer ranks);
# feature_columns from the nominal run is reused
datasets_types = "NUMERIC"
class_column = 8
prior, llh_dict = train(nursery_numeric)
print("when datasets_types is NUMERIC: ")

print(evaluate(nursery_numeric, prior, llh_dict))

when datasets_types is NOMINAL: 


### Q4
Evaluating the model on the same data that we use to train the model is considered to be a major mistake in Machine Learning. Implement a hold–out or cross–validation evaluation strategy (you should implement this yourself and do not simply call existing implementations from `scikit-learn`). How does your estimate of effectiveness change, compared to testing on the training data? Explain why. (The result might surprise you!)

In [None]:
# Hold-out split
def separate_data(data, missing_values, train_frac=0.8, random_state=123):
    """Split a dataset into hold-out training and test partitions.

    The data is first cleaned with ``preprocess``; a random sample of the rows
    then becomes the training partition and the remaining rows the test
    partition.

    Args:
        data: Raw DataFrame.
        missing_values: Marker that denotes a missing entry, or ``None``.
        train_frac: Fraction of the cleaned rows used for training.
        random_state: Seed passed to ``DataFrame.sample`` so the split is
            reproducible.

    Returns:
        Tuple ``(X_train, X_test)`` of DataFrames with disjoint row labels.
    """
    data = preprocess(data, missing_values)
    X_train = data.sample(frac=train_frac, random_state=random_state)
    # Everything that was not sampled for training is held out for testing
    X_test = data.drop(index=X_train.index)
    return X_train, X_test

In [None]:
def hold_out_outcome(data, test_data=None):
    """Train on ``data`` and evaluate the model on held-out ``test_data``.

    Args:
        data: Training DataFrame.
        test_data: DataFrame to evaluate on. When omitted, the model is
            evaluated on ``data`` itself, which keeps the one-argument call
            working.

    Returns:
        The result dict produced by ``evaluate``.
    """
    data_prior, data_llh_dict = train(data)
    eval_data = data if test_data is None else test_data
    # evaluate() expects (data, prior, llh_dict); the data argument was
    # previously missing from this call.
    return evaluate(eval_data, data_prior, data_llh_dict)

In [None]:
# Split the adult dataset ('?' is its missing-value marker) into training and
# test partitions and preview the training part
adult_train,adult_test = separate_data(adult,'?')
adult_train.head()


In [None]:
# Fit the model on the training partition of the adult data.
# NOTE(review): train() and evaluate() read the notebook globals class_column,
# feature_columns and datasets_types, which were last assigned for the nursery
# dataset -- confirm they are set for adult before this cell is run.
# NOTE(review): the model is evaluated on adult_train, not on the held-out
# adult_test -- confirm this is intended for a hold-out experiment.
adult_prior,adult_llh_dict=train(adult_train)
audlt_eval = evaluate(adult_train,adult_prior,adult_llh_dict)