In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.naive_bayes import GaussianNB

import sys
# Record the interpreter version next to the results for reproducibility
print(sys.version)

golf_file = "golf.csv"

# Read the semicolon-separated data set. The `with` block guarantees the file
# is closed even if parsing raises; `golf_file_handler` stays bound to the
# (closed) handle afterwards, as it was after the explicit close() before.
with open(golf_file, "r") as golf_file_handler:
    golf_data = pd.read_csv(golf_file_handler, sep=";")

golf_data.head(100)

3.7.4 (default, Aug  9 2019, 18:34:13) [MSC v.1915 64 bit (AMD64)]


Unnamed: 0,Outlook,Temperature,Temperature_Category,Humidity,Humidity_Category,Windy,Play
0,overcast,83,hot,86,high,False,yes
1,overcast,64,cool,65,normal,True,yes
2,overcast,72,mild,90,high,True,yes
3,overcast,81,hot,75,normal,False,yes
4,rainy,70,mild,96,high,False,yes
5,rainy,68,cool,80,normal,False,yes
6,rainy,65,cool,70,normal,True,no
7,rainy,75,mild,80,normal,False,yes
8,rainy,71,mild,91,high,True,no
9,sunny,85,hot,85,high,False,no


In [2]:
# Data cleaning: replace the Outlook strings with integer codes
d = {'sunny': 1, 'overcast': 2, 'rainy': 3}


def _encode_outlook(value):
    """Return the integer code for one Outlook entry.

    Values that are already one of the codes in `d` are returned unchanged, so
    this cell can be re-run without a KeyError. Anything that is neither a
    known label nor a known code still raises KeyError, so bad data fails loudly.
    """
    if value in d:
        return d[value]
    if value in d.values():
        return value
    raise KeyError(value)


golf_data["Outlook"] = [_encode_outlook(item) for item in golf_data["Outlook"]]
golf_data.head(100)

Unnamed: 0,Outlook,Temperature,Temperature_Category,Humidity,Humidity_Category,Windy,Play
0,2,83,hot,86,high,False,yes
1,2,64,cool,65,normal,True,yes
2,2,72,mild,90,high,True,yes
3,2,81,hot,75,normal,False,yes
4,3,70,mild,96,high,False,yes
5,3,68,cool,80,normal,False,yes
6,3,65,cool,70,normal,True,no
7,3,75,mild,80,normal,False,yes
8,3,71,mild,91,high,True,no
9,1,85,hot,85,high,False,no


In [3]:
# Hold out 30% of the rows as test data; the remaining 70% is the training set.
# The fixed random_state keeps the split identical on every run.
train, test = train_test_split(golf_data, random_state=0, test_size=0.3)

# Renumber the test rows from 0 so the test set can be used independently of
# the row labels it inherited from the full data set
test = test.reset_index(drop=True)

# Initialise Gaussian Naive Bayes
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
naive_b = GaussianNB()

# Let's take a look at the test set
columns_to_show = ['Outlook', 'Temperature', 'Humidity', 'Windy', 'Play']
print(test[columns_to_show])

   Outlook  Temperature  Humidity  Windy Play
0        3           71        91   True   no
1        3           65        70   True   no
2        3           70        96  False  yes
3        1           72        95  False   no
4        2           72        90   True  yes


In [4]:
# Columns used as model inputs; 'Play' is the label to learn
feature_columns = ['Outlook','Temperature','Humidity','Windy']

# Train the naive bayes model. The target is passed as a 1-D Series
# (train['Play']), not a one-column DataFrame, which is the shape
# scikit-learn expects and avoids its DataConversionWarning.
naive_b.fit(train[feature_columns], train['Play'])

# Build a dataframe to show the predicted vs expected values
prediction_array=naive_b.predict(test[feature_columns])
prediction_dataframe=pd.DataFrame(data=prediction_array, columns=["prediction"])
# Assign by position (.values) so the pairing with the predictions does not
# depend on how the rows of `test` happen to be labelled
prediction_dataframe['result']=test['Play'].values

print (prediction_dataframe)

# Use the score function and output the prediction accuracy
print ("Naive Bayes Accuracy:", naive_b.score(test[feature_columns], test['Play']))

  prediction result
0        yes     no
1        yes     no
2        yes    yes
3         no     no
4        yes    yes
Naive Bayes Accuracy: 0.6


  y = column_or_1d(y, warn=True)


In [5]:
# Evaluation
import math

# Confusion Matrix
def confusion_matrix(predicted, truth):
    """Count the four outcomes of a binary 'yes'/'no' classification.

    The two sequences are compared pairwise by position, so lists, arrays and
    pandas Series (whatever their index labels) are all handled the same way.

    Parameters:
        predicted: sized iterable of predicted labels.
        truth: sized iterable of true labels, same length as `predicted`.

    Returns:
        Tuple (true_positive, true_negative, false_positive, false_negative),
        where 'yes' is the positive class. Pairs containing any label other
        than 'yes' or 'no' are not counted.

    Raises:
        ValueError: if `predicted` and `truth` differ in length.
    """
    if len(predicted) != len(truth):
        raise ValueError(
            "predicted and truth must have the same length, got "
            + str(len(predicted)) + " and " + str(len(truth))
        )

    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0

    # zip walks the values in order; indexing with predicted[i] would be a
    # label lookup on a pandas Series and break on a non-default index
    for guess, actual in zip(predicted, truth):
        if guess == 'yes' and actual == 'yes':
            true_positive += 1
        elif guess == 'no' and actual == 'no':
            true_negative += 1
        elif guess == 'yes' and actual == 'no':
            false_positive += 1
        elif guess == 'no' and actual == 'yes':
            false_negative += 1

    return true_positive, true_negative, false_positive, false_negative

def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero.

    A metric is undefined when its confusion-matrix margin is empty (e.g. no
    positive samples in a small test set). NaN reports that instead of
    aborting the cell with ZeroDivisionError.
    """
    if denominator:
        return numerator / denominator
    return float('nan')


TP, TN, FP, FN = confusion_matrix(prediction_dataframe['prediction'], prediction_dataframe['result'])

print('True Positive:  '  + str(TP))
print('True Negative:  '  + str(TN))
print('False Positive: '  + str(FP))
print('False Negative: '  + str(FN))

# TPR - 1 if we never miss a playable game (yey!)
TPR=_ratio(TP, TP+FN)
print('TPR / Sensitivity:  '  + str(TPR))

# TNR - low if we stay in the rain a lot (buuh)
TNR=_ratio(TN, TN+FP)
print('TNR / Specificity:  '  + str(TNR))

# Accuracy - share of all predictions that were correct
Acc=_ratio(TP+TN, TP+TN+FP+FN)
print ("Accuracy:", str(Acc))

# PPR - share of 'yes' predictions that were correct: TP / (TP + FP)
PPR=_ratio(TP, TP+FP)
print('PPR:  '  + str(PPR))

# NPR - share of 'no' predictions that were correct: TN / (TN + FN)
NPR=_ratio(TN, FN+TN)
print('NPR:  '  + str(NPR))

# F-Score - 2*TP / (2*TP + FP + FN)
F=_ratio(2*TP, 2*TP+FP+FN)
print('F:  '  + str(F))

# BM - TPR + TNR - 1 (NaN if either rate is undefined)
print('BM:  '  + str(TPR+TNR-1))

# MK - PPR + NPR - 1 (NaN if either rate is undefined)
print('MK:  '  + str(PPR+NPR-1))

# MCC - undefined (NaN) when any row or column of the confusion matrix is empty
MCC=_ratio(TP*TN-FP*FN, math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)))
print('MCC:  '  + str(MCC))

True Positive:  2
True Negative:  1
False Positive: 2
False Negative: 0
TPR / Sensitivity:  1.0
TNR / Specificity:  0.3333333333333333
Accuracy: 0.6
PPR:  0.5
NPR:  1.0
F:  0.6666666666666666
BM:  0.33333333333333326
MK:  0.5
MCC:  0.4082482904638631
