# Download Data

In [16]:
import numpy as np
from kaggle.api.kaggle_api_extended import KaggleApi

# Create a Kaggle API client, authenticate it, and download the play-tennis
# dataset into the local data/ directory.
api = KaggleApi()
api.authenticate()  # NOTE(review): presumably relies on locally configured Kaggle credentials — confirm

# The loading cell further down opens data/play-tennis.zip, which is the file this call is expected to produce.
api.dataset_download_files("fredericobreno/play-tennis", path="data/")


# Load Data

In [22]:
from zipfile import ZipFile
import pandas as pd
import numpy as np

# Open the downloaded archive and list the members it contains.
# The handle stays open because the next cell reads the CSV member through `zip_file`.
zip_file = ZipFile("data/play-tennis.zip")
zip_file.namelist()

['play_tennis.csv']

In [3]:
# Load the first member of the archive as the dataset, using the first CSV
# column (named "day" in the output) as the row index.
# The member stream is opened in a `with` block so it is closed as soon as
# pandas has finished parsing; pandas does not close file objects it is handed.
with zip_file.open(zip_file.namelist()[0]) as csv_file:
    data = pd.read_csv(csv_file, index_col=0)
data

Unnamed: 0_level_0,outlook,temp,humidity,wind,play
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
D1,Sunny,Hot,High,Weak,No
D2,Sunny,Hot,High,Strong,No
D3,Overcast,Hot,High,Weak,Yes
D4,Rain,Mild,High,Weak,Yes
D5,Rain,Cool,Normal,Weak,Yes
D6,Rain,Cool,Normal,Strong,No
D7,Overcast,Cool,Normal,Strong,Yes
D8,Sunny,Mild,High,Weak,No
D9,Sunny,Cool,Normal,Weak,Yes
D10,Rain,Mild,Normal,Weak,Yes


# Naive Bayes

Reference: [Not so naive classification with the Naive Bayes classifier](http://shatterline.com/blog/2013/09/12/not-so-naive-classification-with-the-naive-bayes-classifier/)

In [53]:
# The unseen day to classify: one value for each of the four feature columns.
query = {
    "outlook": "Sunny",
    "temp": "Cool",
    "humidity": "High",
    "wind": "Strong",
}
query

{'outlook': 'Sunny', 'temp': 'Cool', 'humidity': 'High', 'wind': 'Strong'}

In [72]:
def naive_bayes(data, target, query):
    """Score every class of ``target`` for ``query`` with categorical Naive Bayes.

    For each class ``c`` the score is ``P(c) * prod_f P(f = query[f] | c)``,
    where the prior and the conditional likelihoods are relative frequencies
    taken from ``data``. The scores are unnormalized: they are proportional to
    the posterior and do not sum to 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Training table holding the feature columns and the ``target`` column.
    target : str
        Name of the class column in ``data``.
    query : dict
        Maps feature column names to the observed value of the instance to
        classify. Features missing from ``query`` are simply not used.

    Returns
    -------
    dict
        Maps each class label to its unnormalized posterior score (``float``).
        The same dict is also printed, as before.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data`` or if ``query`` names a
        column that is not a feature column of ``data``.
    """
    prior_prob = data[target].value_counts(normalize=True)
    feature_cols = data.columns.drop(target)

    # Start from the class priors and multiply in one likelihood per queried feature.
    posterior_prob = {cls: float(p) for cls, p in prior_prob.items()}

    for feature, value in query.items():
        if feature not in feature_cols:
            raise KeyError(f"{feature!r} is not a feature column of data")

        # P(feature = value | class) as a Series indexed by (class, value).
        likelihood = data.groupby(target)[feature].value_counts(normalize=True)

        for cls in posterior_prob:
            # A value never observed together with `cls` has no entry in the
            # table; treat it as probability 0 instead of raising KeyError.
            posterior_prob[cls] *= float(likelihood.get((cls, value), 0.0))

    print(posterior_prob)
    return posterior_prob



In [10]:
# Score the query day against every class of the "play" column.
# NOTE(review): the saved output of this cell is a NameError — presumably the cell
# defining `naive_bayes` had not been run in that kernel session; re-run the notebook top to bottom.
naive_bayes(data, "play", query)

NameError: name 'naive_bayes' is not defined

Since P(play=Yes | x′) is less than P(play=No | x′), we classify the new instance x′ as “No”.

In [11]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

In [12]:
# Split the table into the "play" target and the remaining feature columns,
# then create the (not yet fitted) multinomial Naive Bayes estimator.
y = data["play"]
X = data.drop(columns="play")
clf = MultinomialNB()
X

Unnamed: 0_level_0,outlook,temp,humidity,wind
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
D1,Sunny,Hot,High,Weak
D2,Sunny,Hot,High,Strong
D3,Overcast,Hot,High,Weak
D4,Rain,Mild,High,Weak
D5,Rain,Cool,Normal,Weak
D6,Rain,Cool,Normal,Strong
D7,Overcast,Cool,Normal,Strong
D8,Sunny,Mild,High,Weak
D9,Sunny,Cool,Normal,Weak
D10,Rain,Mild,Normal,Weak


In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
# Integer-encode every feature column: `apply` hands each column in turn to
# `fit_transform`, so the encoder is refit for each column.
# NOTE(review): the fitted encoders are not kept, so a new sample has to be
# encoded by hand with the same codes that appear in the output of this cell.
X_label = X.apply(LabelEncoder().fit_transform)
X_label

Unnamed: 0_level_0,outlook,temp,humidity,wind
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
D1,2,1,0,1
D2,2,1,0,0
D3,0,1,0,1
D4,1,2,0,1
D5,1,0,1,1
D6,1,0,1,0
D7,0,0,1,0
D8,2,2,0,1
D9,2,0,1,1
D10,1,2,1,1


In [15]:
# Fit the estimator on the integer-encoded features.
# NOTE(review): MultinomialNB models features as counts, whereas X_label holds
# category codes — a categorical Naive Bayes variant may be what is intended; confirm.
clf.fit(X_label, y)

In [17]:
# Encoded form of the example query, using the codes shown in X_label:
# outlook=Sunny -> 2, temp=Cool -> 0, humidity=High -> 0, wind=Strong -> 0.
# The row is wrapped in a DataFrame carrying the fitted feature names because
# `clf` was fitted on a DataFrame; passing a bare nested list makes
# scikit-learn warn that X does not have valid feature names.
# The columns of the result follow the order of clf.classes_.
query_encoded = pd.DataFrame([[2, 0, 0, 0]], columns=clf.feature_names_in_)
clf.predict_proba(query_encoded)



array([[0.63454143, 0.36545857]])

In [60]:
# Feature names the estimator recorded during fit, in column order.
clf.feature_names_in_

array(['outlook', 'temp', 'humidity', 'wind'], dtype=object)

In [18]:
# Log prior of each class, ordered as clf.classes_; with class counts of 5 and 9
# these are log(5/14) and log(9/14).
clf.class_log_prior_

array([-1.02961942, -0.44183275])

In [19]:
# Smoothed log-likelihood of each feature given the class: one row per class
# (ordered as clf.classes_), one column per feature.
clf.feature_log_prob_

array([[-0.84729786, -1.09861229, -2.35137526, -1.94591015],
       [-1.41706602, -1.09861229, -1.55059741, -1.55059741]])

In [23]:
# Log of the raw class frequencies of the target column.
np.log(y.value_counts())

Yes    2.197225
No     1.609438
Name: play, dtype: float64

In [24]:
# Number of training samples per class, and the per-class sum of each encoded
# feature column accumulated during fit.
clf.class_count_, clf.feature_count_

(array([5., 9.]),
 array([[ 8.,  6.,  1.,  2.],
        [ 7., 10.,  6.,  6.]]))

In [26]:
# Class labels (the row order of the per-class arrays) together with the fitted feature names.
clf.classes_, clf.feature_names_in_

(array(['No', 'Yes'], dtype='<U3'),
 array(['outlook', 'temp', 'humidity', 'wind'], dtype=object))

In [42]:
# Denominator of the smoothed feature likelihoods: per-class feature totals after
# adding 1 to every feature count (additive smoothing with alpha = 1).
# E.g. log((8 + 1) / 21) reproduces the first entry of clf.feature_log_prob_.
(clf.feature_count_ + 1).sum(axis=1)

array([21., 33.])

In [46]:
# Log of the per-class sample counts (numerator of the empirical class prior).
np.log(clf.class_count_)

array([1.60943791, 2.19722458])

In [48]:
# Log of the total sample count; clf.class_log_prior_ equals
# np.log(clf.class_count_) minus this value.
np.log(clf.class_count_.sum())

2.6390573296152584

In [49]:
# Log of a uniform prior: every class gets log(1 / n_classes).
n_classes = len(clf.classes_)
np.full(n_classes, -np.log(n_classes))

array([-0.69314718, -0.69314718])

In [51]:
# Array holding the constant 1 once per class.
np.full(n_classes, 1)

array([1, 1])