In [80]:
import numpy as np
import pandas as pd

In [81]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [82]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/golf.csv')

In [83]:
data

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [84]:
def priorProbability(data, target):
  n = data.shape[0]
  n_class = (data['Play'] == target).sum()
  return n_class/n

In [85]:
prior = {
    'yes' : priorProbability(data,'yes'),
    'no' : priorProbability(data,'no')
}
prior

{'yes': 0.6428571428571429, 'no': 0.35714285714285715}

In [86]:
def conditionalProbability(data, X, X_val, target):
  filtered_data = data[data['Play'] == target]
  n = filtered_data.shape[0]
  # n_X = np.sum(filtered_data[X] == X_val)
  n_X = (filtered_data[X] == X_val).sum()
  return n_X/n

In [87]:
conditionalProbability(data, 'Humidity', 'normal', 'no')

0.2

In [88]:
cond_prob = {}
features = list(data.columns)[:-1]
features

['Outlook', 'Temperature', 'Humidity', 'Windy']

In [89]:
def featureVals(data, feature):
  feature_val = data[feature].unique()
  print(feature_val)

In [90]:
featureVals(data, 'Outlook')

['sunny' 'overcast' 'rainy']


In [91]:
data['Play'].unique()

array(['no', 'yes'], dtype=object)

In [92]:
def conditionalProbabilityAll(data):
  features = list(data.columns)[:-1]
  cond_prob = {}

  for target in data['Play'].unique():
    cond_prob[target] = {}

    for feature in features:
      cond_prob[target][feature] = {}
      feature_val = data[feature].unique()

      for val in feature_val:
        cp = round(conditionalProbability(data, feature, val, target), 2)
        cond_prob[target][feature][val] = cp
        print(target, feature, val, cp)
    print()

  return cond_prob

In [93]:
cps = conditionalProbabilityAll(data)

no Outlook sunny 0.6
no Outlook overcast 0.0
no Outlook rainy 0.4
no Temperature hot 0.4
no Temperature mild 0.4
no Temperature cool 0.2
no Humidity high 0.8
no Humidity normal 0.2
no Windy False 0.4
no Windy True 0.6

yes Outlook sunny 0.22
yes Outlook overcast 0.44
yes Outlook rainy 0.33
yes Temperature hot 0.22
yes Temperature mild 0.44
yes Temperature cool 0.33
yes Humidity high 0.33
yes Humidity normal 0.67
yes Windy False 0.67
yes Windy True 0.33



In [94]:
print(cps)

{'no': {'Outlook': {'sunny': 0.6, 'overcast': 0.0, 'rainy': 0.4}, 'Temperature': {'hot': 0.4, 'mild': 0.4, 'cool': 0.2}, 'Humidity': {'high': 0.8, 'normal': 0.2}, 'Windy': {False: 0.4, True: 0.6}}, 'yes': {'Outlook': {'sunny': 0.22, 'overcast': 0.44, 'rainy': 0.33}, 'Temperature': {'hot': 0.22, 'mild': 0.44, 'cool': 0.33}, 'Humidity': {'high': 0.33, 'normal': 0.67}, 'Windy': {False: 0.67, True: 0.33}}}


In [95]:
import pprint

In [96]:
pprint.pprint(cps, sort_dicts=False)

{'no': {'Outlook': {'sunny': 0.6, 'overcast': 0.0, 'rainy': 0.4},
        'Temperature': {'hot': 0.4, 'mild': 0.4, 'cool': 0.2},
        'Humidity': {'high': 0.8, 'normal': 0.2},
        'Windy': {False: 0.4, True: 0.6}},
 'yes': {'Outlook': {'sunny': 0.22, 'overcast': 0.44, 'rainy': 0.33},
         'Temperature': {'hot': 0.22, 'mild': 0.44, 'cool': 0.33},
         'Humidity': {'high': 0.33, 'normal': 0.67},
         'Windy': {False: 0.67, True: 0.33}}}


In [97]:
Xt = ['sunny', 'hot', 'normal', False]

In [98]:
def predict(data, Xt):
  features = list(data.columns)[:-1]

  for target in data['Play'].unique():
    prior = priorProbability(data, target)
    likelihood = 1.0

    for i in range(len(features)):
      X = features[i]
      X_val = Xt[i]
      likelihood = likelihood*cps[target][X][X_val]

    posterior = prior*likelihood
    print(target,posterior,prior,likelihood)

In [99]:
predict(data, Xt)

no 0.006857142857142858 0.35714285714285715 0.019200000000000002
yes 0.013967202857142858 0.6428571428571429 0.02172676


# Naive Bayes Using sklearn

In [100]:
from sklearn.preprocessing import LabelEncoder

In [101]:
le1 = LabelEncoder()
data['Outlook'] = le1.fit_transform(data['Outlook'])

In [102]:
le2 = LabelEncoder()
data['Temperature'] = le2.fit_transform(data['Temperature'])

In [103]:
le3 = LabelEncoder()
data['Humidity'] = le3.fit_transform(data['Humidity'])

In [104]:
le4 = LabelEncoder()
data['Windy'] = le4.fit_transform(data['Windy'])

In [105]:
le5 = LabelEncoder()
data['Play'] = le5.fit_transform(data['Play'])

In [106]:
data

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,2,1,0,0,0
1,2,1,0,1,0
2,0,1,0,0,1
3,1,2,0,0,1
4,1,0,1,0,1
5,1,0,1,1,0
6,0,0,1,1,1
7,2,2,0,0,0
8,2,0,1,0,1
9,1,2,1,0,1


In [52]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

In [54]:
from sklearn.naive_bayes import CategoricalNB

In [56]:
model = CategoricalNB()

In [57]:
model.fit(X,y)

In [107]:
Xt = ["sunny", "hot", "normal", False]

In [108]:
le1.transform(['sunny'])

array([2])

In [109]:
le2.transform(['hot'])

array([1])

In [110]:
le3.transform(['normal'])

array([1])

In [111]:
le4.transform([False])

array([0])

In [115]:
Xt = np.array([[2,1,1,0]])

In [116]:
model.predict(Xt)



array([1])

In [117]:
model.predict_proba(Xt)



array([[0.33508723, 0.66491277]])