# Tennis Weather Dataset

In [1]:
import pandas as pd

In [2]:
# Load the tennis weather dataset from the local CSV file and display it.
tennis = pd.read_csv(filepath_or_buffer="dataset/tennis.csv")
tennis

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


# Naïve Bayesian 

NB의 기본 개념: sunny/mild/high/true 조건이 주어졌다고 가정했을 때, Yes/No가 될 확률을 각각
조건부 확률로 계산하여 더 높은 확률을 가지는 결과를 예측하는 방법

P(yes| sunny,mild,high,true) => sunny/mild/high/true 조건이 주어졌을시, Yes가 될 확률 <br/>
P(no| sunny,mild,high,true) => sunny/mild/high/true 조건이 주어졌을시, No가 될 확률 <br/>

계산방법: <br/>
P(yes| sunny,mild,high,true) = P(sunny,mild,high,true | yes) * P(yes) / P(sunny,mild,high,true) <br/>
P(no| sunny,mild,high,true) = P(sunny,mild,high,true | no) * P(no) / P(sunny,mild,high,true) <br/>

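NB는 각 조건이 클래스가 주어졌을 때 서로 독립이라고 가정하므로, 결합 확률은 개별 조건부 확률의 곱으로 계산한다: <br/>
P(sunny,mild,high,true | yes) = P(sunny | yes) * P(mild | yes) * P(high | yes) * P(true | yes)<br/>
P(sunny,mild,high,true | no) = P(sunny | no) * P(mild | no) * P(high | no) * P(true | no)<br/>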


## 데이터를 통해 실제 계산 적용해보기

### P(yes) / P(no) 계산법

In [43]:
# Count how many rows fall into each class of the target column "play".
tennis["play"].value_counts()

yes    9
no     5
Name: play, dtype: int64

In [44]:
# Prior class probabilities P(yes) and P(no): class count / total row count.
# The counts are looked up by label instead of by position, so the result does
# not depend on value_counts() happening to list "yes" first, and it avoids the
# integer-key positional fallback on a string-labelled Series (deprecated in
# pandas 2.x).
play_counts = tennis["play"].value_counts()
P_yes = play_counts.loc["yes"] / len(tennis)
P_no = play_counts.loc["no"] / len(tennis)

print("P(yes) = ",P_yes)
print("P(no) = ",P_no)

P(yes) =  0.6428571428571429
P(no) =  0.35714285714285715


### P(sunny | yes) ,P(mild | yes) ,P(high | yes) ,P(true | yes) 의 계산법

In [45]:
# Keep only the rows whose "play" value is "yes", then display them.
tennis_yes = tennis.loc[tennis["play"].eq("yes")]
tennis_yes

Unnamed: 0,outlook,temp,humidity,windy,play
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
6,overcast,cool,normal,True,yes
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes
10,sunny,mild,normal,True,yes
11,overcast,mild,high,True,yes
12,overcast,hot,normal,False,yes


In [46]:
# Frequency of each outlook value among the rows where play == "yes".
tennis_yes["outlook"].value_counts()

overcast    4
rainy       3
sunny       2
Name: outlook, dtype: int64

In [47]:
# Frequency of each temp value among the rows where play == "yes".
tennis_yes["temp"].value_counts()

mild    4
cool    3
hot     2
Name: temp, dtype: int64

In [48]:
# Frequency of each humidity value among the rows where play == "yes".
tennis_yes["humidity"].value_counts()

normal    6
high      3
Name: humidity, dtype: int64

In [49]:
# Frequency of each windy value among the rows where play == "yes".
tennis_yes["windy"].value_counts()

False    6
True     3
Name: windy, dtype: int64

In [66]:
# Class-conditional likelihoods for the query (sunny, mild, high, windy=True),
# each estimated as (matching rows in the "yes" subset) / (size of that subset).
n_yes = len(tennis_yes)
sunny_yes = tennis_yes["outlook"].value_counts().loc["sunny"] / n_yes
mild_yes = tennis_yes["temp"].value_counts().loc["mild"] / n_yes
high_yes = tennis_yes["humidity"].value_counts().loc["high"] / n_yes
true_yes = tennis_yes["windy"].value_counts().loc[True] / n_yes

print("P(sunny | yes) = ", sunny_yes)
print("P(mild | yes) = ", mild_yes)
print("P(high | yes) = ", high_yes)
print("P(true | yes) = ", true_yes)

# The joint likelihood is the product of the four per-feature likelihoods.
likelihood_yes = sunny_yes * mild_yes * high_yes * true_yes
print("P(sunny,mild,high,true | yes) =", likelihood_yes)

P(sunny | yes) =  0.2222222222222222
P(mild | yes) =  0.4444444444444444
P(high | yes) =  0.3333333333333333
P(true | yes) =  0.3333333333333333
P(sunny,mild,high,true | yes) = 0.010973936899862823


### P(sunny | no) ,P(mild | no) ,P(high | no) ,P(true | no) 의 계산법

In [67]:
# Keep only the rows whose "play" value is "no", then display them.
tennis_no = tennis.loc[tennis["play"].eq("no")]
tennis_no

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
5,rainy,cool,normal,True,no
7,sunny,mild,high,False,no
13,rainy,mild,high,True,no


In [68]:
# Frequency of each outlook value among the rows where play == "no".
tennis_no["outlook"].value_counts()

sunny    3
rainy    2
Name: outlook, dtype: int64

In [69]:
# Frequency of each temp value among the rows where play == "no".
tennis_no["temp"].value_counts()

mild    2
hot     2
cool    1
Name: temp, dtype: int64

In [70]:
# Frequency of each humidity value among the rows where play == "no".
tennis_no["humidity"].value_counts()

high      4
normal    1
Name: humidity, dtype: int64

In [55]:
# Frequency of each windy value among the rows where play == "no".
tennis_no["windy"].value_counts()

True     3
False    2
Name: windy, dtype: int64

In [71]:
# Class-conditional likelihoods for the query (sunny, mild, high, windy=True),
# each estimated as (matching rows in the "no" subset) / (size of that subset).
n_no = len(tennis_no)
sunny_no = tennis_no["outlook"].value_counts().loc["sunny"] / n_no
mild_no = tennis_no["temp"].value_counts().loc["mild"] / n_no
high_no = tennis_no["humidity"].value_counts().loc["high"] / n_no
true_no = tennis_no["windy"].value_counts().loc[True] / n_no

print("P(sunny | no) = ", sunny_no)
print("P(mild | no) = ", mild_no)
print("P(high | no) = ", high_no)
print("P(true | no) = ", true_no)

# The joint likelihood is the product of the four per-feature likelihoods.
likelihood_no = sunny_no * mild_no * high_no * true_no
print("P(sunny,mild,high,true | no) =", likelihood_no)

P(sunny | no) =  0.6
P(mild | no) =  0.4
P(high | no) =  0.8
P(true | no) =  0.6
P(sunny,mild,high,true | no) = 0.1152


### P(sunny,mild,high,true)의 계산

P(yes| sunny,mild,high,true) + P(no| sunny,mild,high,true) = 1 인것을 활용하여, <br/>
P(sunny,mild,high,true | yes) * P(yes) / P(sunny,mild,high,true) + P(sunny,mild,high,true | no) * P(no) / P(sunny,mild,high,true) = 1 이기 때문에, <br/>
P(sunny,mild,high,true) = P(sunny,mild,high,true | yes) * P(yes) + P(sunny,mild,high,true | no) * P(no)


In [72]:
# Evidence P(sunny,mild,high,true): sum over both classes of
# (joint likelihood given the class) * (class prior).
joint_yes = sunny_yes * mild_yes * high_yes * true_yes * P_yes
joint_no = sunny_no * mild_no * high_no * true_no * P_no
p_smhf = joint_yes + joint_no
print("P(sunny,mild,high,true) =", p_smhf)

P(sunny,mild,high,true) = 0.04819753086419753


### P(yes| sunny,mild,high,true)의 계산 
P(yes| sunny,mild,high,true) = P(sunny,mild,high,true | yes) * P(yes) / P(sunny,mild,high,true)

In [76]:
# Posterior of each class by Bayes' rule: likelihood * prior / evidence.
joint_yes = sunny_yes * mild_yes * high_yes * true_yes * P_yes
joint_no = sunny_no * mild_no * high_no * true_no * P_no
p_y = joint_yes / p_smhf
p_n = joint_no / p_smhf

print("P(yes| sunny,mild,high,true) = ", p_y)
print("P(no| sunny,mild,high,true) = ", p_n)

P(yes| sunny,mild,high,true) =  0.14637002341920372
P(no| sunny,mild,high,true) =  0.8536299765807962


## NB가 잘 적용된 사례: Spam Mail 분류

## NB의 장점:
- 적용하기 쉽고 알고리즘을 이해하기 쉽다. 
- 데이터셋이 많은 경우에도 잘 작용한다.


## NB의 단점:
- 데이터가 너무 적은 경우, 몇몇 조건에는 해당하는 확률이 부여되지 못해 0이 되는 부분이 존재 할 수 있다. 따라서 데이터가 너무 작은 경우의 사용에는 주의할 필요가 있다.

# Gaussian NB 적용시켜보기

In [88]:
# Convert the categorical feature columns into dummy (one-hot) variables.
X_train = pd.get_dummies(tennis[['outlook', 'temp', 'humidity', 'windy']])
# Keep the target as a 1-D Series: scikit-learn estimators expect y with shape
# (n_samples,), and a single-column DataFrame triggers a DataConversionWarning
# when passed to fit().
y_train = tennis['play']

In [90]:
# Display the encoded feature matrix to check the dummy columns and their order.
X_train

Unnamed: 0,windy,outlook_overcast,outlook_rainy,outlook_sunny,temp_cool,temp_hot,temp_mild,humidity_high,humidity_normal
0,False,0,0,1,0,1,0,1,0
1,True,0,0,1,0,1,0,1,0
2,False,1,0,0,0,1,0,1,0
3,False,0,1,0,0,0,1,1,0
4,False,0,1,0,1,0,0,0,1
5,True,0,1,0,1,0,0,0,1
6,True,1,0,0,1,0,0,0,1
7,False,0,0,1,0,0,1,1,0
8,False,0,0,1,1,0,0,0,1
9,False,0,1,0,0,0,1,0,1


In [91]:
# Query sample for P(yes | sunny, mild, high, true), encoded in the same
# column order as X_train.
test1 = [
    True,     # windy
    0, 0, 1,  # outlook: overcast, rainy, sunny
    0, 0, 1,  # temp: cool, hot, mild
    1, 0,     # humidity: high, normal
]

In [96]:
# Import the Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
import numpy as np

# Create a Gaussian classifier
model = GaussianNB()

# Train the model on the training set. scikit-learn expects a 1-D target, so
# flatten y_train first; this works whether it is a Series or a single-column
# DataFrame and avoids the DataConversionWarning for column-vector targets.
model.fit(X_train, np.ravel(y_train))

# Predict the class of the query sample. Wrap it in a DataFrame that carries
# the training column names so the features line up with the fitted model
# instead of being passed as an unlabeled list.
query = pd.DataFrame([test1], columns=X_train.columns)
predicted = model.predict(query)
print(predicted)

['no']


  return f(*args, **kwargs)
