In [310]:
# Naive Bayes Algorithm Undestanding

In [311]:
columns = ['outlook', 'temperature', 'humidity', 'windy', 'play']
data = [
    ['rainy',    'hot',  'high',  'false', 'no'], 
    ['rainy',    'hot',  'high',  'true',  'no'], 
    ['overcast', 'hot',  'high',  'false', 'yes'], 
    ['sunny',    'mild', 'high',  'false', 'yes'], 
    ['sunny',    'cool', 'normal','false', 'yes'], 
    ['sunny',    'cool', 'normal','true',  'no'], 
    ['overcast', 'cool', 'normal','true',  'yes'], 
    ['rainy',    'mild', 'high',  'false', 'no'], 
    ['rainy',    'cool', 'normal','false', 'yes'], 
    ['sunny',    'mild', 'normal','false', 'yes'], 
    ['rainy',    'mild', 'normal','true',  'yes'], 
    ['overcast', 'mild', 'high',  'true',  'yes'], 
    ['overcast', 'hot',  'normal','false', 'yes'], 
    ['sunny',    'mild', 'high',  'true',  'no'], 
]

In [312]:
import pandas as pd
df = pd.DataFrame(data, columns = columns)

df

Unnamed: 0,outlook,temperature,humidity,windy,play
0,rainy,hot,high,False,no
1,rainy,hot,high,True,no
2,overcast,hot,high,False,yes
3,sunny,mild,high,False,yes
4,sunny,cool,normal,False,yes
5,sunny,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,rainy,mild,high,False,no
8,rainy,cool,normal,False,yes
9,sunny,mild,normal,False,yes


**The Bayes Theorem**

$
\Large
P(c | x) = \frac{P(x | c) \cdot P(c)}{P(x)} 
$

where:
 - P(c): prior probability of class
 - P(x): prior probability of predictor
 - P(c | x): posterior probability of class given an predictor ( an attribute)
 - P(x | c): likehood - the probability of an predictor given a class

and

$
\Large
P(x | c) = P(x_1 | c) \cdot P(x_2 | c) \cdot \ldots P(x_n | c)  
$


Then:

$
\Large
P(c | x) = P(x | c) = P(x_1 | c) \cdot P(x_2 | c) \cdot \ldots P(x_n | c) \cdot P(c)
$

In [313]:
prob = {} # defaultdict(int)

prob['yes'] = len(df[df.play == 'yes'])/ len(df)
prob['no']  = len(df[df.play == 'no'])/ len(df)

In [314]:
targets = ('yes', 'no')
feature_values = {
    'outlook':('sunny', 'overcast', 'rainy'), 
    'temperature': ('hot', 'mild', 'cool'),
    'humidity': ('high', 'normal'), 
    'windy': ('false', 'true')
}

In [315]:
# Prior and Posterior probabilities from data frequency
for target in targets:
    for feature in feature_values:
        for feature_value in feature_values[feature]:
            # frequency of this target
            frequency_target = len(df[(df.play==target)])
            
            # frequency attribute with target
            frequency_atrb_value = len(df[(df[feature]==feature_value)])
            
            # frequency attribute with target
            frequency_atrb_with_target = len(df[(df[feature]==feature_value) & (df.play=='yes')])
            
            prob['%s' %(feature_value)] = frequency_atrb_value  / len(df)
            
            prob['%s|%s' %(feature_value,target)] = frequency_atrb_with_target / frequency_target
            
for index, key in enumerate(prob):
    print('%d. p(%s) = %.2f' %(index, key, prob[key]))

0. p(yes) = 0.64
1. p(no) = 0.36
2. p(sunny) = 0.36
3. p(sunny|yes) = 0.33
4. p(overcast) = 0.29
5. p(overcast|yes) = 0.44
6. p(rainy) = 0.36
7. p(rainy|yes) = 0.22
8. p(hot) = 0.29
9. p(hot|yes) = 0.22
10. p(mild) = 0.43
11. p(mild|yes) = 0.44
12. p(cool) = 0.29
13. p(cool|yes) = 0.33
14. p(high) = 0.50
15. p(high|yes) = 0.33
16. p(normal) = 0.50
17. p(normal|yes) = 0.67
18. p(false) = 0.57
19. p(false|yes) = 0.67
20. p(true) = 0.43
21. p(true|yes) = 0.33
22. p(sunny|no) = 0.60
23. p(overcast|no) = 0.80
24. p(rainy|no) = 0.40
25. p(hot|no) = 0.40
26. p(mild|no) = 0.80
27. p(cool|no) = 0.60
28. p(high|no) = 0.60
29. p(normal|no) = 1.20
30. p(false|no) = 1.20
31. p(true|no) = 0.60


In [316]:
# Play golf or not ?
x = {
    'outlook':'rainy', 
    'temperature':'cool', 
    'humidity':'high', 
    'windy':'true', 
    'play_golf':'?'}

In [317]:
# prob (yes | x)
p_yes_x = prob['rainy|yes'] * prob['cool|yes'] *prob['high|yes'] *prob['true|yes'] * prob['yes']

p_yes_x

0.005291005291005291

In [318]:
# prob (no | x)
p_no_x = prob['rainy|no'] * prob['cool|no'] *prob['high|no'] *prob['true|no'] * prob['no']

p_no_x

0.030857142857142854

In [319]:
# Standardze the posterior probability between 0 and 1
p_yes_x / (p_yes_x+p_no_x)

0.14637002341920374

In [320]:
# Standardze the posterior probability between 0 and 1
p_no_x / (p_yes_x+p_no_x)

0.8536299765807962

**Numerical values**
- Common practice: let's assume normal distribution to this feature values

Probability Density Function for Normal Distribution: ...

$
f(x, \mu_{c}, \sigma_{c}) = 
$

Mean: 

$
\mu_c = \frac{1}{n}\sum_{i=1}^{n} x_i
$

Standard deviation:

$
\sigma_c = \sqrt{\frac{1}{n-1} \sum_{i=1}^{n} (x_i - \mu)^2}
$


In [321]:
df = pd.get_dummies(df, columns=['outlook', 'temperature', 'humidity', 'windy'])

In [322]:
df

Unnamed: 0,play,outlook_overcast,outlook_rainy,outlook_sunny,temperature_cool,temperature_hot,temperature_mild,humidity_high,humidity_normal,windy_false,windy_true
0,no,0,1,0,0,1,0,1,0,1,0
1,no,0,1,0,0,1,0,1,0,0,1
2,yes,1,0,0,0,1,0,1,0,1,0
3,yes,0,0,1,0,0,1,1,0,1,0
4,yes,0,0,1,1,0,0,0,1,1,0
5,no,0,0,1,1,0,0,0,1,0,1
6,yes,1,0,0,1,0,0,0,1,0,1
7,no,0,1,0,0,0,1,1,0,1,0
8,yes,0,1,0,1,0,0,0,1,1,0
9,yes,0,0,1,0,0,1,0,1,1,0


In [323]:
df.columns

Index(['play', 'outlook_overcast', 'outlook_rainy', 'outlook_sunny',
       'temperature_cool', 'temperature_hot', 'temperature_mild',
       'humidity_high', 'humidity_normal', 'windy_false', 'windy_true'],
      dtype='object')

In [324]:
targets = ('yes', 'no')
feature_values = {
    'outlook_overcast':(0,1), 'outlook_rainy':(0,1), 'outlook_sunny':(0,1),
       'temperature_cool':(0,1), 'temperature_hot':(0,1), 'temperature_mild':(0,1),
       'humidity_high':(0,1), 'humidity_normal':(0,1), 'windy_false':(0,1), 'windy_true':(0,1)
}

In [325]:
prob = {} # defaultdict(int)

prob['yes'] = len(df[df.play == 'yes'])/ len(df)
prob['no']  = len(df[df.play == 'no'])/ len(df)

In [345]:
# Prior and Posterior probabilities from data frequency
for target in targets:
    for feature in feature_values:
        for feature_value in feature_values[feature]:
            subdataset = df[(df[feature]==feature_value) & (df.play == target)]
            u = subdataset[feature].mean()
            o = subdataset[feature].std()
            print('u', u)
            print('o',o)
            #prob['%s=%s|%s' %(feature,feature_value,target)] = '?' #'g(x, %.2f, %.2f)' %(u, o)            
        
    
    
    
#for index, key in enumerate(prob):
#    print('%d. p(%s) = %s' %(index, key, prob[key]))

u 0.0
o 0.0
u 1.0
o 0.0
u 0.0
o 0.0
u 1.0
o 0.0
u 0.0
o 0.0
u 1.0
o 0.0
u 0.0
o 0.0
u 1.0
o 0.0
u 0.0
o 0.0
u 1.0
o 0.0
u 0.0
o 0.0
u 1.0
o 0.0
u 0.0
o 0.0
u 1.0
o 0.0
u 0.0
o 0.0
u 1.0
o 0.0
u 0.0
o 0.0
u 1.0
o 0.0
u 0.0
o 0.0
u 1.0
o 0.0
u 0.0
o 0.0
u nan
o nan
u 0.0
o 0.0
u 1.0
o 0.0
u 0.0
o 0.0
u 1.0
o 0.0
u 0.0
o 0.0
u 1.0
o nan
u 0.0
o 0.0
u 1.0
o 0.0
u 0.0
o 0.0
u 1.0
o 0.0
u 0.0
o nan
u 1.0
o 0.0
u 0.0
o 0.0
u 1.0
o nan
u 0.0
o 0.0
u 1.0
o 0.0
u 0.0
o 0.0
u 1.0
o 0.0
