#### Import the necessary libraries

In [84]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
import warnings
from sklearn.preprocessing import LabelEncoder

In [85]:
warnings.filterwarnings('ignore')

#### Readin the file 

In [86]:
train_data = pd.read_csv('weather_dataset_Continuous.csv')

In [87]:
train_data

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Class
0,Rainy,Hot,5,92,No
1,Rainy,Hot,6,56,Yes
2,Overcast,Hot,10,67,Yes
3,Sunny,Mild,50,88,No
4,Sunny,Cool,21,34,Yes
5,Sunny,Cool,30,21,No
6,Overcast,Cool,42,75,Yes
7,Rainy,Mild,67,45,No
8,Rainy,Cool,35,65,Yes
9,Sunny,Mild,30,71,Yes


In [88]:
train_data.shape

(14, 5)

In [89]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Outlook      14 non-null     object
 1   Temperature  14 non-null     object
 2   Humidity     14 non-null     int64 
 3   Windy        14 non-null     int64 
 4   Class        14 non-null     object
dtypes: int64(2), object(3)
memory usage: 688.0+ bytes


#### 'Class' is the target label.

In [90]:
#calculating all the probabilities using training data

total_count = train_data.shape[0]

Yes_data = train_data[train_data['Class'] == 'Yes']
No_data = train_data[train_data['Class'] == 'No']

Yes_count = Yes_data.shape[0]
No_count = No_data.shape[0]

In [91]:
train_data

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Class
0,Rainy,Hot,5,92,No
1,Rainy,Hot,6,56,Yes
2,Overcast,Hot,10,67,Yes
3,Sunny,Mild,50,88,No
4,Sunny,Cool,21,34,Yes
5,Sunny,Cool,30,21,No
6,Overcast,Cool,42,75,Yes
7,Rainy,Mild,67,45,No
8,Rainy,Cool,35,65,Yes
9,Sunny,Mild,30,71,Yes


#### calculate individual values for each feature and the corresponding categorical value

#### 1. Data distribution for Yes

In [92]:
data_distribution = pd.DataFrame([], columns=['Total', 'Outlook_rainy', 'Outlook_Overcast', 'Outlook_Sunny',
                                              'Temperature_Hot', 'Temperature_Mild', 'Temperature_Cool',
                                             'Humidity_mean', 'Humidity_std','Windy_mean', 'Windy_std' ])

In [93]:
yes_outlook_rainy = Yes_data[Yes_data['Outlook'] == 'Rainy'].shape[0]
yes_outlook_Overcast = Yes_data[Yes_data['Outlook'] == 'Overcast'].shape[0]
yes_outlook_Sunny = Yes_data[Yes_data['Outlook'] == 'Sunny'].shape[0]

yes_Temperature_Hot = Yes_data[Yes_data['Temperature'] == 'Hot'].shape[0]
yes_Temperature_Mild = Yes_data[Yes_data['Temperature'] == 'Mild'].shape[0]
yes_Temperature_Cool = Yes_data[Yes_data['Temperature'] == 'Cool'].shape[0]

yes_Humidity_mean = np.mean(Yes_data['Humidity'])
yes_Humidity_std = np.std(Yes_data['Humidity'])

yes_Windy_mean = np.mean(Yes_data['Windy'])
yes_Windy_std = np.std(Yes_data['Windy'])

In [94]:
data_distribution.loc['Yes', 'Total'] = Yes_count
data_distribution.loc['Yes', 'Outlook_rainy'] = yes_outlook_rainy
data_distribution.loc['Yes', 'Outlook_Overcast'] = yes_outlook_Overcast
data_distribution.loc['Yes', 'Outlook_Sunny'] = yes_outlook_Sunny
data_distribution.loc['Yes', 'Temperature_Hot'] = yes_Temperature_Hot
data_distribution.loc['Yes', 'Temperature_Mild'] = yes_Temperature_Mild
data_distribution.loc['Yes', 'Temperature_Cool'] = yes_Temperature_Cool
data_distribution.loc['Yes', 'Humidity_mean'] = yes_Humidity_mean
data_distribution.loc['Yes', 'Humidity_std'] = yes_Humidity_std
data_distribution.loc['Yes', 'Windy_mean'] = yes_Windy_mean
data_distribution.loc['Yes', 'Windy_std'] = yes_Windy_std

In [95]:
print('Total counts of Yes:', Yes_count)
print('\nrainy Outlook under Yes:', yes_outlook_rainy)
print('Overcast Outlook under Yes:', yes_outlook_Overcast)
print('Sunny Outlook under Yes:', yes_outlook_Sunny)
print('\nHot Temperature under Yes:', yes_Temperature_Hot)
print('Mild Temperature under Yes:', yes_Temperature_Mild)
print('Cool Temperature under Yes:', yes_Temperature_Cool)
print('\nHigh Humidity under Yes:', yes_Humidity_mean)
print('Normal Humidity under Yes:', yes_Humidity_std)
print('\nYes Windy under Yes:', yes_Windy_mean)
print('No Windy under Yes:', yes_Windy_std)

Total counts of Yes: 9

rainy Outlook under Yes: 3
Overcast Outlook under Yes: 4
Sunny Outlook under Yes: 2

Hot Temperature under Yes: 3
Mild Temperature under Yes: 3
Cool Temperature under Yes: 3

High Humidity under Yes: 32.55555555555556
Normal Humidity under Yes: 20.597255001097764

Yes Windy under Yes: 53.111111111111114
No Windy under Yes: 17.697736480782222


#### 2. Data distribution for No

In [96]:
no_outlook_rainy = No_data[No_data['Outlook'] == 'Rainy'].shape[0]
no_outlook_Overcast = No_data[No_data['Outlook'] == 'Overcast'].shape[0]
no_outlook_Sunny = No_data[No_data['Outlook'] == 'Sunny'].shape[0]

no_Temperature_Hot = No_data[No_data['Temperature'] == 'Hot'].shape[0]
no_Temperature_Mild = No_data[No_data['Temperature'] == 'Mild'].shape[0]
no_Temperature_Cool = No_data[No_data['Temperature'] == 'Cool'].shape[0]

no_Humidity_mean = np.mean(No_data['Humidity'])
no_Humidity_std = np.std(No_data['Humidity'])

no_Windy_mean = np.mean(No_data['Windy'])
no_Windy_std = np.std(No_data['Windy'])

In [97]:
data_distribution.loc['No', 'Total'] = No_count
data_distribution.loc['No', 'Outlook_rainy'] = no_outlook_rainy
data_distribution.loc['No', 'Outlook_Overcast'] = no_outlook_Overcast
data_distribution.loc['No', 'Outlook_Sunny'] = no_outlook_Sunny
data_distribution.loc['No', 'Temperature_Hot'] = no_Temperature_Hot
data_distribution.loc['No', 'Temperature_Mild'] = no_Temperature_Mild
data_distribution.loc['No', 'Temperature_Cool'] = no_Temperature_Cool
data_distribution.loc['No', 'Humidity_mean'] = no_Humidity_mean
data_distribution.loc['No', 'Humidity_std'] = no_Humidity_std
data_distribution.loc['No', 'Windy_mean'] = no_Windy_mean
data_distribution.loc['No', 'Windy_std'] = no_Windy_std

In [98]:
print('Total counts of No:', No_count)
print('\nrainy Outlook under No:', no_outlook_rainy)
print('Overcast Outlook under No:', no_outlook_Overcast)
print('Sunny Outlook under No:', no_outlook_Sunny)
print('\nHot Temperature under No:', no_Temperature_Hot)
print('Mild Temperature under No:', no_Temperature_Mild)
print('Cool Temperature under No:', no_Temperature_Cool)
print('\nHigh Humidity under No:', no_Humidity_mean)
print('Normal Humidity under No:', no_Humidity_std)
print('\nYes Windy under No:', no_Windy_mean)
print('No Windy under No:', no_Windy_std)

Total counts of No: 5

rainy Outlook under No: 2
Overcast Outlook under No: 0
Sunny Outlook under No: 3

Hot Temperature under No: 1
Mild Temperature under No: 3
Cool Temperature under No: 1

High Humidity under No: 46.6
Normal Humidity under No: 26.8968399630886

Yes Windy under No: 62.6
No Windy under No: 26.717784339274843


In [99]:
data_distribution

Unnamed: 0,Total,Outlook_rainy,Outlook_Overcast,Outlook_Sunny,Temperature_Hot,Temperature_Mild,Temperature_Cool,Humidity_mean,Humidity_std,Windy_mean,Windy_std
Yes,9,3,4,2,3,3,3,32.5556,20.5973,53.1111,17.6977
No,5,2,0,3,1,3,1,46.6,26.8968,62.6,26.7178


In [113]:
data_distribution.to_csv('data.csv')

#### lets pick  and see the probability for the below condition
Outlook = 'Rainy' ,
Temperature = 'Mild' ,
Humidity = 20 ,
Windy = 19

#### Calculate the probability density function

In [100]:
#yes_humidity
numerator_yes = np.exp(-((20-yes_Humidity_mean)**2) / (2 * (yes_Humidity_std ** 2)) )
denominator_yes = np.sqrt(2 * 3.14 * (yes_Humidity_std ** 2))
yes_Humidity = numerator_yes/denominator_yes

#yes_windy
numerator_yes = np.exp(-((19-yes_Windy_mean)**2) / (2 * (yes_Windy_std ** 2)) )
denominator_yes = np.sqrt(2 * 3.14 * (yes_Windy_std ** 2))
yes_Windy = numerator_yes/denominator_yes

#no_humidity
numerator_no = np.exp(-((20-no_Humidity_mean)**2) / (2 * (no_Humidity_std ** 2)) )
denominator_no = np.sqrt(2 * 3.14 * (no_Humidity_std ** 2))
no_Humidity = numerator_no/denominator_no

#no_windy
numerator_no = np.exp(-((19-no_Windy_mean)**2) / (2 * (no_Windy_std ** 2)) )
denominator_no = np.sqrt(2 * 3.14 * (no_Windy_std ** 2))
no_Windy = numerator_no/denominator_no

In [101]:

#probability for Yes
# P('Outlook_rainy'/'Yes') * P('Temperature_Mild'/'Yes') * P('Humidity'/'Yes') * P('Windy'/Yes')
P_Yes = (yes_outlook_rainy / Yes_count) * (yes_Temperature_Mild / Yes_count) * (yes_Humidity) * (yes_Windy) * (Yes_count / total_count)

#probability for No
P_No = (no_outlook_rainy / No_count) * (no_Temperature_Mild / No_count) * (no_Humidity) * (no_Windy) * (No_count / total_count)


In [102]:
print('The probability of Yes:', P_Yes)
print('The probability of No:', P_No)

The probability of Yes: 4.04389833825209e-06
The probability of No: 3.0757201129330552e-06


Conclusion: 
There is a higher chance for 'Yes'. 

#### Redo the same process using sklearn package

In [103]:
train_data['Outlook'] = train_data['Outlook'].replace({'Rainy':0, 'Overcast':1, 'Sunny':2})
train_data['Temperature'] = train_data['Temperature'].replace({'Hot':0, 'Mild':1, 'Cool':2})
train_data['Class'] = train_data['Class'].replace({'No':0, 'Yes':1})

In [104]:
train_data

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Class
0,0,0,5,92,0
1,0,0,6,56,1
2,1,0,10,67,1
3,2,1,50,88,0
4,2,2,21,34,1
5,2,2,30,21,0
6,1,2,42,75,1
7,0,1,67,45,0
8,0,2,35,65,1
9,2,1,30,71,1


In [105]:
train_x = train_data.drop('Class', axis=1)
train_y = train_data['Class']

In [106]:
train_data.columns

Index(['Outlook', 'Temperature', 'Humidity', 'Windy', 'Class'], dtype='object')

In [107]:
test_x = pd.DataFrame([], columns=['Outlook', 'Temperature', 'Humidity', 'Windy'])

In [108]:
test_x['Outlook'] = ['Rainy']
test_x['Temperature'] = 'Mild'
test_x['Humidity'] = 20
test_x['Windy'] = 19

test_x['Outlook'] = test_x['Outlook'].replace({'Rainy':0, 'Overcast':1, 'Sunny':2})
test_x['Temperature'] = test_x['Temperature'].replace({'Hot':0, 'Mild':1, 'Cool':2})


In [109]:
test_x

Unnamed: 0,Outlook,Temperature,Humidity,Windy
0,0,1,20,19


#### Use Multinomial NB

In [110]:
gaussian = GaussianNB()

In [111]:
gaussian.fit(train_x, train_y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [112]:
gaussian.predict(test_x)

array([1], dtype=int64)

The prediction is 'Yes'.