# Naïve Bayes Classification

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

## Load the dataset weather.csv.

In [5]:
# Read the weather dataset into a DataFrame (path is relative to the notebook's working directory).
df = pd.read_csv('data/weather.csv')

# Preview the first rows, followed by a blank line to separate the two reports.
print(df.head(), '\n')
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

    outlook  temperature  humidity  windy play
0     sunny           85        85  False   no
1     sunny           80        90   True   no
2  overcast           83        86  False  yes
3     rainy           70        96  False  yes
4     rainy           68        80  False  yes 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   outlook      14 non-null     object
 1   temperature  14 non-null     int64 
 2   humidity     14 non-null     int64 
 3   windy        14 non-null     bool  
 4   play         14 non-null     object
dtypes: bool(1), int64(2), object(2)
memory usage: 594.0+ bytes
None


## Convert categorical variables to dummy variables

In [6]:
# One-hot encode the categorical/boolean columns, then cast the whole frame to float.
# dtype=float only applies to the newly created indicator columns; the trailing
# astype(float) also converts the integer 'temperature' and 'humidity' columns.
categorical_columns = ['outlook', 'windy', 'play']
df_dummies = pd.get_dummies(df, columns=categorical_columns, dtype=float).astype(float)

print(df_dummies.head())

   temperature  humidity  outlook_overcast  outlook_rainy  outlook_sunny  \
0         85.0      85.0               0.0            0.0            1.0   
1         80.0      90.0               0.0            0.0            1.0   
2         83.0      86.0               1.0            0.0            0.0   
3         70.0      96.0               0.0            1.0            0.0   
4         68.0      80.0               0.0            1.0            0.0   

   windy_False  windy_True  play_no  play_yes  
0          1.0         0.0      1.0       0.0  
1          0.0         1.0      1.0       0.0  
2          1.0         0.0      0.0       1.0  
3          1.0         0.0      0.0       1.0  
4          1.0         0.0      0.0       1.0  


## Drop the redundant target column 'play_no' (it is the complement of 'play_yes').

In [7]:
# 'play_no' and 'play_yes' are complementary indicators of the same label,
# so only 'play_yes' is kept to serve as the target.
df_dummies = df_dummies.drop('play_no', axis=1)

## Separate the features from the target attribute

In [8]:
# Show every row whose label is "yes" as a quick check of the encoding.
print(df_dummies[df_dummies['play_yes'].eq(1.0)])

# Target vector: the 'play_yes' indicator (1.0 = play, 0.0 = don't play).
y = df_dummies['play_yes']
# Feature matrix: every remaining column.
X = df_dummies.drop(columns=['play_yes'])

    temperature  humidity  outlook_overcast  outlook_rainy  outlook_sunny  \
2          83.0      86.0               1.0            0.0            0.0   
3          70.0      96.0               0.0            1.0            0.0   
4          68.0      80.0               0.0            1.0            0.0   
6          64.0      65.0               1.0            0.0            0.0   
8          69.0      70.0               0.0            0.0            1.0   
9          75.0      80.0               0.0            1.0            0.0   
10         75.0      70.0               0.0            0.0            1.0   
11         72.0      90.0               1.0            0.0            0.0   
12         81.0      75.0               1.0            0.0            0.0   

    windy_False  windy_True  play_yes  
2           1.0         0.0       1.0  
3           1.0         0.0       1.0  
4           1.0         0.0       1.0  
6           0.0         1.0       1.0  
8           1.0         0.0  

## Construct and train a Gaussian Naive Bayes classifier

In [9]:
# GaussianNB treats every feature (including the 0/1 indicator columns)
# as normally distributed within each class.
model = GaussianNB()
# Fit on the full dataset; no hold-out split is made in this notebook.
model.fit(X=X, y=y)

## Determine the class label for a new day

In [10]:
# Describe the new day as a single record whose keys match the training
# feature columns: sunny, windy, temperature 66, humidity 90.
new_day = pd.DataFrame([{
    'temperature': 66,
    'humidity': 90,
    'outlook_overcast': 0,
    'outlook_rainy': 0,
    'outlook_sunny': 1,
    'windy_False': 0,
    'windy_True': 1,
}]).astype(float)  # cast to float to match the dtype of the training features

# predict() returns one label per row; there is exactly one row here.
prediction = model.predict(new_day)
print(f"The class label of 'play_yes' for the new day is: {prediction[0]}")

The class label of 'play_yes' for the new day is: 0.0


## Print out the likelihoods (posterior class probabilities from `predict_proba`) of play = yes and play = no

In [11]:
# predict_proba() yields one row per sample, with columns ordered like
# model.classes_ (sorted labels): first play_yes == 0.0 ("no"), then 1.0 ("yes").
probabilities = model.predict_proba(new_day)
p_no, p_yes = probabilities[0]

print(f"Likelihood of play = yes: {p_yes}")
print(f"Likelihood of play = no: {p_no}")

Likelihood of play = yes: 4.531659245941506e-05
Likelihood of play = no: 0.9999546834075409
