# Naive Bayes Classifier in Python

## Import Libraries

In [None]:
# import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import datasets

## Import the dataset

In [None]:
# Load the Iris data and wrap it in a DataFrame with readable column names
iris = datasets.load_iris()
feature_cols = ['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid']
df = pd.DataFrame(iris.data, columns=feature_cols)
df['class'] = iris.target  # integer class label, appended as the last column
df.head()

Unnamed: 0,sepal_len,sepal_wid,petal_len,petal_wid,class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Exploratory data analysis

In [None]:
# view dimensions of dataset as a (rows, columns) tuple
df.shape

(150, 5)

The dataset has 150 entries/rows and 5 columns.

In [None]:
# view summary of dataset: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sepal_len  150 non-null    float64
 1   sepal_wid  150 non-null    float64
 2   petal_len  150 non-null    float64
 3   petal_wid  150 non-null    float64
 4   class      150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


In [None]:
# check missing values in dataset: number of null entries in each column
df.isnull().sum()

sepal_len    0
sepal_wid    0
petal_len    0
petal_wid    0
class        0
dtype: int64

There are no missing values in any column of the dataset.

In [None]:
# feature matrix: every column except the target column 'class'
x = df.drop(columns='class')
x.head()

Unnamed: 0,sepal_len,sepal_wid,petal_len,petal_wid
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
# target vector: the 'class' column, one label per row
y = df['class']
y

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: class, Length: 150, dtype: int64

In [None]:
# check labels in target variable: list the distinct class values
y.unique()

array([0, 1, 2])

In [None]:
# value counts for target variable: number of rows per class label
y.value_counts()

2    50
1    50
0    50
Name: class, dtype: int64

The target variable has 3 labels, and there are 50 entries for each label.

## Splitting dataset into Training and Testing Set

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
# NOTE(review): the split is not stratified, so the per-class counts in the
# training and testing sets are not guaranteed to stay balanced.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=7,
)

In [None]:
# number of rows in the training set and in the testing set
len(x_train), len(x_test)

(120, 30)

There are 120 entries in our training dataset and 30 entries in our testing dataset.

In [None]:
# class distribution within the training set
y_train.value_counts()

0    43
2    39
1    38
Name: class, dtype: int64

In [None]:
# # Optional: encode categorical feature columns with one-hot encoding.
# # Not needed for this dataset, because all four features are numeric.
# import category_encoders as ce

# encoder = ce.OneHotEncoder(cols=['columns', 'to', 'encode'])
# x_train = encoder.fit_transform(x_train)
# x_test = encoder.transform(x_test)

## Feature Scaling

In [None]:
# Standardise the features. The scaler is fitted on the training set only and
# then applied unchanged to the test set, so no test-set statistics are used
# when computing the scaling parameters.
# NOTE: x_train and x_test are overwritten with their scaled versions here.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

## Model Training

In [None]:
# train a Gaussian Naive Bayes classifier on the training set
from sklearn.naive_bayes import GaussianNB

# instantiate the model; no arguments are passed, so library defaults apply
gaussianNaiveBayes = GaussianNB()

# fit the model on the training features and their class labels; as the last
# expression of the cell, the result of fit() is shown as the cell output
gaussianNaiveBayes.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

## Prediction

In [None]:
# Predict a class label for every row of the test set
y_pred = gaussianNaiveBayes.predict(x_test)
y_pred

array([2, 1, 0, 1, 1, 0, 2, 1, 0, 1, 2, 1, 0, 2, 0, 2, 2, 2, 0, 0, 1, 2,
       1, 1, 2, 2, 1, 1, 2, 2])

## Accuracy of Model

In [None]:
# accuracy of model on testing dataset: compares the true labels (y_test)
# with the predicted labels (y_pred)
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)
acc

0.8333333333333334

Here, `y_test` holds the true class labels and `y_pred` holds the predicted class labels for the test set. An accuracy of about 0.83 means that 25 of the 30 test samples were classified correctly.

## Confusion Matrix

In [None]:
# Build and print the confusion matrix. With three classes this is a 3x3
# table of counts: each row corresponds to a true label (first argument),
# each column to a predicted label (second argument).
from sklearn.metrics import confusion_matrix

confusionMatrix = confusion_matrix(y_test, y_pred)
print('Confusion matrix\n', confusionMatrix)

Confusion matrix
 [[7 0 0]
 [0 9 3]
 [0 2 9]]


## Classification Report

In [None]:
# print per-class precision, recall, f1-score and support for the test set
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.82      0.75      0.78        12
           2       0.75      0.82      0.78        11

    accuracy                           0.83        30
   macro avg       0.86      0.86      0.86        30
weighted avg       0.84      0.83      0.83        30

