In [34]:

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import datasets
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly_express as px

# Load Iris dataset

In [80]:
df = px.data.iris()
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1


In [81]:
species =df['species']
df.drop('species', axis=1, inplace=True)

In [82]:
corr_matrix = df.corr()
corr_matrix["species_id"].sort_values(ascending = False)

species_id      1.000000
petal_width     0.956464
petal_length    0.949043
sepal_length    0.782561
sepal_width    -0.419446
Name: species_id, dtype: float64

## Feature engineering
### I take only the strong positive features and the strong negative feature. So i discard sepal_length because it's a weak positive correlation.

In [92]:
interestingFeatures = ['petal_length', 'petal_width','sepal_width']

In [56]:
px.scatter_3d(df, x= 'petal_length', y='petal_width', z= 'sepal_width',color = species)

# Split Train and Test

In [93]:
df.head(1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_id
0,5.1,3.5,1.4,0.2,1


In [94]:
df.iloc[:,:4].head(1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2


In [70]:
#X = df.iloc[:,:4]
#X.tail(1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
149,5.9,3.0,5.1,1.8


In [95]:
X =df.loc[:,interestingFeatures]
y = df["species_id"]

In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = df['species_id'], random_state=42, train_size = .80) 

# Scaling 

In [137]:
#MinMaxScaler is useful when the data has a bounded range or when the distribution is not Gaussian.
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train).copy()
X_test = scaler.fit_transform(X_test).copy()

# Naive Bayes

<img src= "https://miro.medium.com/v2/resize:fit:1400/1*LWbqBJ_LsZNbBEiF8DN_eQ.png" style="width:100%">

# Bernoulli Naive Bayes
### Suitable for **binary** feature data, where features are either present or absent.

In [138]:
from sklearn.naive_bayes import BernoulliNB

clf = BernoulliNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [140]:
print("-------------------------------------------------------------------------")
print(f"The accuracy score is:   {accuracy_score(y_test,y_pred)}")
print("-------------------------------------------------------------------------")
print(f"The Confusion Matrix is:  \n{confusion_matrix(y_test,y_pred)}")
print("-------------------------------------------------------------------------")
print(f"The Classification Report is: \n {classification_report(y_test,y_pred)}")

-------------------------------------------------------------------------
The accuracy score is:   0.6666666666666666
-------------------------------------------------------------------------
The Confusion Matrix is:  
[[ 9  0  1]
 [ 0  1  9]
 [ 0  0 10]]
-------------------------------------------------------------------------
The Classification Report is: 
               precision    recall  f1-score   support

           1       1.00      0.90      0.95        10
           2       1.00      0.10      0.18        10
           3       0.50      1.00      0.67        10

    accuracy                           0.67        30
   macro avg       0.83      0.67      0.60        30
weighted avg       0.83      0.67      0.60        30



# Multinomial Naive Bayes
### Used for **discrete** data, often applied to document classification where the features are word frequencies.

In [141]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [142]:
print("-------------------------------------------------------------------------")
print(f"The accuracy score is:   {accuracy_score(y_test,y_pred)}")
print("-------------------------------------------------------------------------")
print(f"The Confusion Matrix is:  \n{confusion_matrix(y_test,y_pred)}")
print("-------------------------------------------------------------------------")
print(f"The Classification Report is: \n {classification_report(y_test,y_pred)}")

-------------------------------------------------------------------------
The accuracy score is:   0.8666666666666667
-------------------------------------------------------------------------
The Confusion Matrix is:  
[[10  0  0]
 [ 0  7  3]
 [ 0  1  9]]
-------------------------------------------------------------------------
The Classification Report is: 
               precision    recall  f1-score   support

           1       1.00      1.00      1.00        10
           2       0.88      0.70      0.78        10
           3       0.75      0.90      0.82        10

    accuracy                           0.87        30
   macro avg       0.88      0.87      0.87        30
weighted avg       0.88      0.87      0.87        30



# Gaussian Naive bayes
### Assumes that the **continuous** values associated with each class are distributed according to a Gaussian distribution.

In [160]:
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [144]:
print("-------------------------------------------------------------------------")
print(f"The accuracy score is:   {accuracy_score(y_test,y_pred)}")
print("-------------------------------------------------------------------------")
print(f"The Confusion Matrix is:  \n{confusion_matrix(y_test,y_pred)}")
print("-------------------------------------------------------------------------")
print(f"The Classification Report is: \n {classification_report(y_test,y_pred)}")

-------------------------------------------------------------------------
The accuracy score is:   0.9333333333333333
-------------------------------------------------------------------------
The Confusion Matrix is:  
[[10  0  0]
 [ 0  8  2]
 [ 0  0 10]]
-------------------------------------------------------------------------
The Classification Report is: 
               precision    recall  f1-score   support

           1       1.00      1.00      1.00        10
           2       1.00      0.80      0.89        10
           3       0.83      1.00      0.91        10

    accuracy                           0.93        30
   macro avg       0.94      0.93      0.93        30
weighted avg       0.94      0.93      0.93        30



   NB           |  accuracy All features| accuracy interesting Features|
----------------|-----------------------|------------------------------| 
Bernoulli NB    |       63%             |                63%           |
Multinomial NB  |       82%             |                79%           |
Gaussian NB     |       87%             |                92%           |