# Importing the libraries

In [1]:
import pandas as pd
import numpy as np

# For Gaussian Naïve Bayes
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# For Multinomial Naïve Bayes
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# For Bernoulli Naïve Bayes
from sklearn.naive_bayes import BernoulliNB

# For Complement Naïve Bayes
from sklearn.datasets import load_wine
from sklearn.metrics import classification_report
from sklearn.naive_bayes import ComplementNB

# For Categorical Naïve Bayes
from sklearn.datasets import load_digits
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import Binarizer

import warnings 
warnings.filterwarnings('ignore')

# Naïve Bayes Implementation
Naïve Bayes is a simple and efficient algorithm used for classification and prediction problems. It is based on the Bayes' theorem and the assumption of independence between features. Naïve Bayes works by calculating the probability of each class based on the features, and then selecting the class with the highest probability as the prediction.

# Gaussian Naïve Bayes
Gaussian Naïve Bayes is used when the features are continuous variables that can be modeled using a Gaussian distribution.

Iris dataset is a supervised dataset. It is a well-known example of a classification problem in machine learning.

In [2]:
iris = load_iris()
X = iris.data
y = iris.target

In [3]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [4]:
# Create a Pandas DataFrame for the data
data = pd.DataFrame(iris.data, columns=iris.feature_names)
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
# Create a Pandas DataFrame for the target
target_names = iris.target_names

df_y = pd.DataFrame(y, columns=['target'])
df_y['target_names'] = df_y['target'].map(lambda x: target_names[x])

In [6]:
# Concatenate the feature and target DataFrames
df = pd.concat([data, df_y], axis=1)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
# Create a Naïve Bayes classifier
gnb = GaussianNB()

In [9]:
gnb.fit(X_train,y_train)

In [10]:
y_pred = gnb.predict(X_test)

In [11]:
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy*100, '%')

Accuracy: 97.77777777777777 %


In [12]:
print(f"Classifier Report : \n\n {classification_report(y_test, y_pred)}")

Classifier Report : 

               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      0.92      0.96        13
           2       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.97        45
weighted avg       0.98      0.98      0.98        45



# Multinomial Naïve Bayes
Multinomial Naïve Bayes is used when the features are discrete variables that represent the frequency of occurrence of certain events.

In [13]:
# Load the 20 newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
X = newsgroups.data
y = newsgroups.target

In [14]:
newsgroups.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [15]:
data = pd.DataFrame({'text': X, 'target': y})
data.head()

Unnamed: 0,text,target
0,\n\nI am sure some bashers of Pens fans are pr...,10
1,My brother is in the market for a high-perform...,3
2,\n\n\n\n\tFinally you said what you dream abou...,17
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3
4,1) I have an old Jasmine drive which I cann...,4


In [16]:
# Convert the text data to a matrix of token counts
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
mnb = MultinomialNB()

In [19]:
mnb.fit(X_train, y_train)

In [20]:
y_pred = mnb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy*100, '%')

Accuracy: 56.7032189600283 %


In [21]:
print(f"Classifier Report : \n\n {classification_report(y_test, y_pred)}")

Classifier Report : 

               precision    recall  f1-score   support

           0       0.67      0.17      0.28       236
           1       0.42      0.75      0.54       287
           2       0.85      0.04      0.07       290
           3       0.52      0.73      0.61       285
           4       0.89      0.38      0.53       312
           5       0.73      0.68      0.71       308
           6       0.86      0.53      0.66       276
           7       0.88      0.56      0.69       304
           8       0.43      0.46      0.44       279
           9       0.98      0.63      0.77       308
          10       0.92      0.77      0.83       309
          11       0.48      0.77      0.59       290
          12       0.81      0.42      0.56       304
          13       0.83      0.70      0.76       300
          14       0.78      0.69      0.73       297
          15       0.29      0.94      0.44       292
          16       0.80      0.40      0.53       270
    

# Bernoulli Naïve Bayes
Bernoulli Naïve Bayes is used when the features are binary variables that represent the presence or absence of certain events.

In [22]:
# Load the 20 newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
X = newsgroups.data
y = newsgroups.target

In [23]:
data = pd.DataFrame({'text': X, 'target': y})
data.head()

Unnamed: 0,text,target
0,\n\nI am sure some bashers of Pens fans are pr...,10
1,My brother is in the market for a high-perform...,3
2,\n\n\n\n\tFinally you said what you dream abou...,17
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3
4,1) I have an old Jasmine drive which I cann...,4


In [24]:
# Convert the text data to a matrix of binary values
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(X)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [26]:
bnb = BernoulliNB()

In [27]:
bnb.fit(X_train, y_train)

In [28]:
y_pred = bnb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy*100, '%')

Accuracy: 42.28864520693315 %


In [29]:
print(f"Classifier Report : \n\n {classification_report(y_test, y_pred)}")

Classifier Report : 

               precision    recall  f1-score   support

           0       1.00      0.01      0.03       236
           1       0.61      0.30      0.41       287
           2       0.75      0.02      0.04       290
           3       0.34      0.78      0.47       285
           4       0.96      0.24      0.38       312
           5       0.95      0.48      0.63       308
           6       0.74      0.67      0.70       276
           7       0.78      0.43      0.55       304
           8       0.11      0.94      0.19       279
           9       0.84      0.67      0.75       308
          10       0.98      0.55      0.71       309
          11       0.71      0.51      0.59       290
          12       0.72      0.45      0.56       304
          13       0.93      0.38      0.54       300
          14       0.86      0.38      0.53       297
          15       0.40      0.59      0.48       292
          16       0.75      0.37      0.50       270
    

# Reasons
Why we are getting lower accuracy in Bernoulli and Multinomial Naive Bayes.
1. Data distribution.
2. Feature engineering.
3. Hyperparameter tuning.
4. Limited data.

# Complement Naive Bayes
Complement Naive Bayes is somewhat an adaptation of the standard Multinomial Naive Bayes algorithm. Multinomial Naive Bayes does not perform very well on imbalanced datasets. Imbalanced datasets are datasets where the number of examples of some class is higher than the number of examples belonging to other classes. This means that the distribution of examples is not uniform. This type of dataset can be difficult to work with as a model may easily overfit this data in favor of the class with more number of examples.

Complement Naive Bayes is particularly suited to work with imbalanced datasets. In complement Naive Bayes, instead of calculating the probability of an item belonging to a certain class, we calculate the probability of the item belonging to all the classes. This is the literal meaning of the word, complement and hence is called Complement Naive Bayes.

# When to use CNB?

When the dataset is imbalanced: If the dataset on which classification is to be done is imbalanced, Multinomial and Gaussian Naive Bayes may give a low accuracy. However, Complement Naive Bayes will perform quite well and will give relatively higher accuracy.

For text classification tasks: Complement Naive Bayes outperforms both Gaussian Naive Bayes and Multinomial Naive Bayes in text classification tasks.

In [31]:
wine = load_wine()
X = wine.data
y = wine.target

In [32]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [33]:
df_X = pd.DataFrame(wine.data, columns= wine.feature_names)
df_X.head(5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [34]:
# Create a Pandas DataFrame for the target
target_names = wine.target_names

df_y = pd.DataFrame(y, columns=['target'])
df_y['target_names'] = df_y['target'].map(lambda x: target_names[x])

In [35]:
# Concatenate the feature and target DataFrames
df = pd.concat([df_X, df_y], axis=1)
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target,target_names
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0,class_0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0,class_0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0,class_0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0,class_0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0,class_0


In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 42)

In [37]:
cnb = ComplementNB()
cnb.fit(X_train, y_train)

In [38]:
y_pred = cnb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy*100, '%')

Accuracy: 66.66666666666666 %


In [39]:
print(f"Classifier Report : \n\n {classification_report(y_test, y_pred)}")

Classifier Report : 

               precision    recall  f1-score   support

           0       0.64      1.00      0.78         9
           1       0.67      0.73      0.70        11
           2       1.00      0.14      0.25         7

    accuracy                           0.67        27
   macro avg       0.77      0.62      0.58        27
weighted avg       0.75      0.67      0.61        27



# Conclusion:
Now that you know what Complement Naive Bayes classifiers are and how they work, next time you come across an unbalanced dataset, you can try using Complement Naive Bayes.

# Categorical Naïve Bayes
Categorical Naive Bayes is a type of Naive Bayes algorithm that is used for classification problems. It is specifically designed for discrete and categorical data.

The algorithm assumes that the features are independent of each other, which means that the presence or absence of one feature does not affect the presence or absence of any other feature. It also assumes that each feature is equally important in predicting the class.

Categorical Naive Bayes is a simple yet effective algorithm for classification problems, especially when dealing with discrete and categorical data. However, it may not work well for continuous data, and its assumption of feature independence may not always hold true in real-world applications.

In [40]:
# Load the digits dataset
digits = load_digits()
X = digits.data
y = digits.target

In [41]:
digits.target_names

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [42]:
df_X = pd.DataFrame(digits.data, columns= digits.feature_names)
df_X.head(5)

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0


In [43]:
# Create a Pandas DataFrame for the target
target_names = digits.target_names

df_y = pd.DataFrame(y, columns=['target'])
df_y['target_names'] = df_y['target'].map(lambda x: target_names[x])

In [44]:
# Concatenate the feature and target DataFrames
df = pd.concat([df_X, df_y], axis=1)
df.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,target,target_names
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4,4


In [45]:
print(data.shape)

(18846, 2)


In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

If you still want to use the categorical naive Bayes algorithm, you can binarize the pixel values by setting a threshold and converting each pixel value to 0 or 1 based on whether it is above or below the threshold. This will convert the continuous pixel values into binary features, which can then be used with the categorical naive Bayes algorithm. You can use the Binarizer class from scikit-learn to achieve this

In [47]:
binarizer = Binarizer(threshold=0.5)
X_train_binary = binarizer.transform(X_train)
X_test_binary = binarizer.transform(X_test)

In [48]:
# Train a Categorical Naive Bayes classifier
categorical_nb = CategoricalNB()
categorical_nb.fit(X_train_binary, y_train)

In [49]:
print(len(categorical_nb.feature_log_prob_))

64


In [50]:
print('Shape of X_train:', X_train.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of X_test:', X_test.shape)
print('Shape of y_test:', y_test.shape)

Shape of X_train: (1257, 64)
Shape of y_train: (1257,)
Shape of X_test: (540, 64)
Shape of y_test: (540,)


In [51]:
y_pred = categorical_nb.predict(X_test_binary)

In [52]:
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy*100, '%')

Accuracy: 85.92592592592592 %


In [53]:
print(f"Classifier Report : \n\n {classification_report(y_test, y_pred)}")

Classifier Report : 

               precision    recall  f1-score   support

           0       0.98      0.96      0.97        53
           1       0.85      0.56      0.67        50
           2       0.78      0.89      0.83        47
           3       0.88      0.81      0.85        54
           4       0.89      0.97      0.93        60
           5       0.96      0.76      0.85        66
           6       0.95      0.98      0.96        53
           7       0.89      1.00      0.94        55
           8       0.71      0.86      0.78        43
           9       0.72      0.80      0.76        59

    accuracy                           0.86       540
   macro avg       0.86      0.86      0.85       540
weighted avg       0.87      0.86      0.86       540



# Decision Tree Implementation
A decision tree is a tree-based model used for both classification and regression tasks. It is a non-parametric model that recursively splits the data into subsets based on the values of the input features until a stopping criterion is met. At each split, the model chooses the feature that best separates the data based on a splitting criterion, which can be either entropy or the Gini impurity. The decision tree then repeats this process for each subset until all the subsets are pure (i.e., they contain only one class).

In [70]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [71]:
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3, random_state=50)

In [72]:
dtc = DecisionTreeClassifier(random_state=50, max_depth=2)

dtc.fit(X_train,y_train)

y_pred = dtc.predict(X_test)

In [73]:
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy*100, '%')

Accuracy: 95.55555555555556 %


In [67]:
print(f"Classifier Report : \n\n {classification_report(y_test, y_pred)}")

Classifier Report : 

               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.94      0.94      0.94        17
           2       0.93      0.93      0.93        14

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45

