# Variety of Naive Bayes algorithms and their application

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn import naive_bayes as nb
from sklearn.model_selection import train_test_split
from sklearn import metrics

## Dataset from https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

In [2]:
sms_data = pd.read_csv('SMSSpamCollection', header=None, sep='\t', names=['Label', 'SMS'])

In [3]:
sms_data.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
sms_data.shape

(5572, 2)

In [5]:
sms_data.groupby('Label').count()

Unnamed: 0_level_0,SMS
Label,Unnamed: 1_level_1
ham,4825
spam,747


## Prepare data

In [6]:
sms_data_clean = sms_data.copy()

In [7]:
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.replace('\W+', ' ').str.replace('\s+', ' ').str.strip()

In [8]:
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.lower()

In [9]:
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.split()

In [10]:
sms_data_clean['SMS'].head()

0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, 2, a, wkly, comp, to, win, f...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, don, t, think, he, goes, to, usf, he,...
Name: SMS, dtype: object

In [11]:
sms_data_clean['Label'].value_counts() / sms_data.shape[0] * 100

ham     86.593683
spam    13.406317
Name: Label, dtype: float64

In [12]:
le = preprocessing.LabelEncoder()

In [13]:
sms_data_clean['Label'] = le.fit_transform(sms_data_clean['Label'])

In [14]:
sms_data_clean['Label'].head()

0    0
1    0
2    1
3    0
4    0
Name: Label, dtype: int32

## Train/test split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(sms_data_clean['SMS'], sms_data_clean['Label'], test_size=0.2, random_state=42)

## Vocabulary for training

In [16]:
vocabulary = list(set(X_train.sum()))

In [17]:
len(vocabulary)

7741

## Prepare data for training - calculate count of every word from the dictionary

In [18]:
X_train_voc = pd.DataFrame([
    [row.count(word) for word in vocabulary]
    for row in X_train], columns=vocabulary)

In [19]:
X_train_voc.shape

(4457, 7741)

In [20]:
X_train_voc.head()

Unnamed: 0,box385,ga,womdarfull,pei,neighbors,store,yourjob,forced,é,complacent,...,buy,urgnt,frndship,assumed,rcd,worst,warm,linear,online,word
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
X_test_voc = pd.DataFrame([
    [row.count(word) for word in vocabulary]
    for row in X_test], columns=vocabulary)

In [22]:
X_test_voc.shape

(1115, 7741)

## 1. Gaussian Naive Bayes 

In [23]:
classif_gauss = nb.GaussianNB()

In [24]:
classif_gauss.fit(X_train_voc, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [25]:
res_gauss = classif_gauss.predict(X_test_voc)

In [26]:
print( "Accuracy: {:.2f}%".format(metrics.accuracy_score(y_test, res_gauss) * 100) )

Accuracy: 91.03%


## 2. Multinomial Naive Bayes

In [27]:
classif_multinom = nb.MultinomialNB()

In [28]:
classif_multinom.fit(X_train_voc, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
res_multinom = classif_multinom.predict(X_test_voc)

In [30]:
print( "Accuracy: {:.2f}%".format(metrics.accuracy_score(y_test, res_multinom) * 100) )

Accuracy: 99.19%


## 3. Complement Naive Bayes

In [31]:
classif_complement = nb.ComplementNB()

In [32]:
classif_complement.fit(X_train_voc, y_train)

ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)

In [33]:
res_complement = classif_complement.predict(X_test_voc)

In [34]:
print( "Accuracy: {:.2f}%".format(metrics.accuracy_score(y_test, res_complement) * 100) )

Accuracy: 98.12%


## 4. Bernoulli Naive Bayes

In [35]:
classif_bernoulli = nb.BernoulliNB()

In [36]:
classif_bernoulli.fit(X_train_voc, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [37]:
res_bernoulli = classif_bernoulli.predict(X_test_voc)

In [38]:
print( "Accuracy: {:.2f}%".format(metrics.accuracy_score(y_test, res_bernoulli) * 100) )

Accuracy: 98.30%


## 5. Categorical Naive Bayes

In [39]:
classif_cat = nb.CategoricalNB()

In [40]:
X_train_voc_cat = X_train_voc.applymap(lambda el: 1 if el > 0 else 0)

In [41]:
classif_cat.fit(X_train_voc_cat, y_train)

CategoricalNB(alpha=1.0, class_prior=None, fit_prior=True)

In [42]:
X_test_voc_cat = X_test_voc.applymap(lambda el: 1 if el > 0 else 0)

In [43]:
res_cat = classif_cat.predict(X_test_voc_cat)

In [44]:
print( "Accuracy: {:.2f}%".format(metrics.accuracy_score(y_test, res_cat) * 100) )

Accuracy: 98.30%
