# Simple Naive Bayes Spam Classifier

##### Spam email classifier based on the Naive Bayes algorithm (Bayes' theorem with a naive feature-independence assumption), using NLTK and scikit-learn

Data Source: Kaggle (https://www.kaggle.com/balakishan77/spam-or-ham-email-classification/data)

In [1]:
# import and check data

import pandas as pd
import glob
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import string


# read the labelled emails into a DataFrame and preview the first five rows
df = pd.read_csv("emails.csv")
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [2]:
# clean data: drop exact duplicate rows, then count missing values per column

df = df.drop_duplicates()
df.isna().sum()

text    0
spam    0
dtype: int64

In [3]:
# check cleaning: (rows, columns) left after the duplicate removal above

df.shape

(5695, 2)

In [4]:
# define function to remove stopwords from data

# download the NLTK stop-word lists that stopwords.words('english') reads
nltk.download('stopwords')

# Translation table that deletes every character found in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache so the NLTK stop-word list is read only once per kernel.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use only."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def preprocessor(data, stop_words=None):
    """
    Remove punctuation and stop words from a text and return the remaining words.

    Steps: delete every character listed in ``string.punctuation``, split the
    result on whitespace, then drop each word that appears in ``stop_words``.
    Matching is exact and case-sensitive; no lower-casing is applied.

    Parameters
    ----------
    data : str
        The raw text to clean.
    stop_words : collection of str, optional
        Words to discard (anything supporting ``in``; a set is fastest).
        Defaults to NLTK's English stop-word list, which is loaded once and
        cached instead of being re-read for every single word.

    Returns
    -------
    list of str
        The surviving words, in their original order.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    without_punctuation = data.translate(_PUNCTUATION_TABLE)
    return [word for word in without_punctuation.split() if word not in stop_words]


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vihangbodh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# convert text to count tokens

# Keep the fitted vectorizer: its vocabulary is needed to map any new email onto
# the same feature columns the model is trained on (use vectorizer.transform).
# NOTE(review): the vocabulary is fitted on the full dataset, before the
# train/test split in the next cell, so words that only occur in test emails
# still shape the feature space; the reported scores may be slightly optimistic.
vectorizer = CountVectorizer(analyzer=preprocessor)
counts = vectorizer.fit_transform(df['text'])
counts.shape

(5695, 37229)

In [6]:
# hold out 25% of the rows for testing (75% stay for training);
# random_state is fixed so the split is the same on every run

X_train, X_test, y_train, y_test = train_test_split(
    counts,
    df['spam'],
    test_size=0.25,
    random_state=0,
)

In [7]:
# create a multinomial Naive Bayes classifier and fit it on the training split

model = MultinomialNB()
model.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [8]:
# sanity check: score the model on the same data it was trained on
# (the evaluation on the held-out test split happens in a later cell)

fake_test = model.predict(X_train)

train_report = classification_report(y_train, fake_test)
train_confusion = confusion_matrix(y_train, fake_test)
train_accuracy = accuracy_score(y_train, fake_test)

print(train_report)
print("Confusion Matrix:\n", train_confusion)
print("Accuracy:", train_accuracy)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3243
           1       0.99      1.00      0.99      1028

    accuracy                           1.00      4271
   macro avg       0.99      1.00      1.00      4271
weighted avg       1.00      1.00      1.00      4271

Confusion Matrix:
 [[3232   11]
 [   1 1027]]
Accuracy: 0.9971903535471787


In [9]:
# predict labels for the held-out test split (rows the model was not fitted on)

real_test = model.predict(X_test)

In [10]:
# final result: compare the test-split predictions with the true labels

test_report = classification_report(y_test, real_test)
test_confusion = confusion_matrix(y_test, real_test)
test_accuracy = accuracy_score(y_test, real_test)

print(test_report)
print("Confusion Matrix:\n", test_confusion)
print("Accuracy:", test_accuracy)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1084
           1       0.97      0.99      0.98       340

    accuracy                           0.99      1424
   macro avg       0.98      0.99      0.99      1424
weighted avg       0.99      0.99      0.99      1424

Confusion Matrix:
 [[1073   11]
 [   3  337]]
Accuracy: 0.9901685393258427


In [11]:
# save the model

import pickle

# The context manager closes (and flushes) the file even if dump() raises,
# instead of leaving an open handle behind.
# NOTE: only the classifier is written here; the CountVectorizer vocabulary used
# to build `counts` is not part of this file, so the saved model alone cannot
# turn new raw text into features.
with open("final_model.sav", 'wb') as model_file:
    pickle.dump(model, model_file)

In [12]:
# uncomment the lines below to reload the saved model and print its accuracy on the test split
# (only unpickle files you created yourself: pickle.load can execute arbitrary code)

#with open("final_model.sav", 'rb') as model_file:
#    model_new = pickle.load(model_file)
#print(model_new.score(X_test, y_test))