# Spam eMail Detection with Naive Bayes Classifiers

In [1]:
# import library
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df = pd.read_csv("emails.csv")

In [3]:
df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [4]:
df['spam'].value_counts()

0    4360
1    1368
Name: spam, dtype: int64

In [5]:
df.drop_duplicates(inplace = True)

In [6]:
df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [7]:
df.isnull().sum()

text    0
spam    0
dtype: int64

# Separated x , y

In [8]:
x = df.text.values

In [9]:
y = df.spam.values

# Split Dataset

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size = .20)

# Data Preprocessing 

In [12]:
from sklearn.feature_extraction.text import CountVectorizer


In [13]:
cv = CountVectorizer()
x_train = cv.fit_transform(xtrain)

In [14]:
x_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# ML Algorithm

In [15]:
from sklearn.naive_bayes import MultinomialNB

In [16]:
model = MultinomialNB()
model.fit(x_train,ytrain)

MultinomialNB()

In [17]:
x_test = cv.transform(xtest)

In [18]:
x_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# Testing data

In [19]:
model.score(x_test,ytest)

0.9894644424934153

In [20]:
test_emails = ['Hey i am junjun.i want to talk you about the machine learning..!','Hey, you get an iPhone from our company. please give me your address.' ]

In [21]:
cv_emails = cv.transform(test_emails)

In [22]:
model.predict(cv_emails)

array([0, 1], dtype=int64)

# Performance (Accuracy,Precision,Recall,F1 Score)

In [23]:
pred = model.predict(x_test)

In [24]:
pred

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [25]:
from sklearn.metrics import accuracy_score

In [27]:
accuracy_score(ytest,pred)

0.9894644424934153

In [28]:
from sklearn.metrics import confusion_matrix

In [29]:
confusion_matrix(ytest,pred)

array([[873,   8],
       [  4, 254]], dtype=int64)

In [30]:
from sklearn.metrics import classification_report

In [32]:
print(classification_report(ytest,pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       881
           1       0.97      0.98      0.98       258

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139

