# NLP Project: Spam eMail Detection with Naive Bayes Classifiers

In [41]:
import pandas as pd
import numpy as np
import seaborn as sns

In [42]:
df = pd.read_csv("emails.csv")
df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [43]:
df["spam"].value_counts()

spam
0    4360
1    1368
Name: count, dtype: int64

In [44]:
df.drop_duplicates(inplace=True)

In [45]:
df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [46]:
df["spam"].value_counts()

spam
0    4327
1    1368
Name: count, dtype: int64

In [47]:
df.isnull().sum()

text    0
spam    0
dtype: int64

# separate x and y

In [48]:
x = df["text"]
y = df["spam"]

In [49]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [50]:
x

0       Subject: naturally irresistible your corporate...
1       Subject: the stock trading gunslinger  fanny i...
2       Subject: unbelievable new homes made easy  im ...
3       Subject: 4 color printing special  request add...
4       Subject: do not have money , get software cds ...
                              ...                        
5723    Subject: re : research and development charges...
5724    Subject: re : receipts from visit  jim ,  than...
5725    Subject: re : enron case study update  wow ! a...
5726    Subject: re : interest  david ,  please , call...
5727    Subject: news : aurora 5 . 2 update  aurora ve...
Name: text, Length: 5695, dtype: object

In [51]:
y

0       1
1       1
2       1
3       1
4       1
       ..
5723    0
5724    0
5725    0
5726    0
5727    0
Name: spam, Length: 5695, dtype: int64

In [52]:
x_train

5642    Subject: re : summer intern : paulo oliveira  ...
406     Subject: cheap oem soft shipping worldwide  wh...
560     Subject: you don _ t know how to get into sear...
5241    Subject: risk 2000 panel discussion  dear all ...
1129    Subject: mail delivery failed : returning mess...
                              ...                        
2375    Subject: color copier information  kevin ,  pl...
2078    Subject: 3 - d seismic data and oil trading  a...
946     Subject: double coverage amount , same payment...
923     Subject: 376 : unique - logos !  your business...
4107    Subject: benefits - personal days  good mornin...
Name: text, Length: 4556, dtype: object

# Data Preprocesing

In [53]:
from sklearn.feature_extraction.text import CountVectorizer

In [54]:
cv = CountVectorizer()
x_train_cv = cv.fit_transform(x_train)

In [55]:
x_train_cv.toarray()

array([[5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(4556, 33443))

# ML algorithom

In [56]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()

In [57]:
model.fit(x_train_cv, y_train)

0,1,2
,"alpha  alpha: float or array-like of shape (n_features,), default=1.0 Additive (Laplace/Lidstone) smoothing parameter (set alpha=0 and force_alpha=True, for no smoothing).",1.0
,"force_alpha  force_alpha: bool, default=True If False and alpha is less than 1e-10, it will set alpha to 1e-10. If True, alpha will remain unchanged. This may cause numerical errors if alpha is too close to 0. .. versionadded:: 1.2 .. versionchanged:: 1.4  The default value of `force_alpha` changed to `True`.",True
,"fit_prior  fit_prior: bool, default=True Whether to learn class prior probabilities or not. If false, a uniform prior will be used.",True
,"class_prior  class_prior: array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.",


In [58]:
x_test_cv = cv.transform(x_test)

In [59]:
model.score(x_test_cv, y_test)

0.9938542581211589

In [60]:
emails = ["Congratulations! You've won a $1,000 Walmart gift card. Click here to claim your prize.",
          "Hey, are we still on for lunch tomorrow? Let me know what time works for you.",
          "Important notification about your bank account. Please verify your information immediately.",
          "Don't forget to submit your project report by the end of the day.",
          "Hey friend, long time no see! Let's catch up soon.",
          "You have been selected for a chance to get a new iPhone for just $1. Click now!"]
emails_cv = cv.transform(emails)

In [61]:
model.predict(emails_cv)

array([1, 0, 1, 1, 0, 1])