In [1]:
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import stopwords

In [3]:
# Load the e-mail corpus from the CSV file in the working directory.
raw_data = pd.read_csv('emails.csv')
# Work on a copy so the frame exactly as loaded stays available in `raw_data`.
data = raw_data.copy()
# Preview the first rows.
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [4]:
# (rows, columns) of the frame as loaded.
data.shape

(5728, 2)

In [5]:
# Number of missing values in each column.
data.isnull().sum()

text    0
spam    0
dtype: int64

In [6]:
# Remove duplicate rows by rebinding `data` to the de-duplicated frame,
# then show the new (rows, columns).
data = data.drop_duplicates()
data.shape

(5695, 2)

### There were 33 duplicate rows; they have now been removed

In [7]:
# Fetch NLTK's stop-word corpus; text_process reads it through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/msamaritan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
# Cache of stop-word sets keyed by language, filled on first use so the NLTK
# word list is loaded once instead of once per token.
_STOPWORDS_BY_LANGUAGE = {}


def text_process(text, stop_words=None):
    """Tokenise a message: strip punctuation, then drop stop words.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : iterable of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used (loaded on first call and cached).

    Returns
    -------
    list of str
        The remaining whitespace-separated tokens, in their original order
        and original letter case.
    """
    if stop_words is None:
        # Previously stopwords.words('english') was evaluated for every single
        # token and searched as a list; a cached frozenset gives the same
        # membership answers with one load and constant-time lookups.
        if 'english' not in _STOPWORDS_BY_LANGUAGE:
            _STOPWORDS_BY_LANGUAGE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORDS_BY_LANGUAGE['english']
    else:
        stop_words = frozenset(stop_words)

    # Delete every character listed in string.punctuation in a single pass.
    no_punc = text.translate(str.maketrans('', '', string.punctuation))

    # Compare case-insensitively, but return tokens with their original case.
    return [word for word in no_punc.split() if word.lower() not in stop_words]

In [11]:
# Preview only: run the preprocessing on the first rows to inspect the tokens.

data['text'].head().apply(text_process)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Keep the fitted vectorizer rather than discarding it: its learned vocabulary
# is needed to map any new e-mail onto the same feature columns
# (vectorizer.transform(...)) before calling a trained model.
vectorizer = CountVectorizer(analyzer=text_process)
# Token-count matrix with one row per e-mail in `data`.
final_text = vectorizer.fit_transform(data['text'])
# NOTE(review): the vocabulary is fitted on every row, before the train/test
# split made in a later cell, so held-out e-mails contribute to the vocabulary.
# Consider splitting the raw text first and fitting on the training part only.

In [15]:
from sklearn.model_selection import train_test_split as tts

# Hold out 20% of the rows for testing; a_* are the count features, z_* the spam labels.
# The fixed random_state keeps the split the same on every run.
a_train, a_test, z_train, z_test = tts(final_text, data['spam'], test_size=0.2, random_state=143)

In [16]:
# One line per split: feature-matrix shape followed by label-vector shape.
print(*(part.shape for part in (a_train, z_train)))
print(*(part.shape for part in (a_test, z_test)))

(4556, 37229) (4556,)
(1139, 37229) (1139,)


In [17]:
# Shape of the full count matrix before splitting.
final_text.shape

(5695, 37229)

In [18]:
from sklearn.naive_bayes import MultinomialNB

In [19]:
# Fit a multinomial Naive Bayes model (default settings) on the training split.
classifier = MultinomialNB().fit(a_train, z_train)

In [20]:
# Predictions for the training rows themselves, shown for an eyeball comparison with the true labels.
classifier.predict(a_train)

array([0, 0, 0, ..., 0, 0, 1])

In [22]:
# True labels of the training rows, as an array.
z_train.values

array([0, 0, 0, ..., 0, 0, 1])

In [23]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [25]:
# Score the classifier on the rows it was trained on (not on held-out data).
predictions = classifier.predict(a_train)
print(
    "Classification Report \n\n",
    classification_report(y_true=z_train, y_pred=predictions),
)

Classification Report 

               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3462
           1       0.99      1.00      0.99      1094

    accuracy                           1.00      4556
   macro avg       1.00      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556



In [26]:
# Confusion matrix of the training-set predictions against the training labels.
confusion_matrix(z_train, predictions)

array([[3455,    7],
       [   4, 1090]])

In [27]:
# Accuracy on the training split (data the model has already seen).
accuracy_score(z_train, predictions)

0.997585601404741

## Our model is $99.76$ % accurate on the training data

In [28]:
# Predictions for the held-out test split.
pred = classifier.predict(a_test)

In [29]:
# Confusion matrix of the test-set predictions against the test labels.
confusion_matrix(z_test, pred)

array([[854,  11],
       [  2, 272]])

In [30]:
# Accuracy on the held-out test split.
accuracy_score(z_test, pred)

0.9885864793678666

## Our model achieves $98.9$ % accuracy on the unseen test data