In [1]:
import numpy as np 
import pandas as pd 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
from nltk.corpus import wordnet
import string

In [2]:
df = pd.read_csv("emails.csv")
df = df.iloc[:, :2]
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
df.shape

(5727, 2)

In [4]:
df['spam'].unique()

array([1, 0], dtype=int64)

In [5]:
df.drop_duplicates(inplace = True)
df.shape

(5694, 2)

In [6]:
df.isnull().sum()

text    0
spam    0
dtype: int64

In [7]:

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    #nltk.download('averaged_perceptron_tagger')
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

#Tokenization (a list of tokens), will be used as the analyzer
#1.Punctuations are [!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]
#2.Stop words in natural language processing, are useless words (data).
def process_text(text):
    
    #1 Remove Punctuationa
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    
    #2 Remove Stop Words
    words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    #3 Lemmatize
    lemmatizer = WordNetLemmatizer()
    clean_words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]
    #nlp = spacy.load('en_core_web_sm' , disable=['parser', 'ner'])
    #doc = nlp(nopunc)
    #clean_words = " ".join([token.lemma_ for token in doc])
    
    #4 Return a list of clean words
    return clean_words

In [8]:
#Show the Tokenization (a list of tokens )
#nltk.download('stopwords')
#nltk.download('wordnet')
df['text'].head().apply(process_text)
#process_text(df['text'][0])

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, home, make, easy,...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cd, software, ...
Name: text, dtype: object

In [10]:
# Feature Extraction Bag Of Words
from sklearn.feature_extraction.text import CountVectorizer
messages_bow = CountVectorizer(analyzer=process_text).fit_transform(df['text'])
#messages_bow

In [11]:
#Split data into 80% training & 20% testing data sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(messages_bow, df['spam'], test_size = 0.20, random_state = 0)

In [12]:
#Get the shape of messages_bow
messages_bow.shape

(5694, 32335)

In [13]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
#Evaluate the model on the training data set
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
pred = classifier.predict(X_train)
print(classification_report(y_train ,pred ))
print('Confusion Matrix: \n',confusion_matrix(y_train,pred))

print('\nAccuracy: ', accuracy_score(y_train,pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3452
           1       0.99      1.00      0.99      1103

   micro avg       1.00      1.00      1.00      4555
   macro avg       0.99      1.00      0.99      4555
weighted avg       1.00      1.00      1.00      4555

Confusion Matrix: 
 [[3436   16]
 [   3 1100]]

Accuracy:  0.9958287596048299


In [15]:
#Evaluate the model on the test data set
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
pred = classifier.predict(X_test)
print(classification_report(y_test ,pred ))
print('Confusion Matrix: \n', confusion_matrix(y_test,pred))
print()
print('Accuracy: ', accuracy_score(y_test,pred))


              precision    recall  f1-score   support

           0       1.00      0.99      0.99       874
           1       0.97      0.99      0.98       265

   micro avg       0.99      0.99      0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139

Confusion Matrix: 
 [[866   8]
 [  2 263]]

Accuracy:  0.9912203687445127
