# import required libraries

In [1]:
import os
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import VarianceThreshold
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import time

# Load data:

In [4]:
df= pd.read_csv('emails.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Body,Label
0,0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,3,##############################################...,1
4,4,I thought you might like these:\n1) Slim Down ...,1


# Preprocessing:

In [5]:
df.shape

(6046, 3)

In [6]:
df = df[df.columns[1:]]
df.shape

(6046, 2)

In [7]:
df.drop_duplicates(inplace = True)
df.shape

(5294, 2)

In [8]:
df.isnull().sum()

Body     1
Label    0
dtype: int64

In [9]:
df=df.dropna()
df.isnull().sum()

Body     0
Label    0
dtype: int64

In [10]:
print(df.columns)

Index(['Body', 'Label'], dtype='object')


In [11]:
#nltk.download('stopwords')
#nltk.download('punkt')

In [12]:
def process_text(Body):
    nopunc = []
    for char in Body:
        if char not in string.punctuation:
            nopunc.append(char)
    nopunc= ''.join(nopunc)
    tokens = word_tokenize(nopunc.lower())
    clean_word = []
    for word in tokens:
        if word.lower() not in stopwords.words('english'):
            clean_word.append(word)

    return clean_word

In [13]:
df.Body.head().apply(process_text)

0    [save, 70, life, insurance, spend, tolife, quo...
1    [1, fight, risk, cancer, httpwwwadclickwspcfmo...
2    [1, fight, risk, cancer, httpwwwadclickwspcfmo...
3    [adult, club, offers, free, membership, instan...
4    [thought, might, like, 1, slim, guaranteed, lo...
Name: Body, dtype: object

# Convert Text to Numerics

In [14]:
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(df.Body)
y = df.Label
x.shape

(5293, 70237)

In [15]:
print(x.toarray()[:1])

[[0 0 0 ... 0 0 0]]


# split data set:

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.30, random_state=0)

In [17]:
subsample_size = 7000
n_features_selected = 700

# Select a random subset of features
subsample_features = np.random.choice(range(x_train.shape[1]), size=subsample_size, replace=True)

x_train_subsampled = x_train[:, subsample_features]
x_test_subsampled = x_test[:, subsample_features]

# count spam/non_spam values

In [18]:
spam_count = 0
non_spam_count = 0
no_label = 0
for label in y_train:
    if label == 0:
        spam_count += 1
    elif label == 1:
        non_spam_count += 1
    else:
        no_label += 1
    
print("spam = ",spam_count)
print("non_spam = ",non_spam_count)
print("no_label = ",no_label)

spam =  2728
non_spam =  977
no_label =  0


# Apply BFFS to select the best features


start_time=time.time()
selector = SelectKBest(f_classif, k=n_features_selected)  
selector.fit(x_train_subsampled, y_train)

x_train_selected = selector.transform(x_train_subsampled)
x_test_selected = selector.transform(x_test_subsampled)
print("done")
end_time=time.time()
exec_time=end_time-start_time
print("exec time =",exec_time)

# Train and Test using 3 different classifiers:

In [20]:
# Train and test a NB classifier on the selected features
clf = MultinomialNB()
start_time = time.time()
clf.fit(x_train_selected, y_train)
pred = clf.predict(x_test_selected)
accuracy1 = accuracy_score(y_test, pred)

print("accuracy(in %): ", accuracy1*100)
print()
# create confusion matrix
conf_matrix1 = confusion_matrix(y_test, pred)
end_time = time.time()
exec_time = end_time - start_time
print("execution time: ",exec_time)

accuracy(in %):  91.12090680100756

execution time:  0.007998466491699219


In [21]:
# train a Bernoulli Naive Bayes classifier on selected features
clf = BernoulliNB()
start_time = time.time()
clf.fit(x_train_selected, y_train)

# predict the class labels for the test set
pred = clf.predict(x_test_selected)

# evaluate the performance of the classifier using accuracy
accuracy2 = accuracy_score(y_test, pred)
print("Accuracy:", accuracy2*100)
print()
# create confusion matrix
conf_matrix2 = confusion_matrix(y_test, pred)

end_time = time.time()
exec_time = end_time - start_time
print("execution time: ",exec_time)

Accuracy: 90.99496221662469

execution time:  0.007997751235961914


In [22]:
# train the J48 classifier on selected features
clf = DecisionTreeClassifier(random_state=42)
start_time = time.time()
clf.fit(x_train_selected, y_train)

# use the classifier to make predictions on the testing set
pred = clf.predict(x_test_selected)

# evaluate the accuracy of the classifier
accuracy3 = accuracy_score(y_test, pred)
print(f"Accuracy: ",accuracy3*100)
print()
# create confusion matrix
conf_matrix3 = confusion_matrix(y_test, pred)

end_time = time.time()
exec_time = end_time - start_time
print("execution time: ",exec_time)

Accuracy:  88.16120906801007

execution time:  0.18604493141174316


# Accuracy and confusion matrix comparision:

In [23]:
print("accuracy with MultinomialNB: ",accuracy1*100)
print("accuracy with BernoulliNB:   ", accuracy2*100)
print("accuracy with J48:           ", accuracy3*100)

accuracy with MultinomialNB:  91.12090680100756
accuracy with BernoulliNB:    90.99496221662469
accuracy with J48:            88.16120906801007


In [24]:
print("confusion matrix of MultinomialNB:\n",conf_matrix1)
print("confusion matrix of BernoulliNB:\n",conf_matrix2)
print("confusion matrix of J48:\n",conf_matrix3)

confusion matrix of MultinomialNB:
 [[1144   43]
 [  98  303]]
confusion matrix of BernoulliNB:
 [[1165   22]
 [ 121  280]]
confusion matrix of J48:
 [[1087  100]
 [  88  313]]
