# Spam detector

# This program detects whether an email is spam or not

In [10]:
# Import libraries
import numpy as np
import pandas as pd 
import nltk 
from nltk.corpus import stopwords
import string

In [11]:
# Read the labelled messages from CSV; the file's first column becomes the row index
emails_v = pd.read_csv(
    'emails/emails_v.csv',
    index_col=0,
)

In [12]:
# Display the DataFrame to inspect the label column and the raw message text
emails_v

Unnamed: 0,spam_values,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will �_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [37]:
# Show the shape of the dataset as (rows, columns)
emails_v.shape

(5572, 2)

In [38]:
# List the column names
emails_v.columns

Index(['spam_values', 'text'], dtype='object')

In [39]:
# Remove duplicate rows. The de-duplicated frame is rebound to the same name
# instead of using inplace=True, so the existing frame is never mutated in place.
emails_v = emails_v.drop_duplicates()

In [40]:
# Show the new shape (number of rows and columns) after removing duplicates
emails_v.shape

(5169, 2)

In [45]:
# Count the missing values in each column
emails_v.isna().sum()

spam_values    0
text           0
dtype: int64

In [4]:
# Download the NLTK stopwords corpus, which process_text reads via stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Anderson
[nltk_data]     Lomba\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [8]:
# Function that is going to process the text
# Stopword sets already loaded from NLTK, keyed by language name (filled lazily).
_STOPWORDS_BY_LANGUAGE = {}


def _load_stopwords(language='english'):
    """Return NLTK's stopword list for `language` as a frozenset, reading it only once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def process_text(text, stop_words=None):
    """Tokenize a message into a list of cleaned words.

    Steps:
        1. remove every character found in ``string.punctuation``
        2. split on whitespace and drop words whose lowercase form is a stopword
        3. return the remaining words (original casing is kept)

    Parameters
    ----------
    text : str
        The raw message.
    stop_words : collection of str, optional
        Words to drop, compared against the lowercased token. Defaults to
        NLTK's English stopword list, which is loaded once and cached.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Resolve the stopword collection once per call. Evaluating
    # stopwords.words('english') inside the comprehension would rebuild the
    # list for every single word.
    if stop_words is None:
        stop_words = _load_stopwords('english')

    # 1
    nopunc = ''.join(char for char in text if char not in string.punctuation)

    # 2
    clean_words = [word for word in nopunc.split() if word.lower() not in stop_words]

    # 3
    return clean_words

In [9]:
# Show the tokenization of the first rows: each message becomes a list of tokens
# (punctuation and stopwords removed; no stemming or lemmatization is applied)
emails_v['text'].head().apply(process_text)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: text, dtype: object

In [14]:
# Example: token counts for two short messages

message4 = 'hello world hello hello world play'
message5 = 'test test test test one hello'
print(message4)
print()

# Convert the text to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer
# Each document is passed as a plain string. Wrapping every message in its own
# list would make process_text iterate over whole messages instead of characters,
# so punctuation would never be stripped.
bow4 = CountVectorizer(analyzer=process_text).fit_transform([message4, message5])
print(bow4)
print()
print(bow4.shape)

hello world hello hello world play

  (0, 0)	3
  (0, 4)	2
  (0, 2)	1
  (1, 0)	1
  (1, 3)	4
  (1, 1)	1

(2, 5)


In [15]:
# Convert the message texts to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): the vocabulary is learned from every message, before the train/test split
vectorizer = CountVectorizer(analyzer=process_text)
messages_bow = vectorizer.fit_transform(emails_v['text'])

In [16]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# random_state is fixed so the split is the same on every run
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    messages_bow,
    emails_v['spam_values'],
    test_size=0.20,
    random_state=0,
)

In [17]:
# Get the shape of messages_bow
# NOTE(review): the saved output shows 5572 rows, which is the row count from before
# drop_duplicates (5169 afterwards), so this cell appears to have been executed before
# the de-duplication step -- re-run the notebook top to bottom to confirm the numbers.
messages_bow.shape

(5572, 11301)

In [18]:
# Build a multinomial Naive Bayes model and fit it on the training split
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
classifier = naive_bayes.fit(X_train, y_train)  # fit() hands back the fitted estimator

In [20]:
# Predicted labels for the training rows
train_predictions = classifier.predict(X_train)
print(train_predictions)

# Actual labels for the same rows
train_labels = y_train.values
print(train_labels)

['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']
['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']


In [26]:
# Score the model on the data it was trained on (per-class report, confusion matrix, accuracy)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_train)
train_report = classification_report(y_train, pred)
train_confusion = confusion_matrix(y_train, pred)
train_accuracy = accuracy_score(y_train, pred)
print(train_report)
print()
print('Confusion Matrix: \n', train_confusion)
print()
print('Accuracy: ', train_accuracy)

              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3876
        spam       0.98      0.98      0.98       581

    accuracy                           0.99      4457
   macro avg       0.99      0.99      0.99      4457
weighted avg       0.99      0.99      0.99      4457


Confusion Matrix: 
 [[3864   12]
 [  12  569]]

Accuracy:  0.9946152120260264


In [27]:
# Predicted labels for the held-out test rows
test_predictions = classifier.predict(X_test)
print(test_predictions)

# Actual labels for the same rows
test_labels = y_test.values
print(test_labels)

['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']
['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']


In [28]:
# Score the model on the held-out test split (per-class report, confusion matrix, accuracy)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_test)
test_report = classification_report(y_test, pred)
test_confusion = confusion_matrix(y_test, pred)
test_accuracy = accuracy_score(y_test, pred)
print(test_report)
print()
print('Confusion Matrix: \n', test_confusion)
print()
print('Accuracy: ', test_accuracy)

              precision    recall  f1-score   support

         ham       0.99      0.96      0.98       949
        spam       0.82      0.93      0.87       166

    accuracy                           0.96      1115
   macro avg       0.90      0.95      0.92      1115
weighted avg       0.96      0.96      0.96      1115


Confusion Matrix: 
 [[915  34]
 [ 12 154]]

Accuracy:  0.9587443946188341
