In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [2]:
from google.colab import files
uploaded = files.upload()

Saving email_spam.csv to email_spam.csv


In [3]:
df = pd.read_csv('email_spam.csv')

df.head(5)

Unnamed: 0,title,text,type
0,?? the secrets to SUCCESS,"Hi James,\n\nHave you claim your complimentary...",spam
1,?? You Earned 500 GCLoot Points,"\nalt_text\nCongratulations, you just earned\n...",not spam
2,?? Your GitHub launch code,"Here's your GitHub launch code, @Mortyj420!\n ...",not spam
3,[The Virtual Reward Center] Re: ** Clarifications,"Hello,\n \nThank you for contacting the Virtua...",not spam
4,"10-1 MLB Expert Inside, Plus Everything You Ne...","Hey Prachanda Rawal,\n\nToday's newsletter is ...",spam


In [5]:
df.shape

(84, 3)

In [6]:
df.drop_duplicates(inplace=True) #Removing the missing Values

In [7]:
df.shape

(83, 3)

In [8]:
df.isnull().sum()

title    0
text     0
type     0
dtype: int64

In [9]:
#Download stopwords from NLTK
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
def process_text(text):

  #Remove Punctuation
  nopunc =[char for char in text if char not in string.punctuation]
  nopunc = ''.join(nopunc)

  #Remove Stopwords
  clean_word = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

  return clean_word

In [14]:
df['text'].head().apply(process_text)  #Shows the Tokenization

0    [Hi, James, claim, complimentary, gift, yet, I...
1    [alttext, Congratulations, earned, 500, comple...
2    [Heres, GitHub, launch, code, Mortyj420, octoc...
3    [Hello, Thank, contacting, Virtual, Reward, Ce...
4    [Hey, Prachanda, Rawal, Todays, newsletter, Ja...
Name: text, dtype: object

In [16]:
# Convert a collection of text into matrix of tokens

from sklearn.feature_extraction.text import CountVectorizer  # For converting text to matrix of token counts

message_bow = CountVectorizer(analyzer = process_text).fit_transform(df['text'])


In [18]:
#Split data into 80% Training and 20% Testing
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(message_bow , df['type'] , test_size = 0.20 , random_state=0)

In [21]:
#Get shape of Message_bow ( Bag of words)
message_bow.shape

(83, 3201)

In [22]:
#Create and train naive bayes classifier

from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB().fit(X_train,y_train)


In [23]:
#Print the predictions

print(classifier.predict(X_train))

['not spam' 'not spam' 'spam' 'not spam' 'spam' 'spam' 'spam' 'spam'
 'not spam' 'not spam' 'spam' 'not spam' 'not spam' 'not spam' 'not spam'
 'spam' 'not spam' 'not spam' 'spam' 'not spam' 'not spam' 'not spam'
 'not spam' 'not spam' 'not spam' 'spam' 'not spam' 'not spam' 'not spam'
 'not spam' 'not spam' 'not spam' 'not spam' 'not spam' 'not spam'
 'not spam' 'not spam' 'not spam' 'spam' 'spam' 'spam' 'not spam'
 'not spam' 'spam' 'not spam' 'not spam' 'spam' 'not spam' 'not spam'
 'not spam' 'not spam' 'spam' 'not spam' 'spam' 'not spam' 'not spam'
 'not spam' 'not spam' 'not spam' 'spam' 'not spam' 'not spam' 'not spam'
 'not spam' 'not spam' 'not spam']


In [24]:
#Print actual values just for cross-checking
print(y_train.values)

['not spam' 'not spam' 'spam' 'not spam' 'spam' 'spam' 'spam' 'spam'
 'not spam' 'not spam' 'spam' 'not spam' 'not spam' 'not spam' 'not spam'
 'spam' 'not spam' 'not spam' 'spam' 'not spam' 'not spam' 'not spam'
 'spam' 'not spam' 'not spam' 'spam' 'not spam' 'not spam' 'not spam'
 'not spam' 'not spam' 'not spam' 'not spam' 'not spam' 'not spam'
 'not spam' 'not spam' 'not spam' 'spam' 'spam' 'spam' 'not spam'
 'not spam' 'spam' 'not spam' 'not spam' 'spam' 'not spam' 'spam'
 'not spam' 'not spam' 'spam' 'not spam' 'spam' 'not spam' 'not spam'
 'not spam' 'not spam' 'not spam' 'spam' 'not spam' 'not spam' 'not spam'
 'not spam' 'not spam' 'not spam']


In [25]:
#Evaluate model on training dataset

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
pred = classifier.predict(X_train)

In [27]:
accuracy = accuracy_score(y_train, pred)
conf_matrix = confusion_matrix(y_train, pred)
clasf_report = classification_report(y_train,pred)

In [29]:
print(f"Accuracy: {accuracy}")
print()
print(f"Confusion Matrix:\n{conf_matrix}")
print()
print(f"Classification Report:\n{clasf_report}")

Accuracy: 0.9696969696969697

Confusion Matrix:
[[47  0]
 [ 2 17]]

Classification Report:
              precision    recall  f1-score   support

    not spam       0.96      1.00      0.98        47
        spam       1.00      0.89      0.94        19

    accuracy                           0.97        66
   macro avg       0.98      0.95      0.96        66
weighted avg       0.97      0.97      0.97        66



In [32]:
#Evaluate Model on Test Dataset

pred_test = classifier.predict(X_test)

In [33]:
accuracy_t = accuracy_score(y_test, pred_test)
conf_matrix_t = confusion_matrix(y_test, pred_test)
clasf_report_t = classification_report(y_test,pred_test)

In [35]:
print(f"Accuracy: {accuracy_t}")
print()
print(f"Confusion Matrix:\n{conf_matrix_t}")
print()
print(f"Classification Report:\n{clasf_report_t}")

Accuracy: 0.6470588235294118

Confusion Matrix:
[[6 4]
 [2 5]]

Classification Report:
              precision    recall  f1-score   support

    not spam       0.75      0.60      0.67        10
        spam       0.56      0.71      0.63         7

    accuracy                           0.65        17
   macro avg       0.65      0.66      0.65        17
weighted avg       0.67      0.65      0.65        17

