In [1]:
!pip install nltk scikit-learn regex numpy pandas



# Import Libraries

In [2]:
import os
import pandas as pd
import numpy as np
import nltk
import re

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Input 

In [3]:
df = pd.read_csv('./Spam_Email_raw_text_for_NLP.csv')

In [4]:
df.head()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6


In [5]:
df.drop('FILE_NAME', axis = 1, inplace=True)

In [6]:
df.head()

Unnamed: 0,CATEGORY,MESSAGE
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ..."
1,1,ATTENTION: This is a MUST for ALL Computer Use...
2,1,This is a multi-part message in MIME format.\n...
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...
4,1,This is the bottom line. If you can GIVE AWAY...


In [7]:
df.CATEGORY.value_counts()

0    3900
1    1896
Name: CATEGORY, dtype: int64

# Pre-processing Data With NLTK(NLP)

In [8]:
stopwords = stopwords.words("english")
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [9]:
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [10]:
nltk.download("wornet")
lemmatizer = WordNetLemmatizer()

[nltk_data] Error loading wornet: Package 'wornet' not found in index


In [11]:
corpus = []

In [12]:
for i in range(len(df)):
    # removing all non-alphanumeric characters
    message = re.sub('[^a-zA-Z0-9]'," ",df['MESSAGE'][i])
    
    # converting the message to lowercase
    message = message.lower()
    
    # splitting the sentence inot words for lemmatization
    message = message.split()
    
    # removing stopwords and lemmatizing
    message = [lemmatizer.lemmatize(word) for word in message if word not in stopwords]
    
    # converting the words back into sentences
    message = ' '.join(message)
    
    # adding the preprocessing message to the corpus list
    corpus.append(message)

# Feature Engineering Using the Bag-of-Words Model vs TF-IDF Technique

# Bag-Of-Words Model

In [39]:
'''
cv = CountVectorizer(max_features=2500, ngram_range=(1,3))
X = cv.fit_transform(corpus).toarray()
y = df["CATEGORY"]
'''

'\ncv = CountVectorizer(max_features=2500, ngram_range=(1,3))\nX = cv.fit_transform(corpus).toarray()\ny = df["CATEGORY"]\n'

In [40]:
'''
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1, stratify=y)
'''

'\nx_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1, stratify=y)\n'

# TF-IDF Technique

In [41]:
tf = TfidfVectorizer(ngram_range=(1,3), max_features=2500)
X = tf.fit_transform(corpus).toarray()

x_train, x_test, y_train, y_test = train_test_split(
  X, y, test_size=0.33, random_state=1)

# Creating and Training Your Model

In [42]:
model = MultinomialNB()

In [43]:
model.fit(x_train, y_train)

MultinomialNB()

In [44]:
train_pred = model.predict(x_train)
test_pred = model.predict(x_test)

# Model Evaluation

In [45]:
accuracy_model = accuracy_score(train_pred,y_train)

In [46]:
print(accuracy_model)

0.9636878702034509


In [47]:
accuracy_model_test = accuracy_score(test_pred,y_test)

In [48]:
print(accuracy_model_test)

0.9508625196027183


In [49]:
print(classification_report(train_pred, y_train))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      2714
           1       0.90      0.98      0.94      1169

    accuracy                           0.96      3883
   macro avg       0.95      0.97      0.96      3883
weighted avg       0.97      0.96      0.96      3883



In [50]:
print(classification_report(test_pred, y_test))

              precision    recall  f1-score   support

           0       0.99      0.94      0.96      1367
           1       0.86      0.98      0.92       546

    accuracy                           0.95      1913
   macro avg       0.93      0.96      0.94      1913
weighted avg       0.96      0.95      0.95      1913



# Results of Classifying Your Own Messages

In [54]:
def result(message):
    print('Predicting...')
    message_vector = tf.transform(message)
    category = model.predict(message_vector)
    print("The message is", "spam" if category == 1 else "not spam")

In [55]:
message1 = ["You won 10000 dollars, please provide your account details,So that we can transfer the money"]
result(message1)

Predicting...
The message is spam


In [56]:
message2 = ["hey racheal, the meeting is postponed to monday"]
result(message2)

Predicting...
The message is not spam
