# IMPORTING THE LIBRARIES

In [1]:
import pandas as pd 
import numpy as np
import tensorflow as tf
from tensorflow import keras 

# READING THE CSV FILE

In [2]:
df = pd.read_csv("Spam Email raw text for NLP.csv")
df.head()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6


**Dropping the unwanted columns**

In [3]:
df.drop(['FILE_NAME'],axis='columns',inplace=True)

In [5]:
df.head(10)

Unnamed: 0,CATEGORY,MESSAGE
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ..."
1,1,ATTENTION: This is a MUST for ALL Computer Use...
2,1,This is a multi-part message in MIME format.\n...
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...
4,1,This is the bottom line. If you can GIVE AWAY...
5,1,------=_NextPart_000_00B8_51E06B6A.C8586B31\n\...
6,1,"<STYLE type=""text/css"">\n\n<!--\n\nP{\n\n fon..."
7,1,<HR>\n\n<html>\n\n<head>\n\n <title>Secured I...
8,1,"<table width=""600"" border=""20"" align=""center"" ..."
9,1,"<html>\n\n\n\n<head>\n\n<meta http-equiv=""Cont..."


**Checking for imbalanced dataset**

In [6]:
df.CATEGORY.value_counts()

0    3900
1    1896
Name: CATEGORY, dtype: int64

In [7]:
df.CATEGORY.isnull().sum()

0

# DATA CLEANING

In [8]:
import re
def clean_msg(message):
    message = re.sub(r'http\S+|www.\S+' , '' ,message)  #for removing URLs
    
    message = re.sub(r'S+\@\S+\.\S+' , '' ,message)     #for removing mail-Id
    
    message = re.sub(r'<[^>]*>' , '' , message)         #for removing HTML tags
    
    message = re.sub(r'[^A-Za-z0-9 ]+', ' ', message)   #for removing special characters
    
    message = re.sub(r' +', ' ' , message)              #for removing extra spaces        
    
    message = message.lower()
    
    return message

In [9]:
df.MESSAGE = df.MESSAGE.apply(clean_msg)

In [10]:
df.head(10)

Unnamed: 0,CATEGORY,MESSAGE
0,1,dear homeowner interest rates are at their low...
1,1,attention this is a must for all computer user...
2,1,this is a multi part message in mime format ne...
3,1,important information the new domain names are...
4,1,this is the bottom line if you can give away c...
5,1,nextpart 000 00b8 51e06b6a c8586b31 content t...
6,1,hello this is chinese traditional 21 o nbsp n...
7,1,secured investements wealth without risk disc...
8,1,click here now simply amateur just like the g...
9,1,answer us nbsp unlist information this messag...


In [11]:
import spacy

ModuleNotFoundError: No module named 'spacy'

In [9]:
nlp = spacy.load('en_core_web_lg')

In [10]:
def remove_stop(text):
    doc = nlp(text)
    filtered_doc = []
    for tokens in doc:
        if tokens.is_stop or tokens.is_oov:
            continue
        filtered_doc.append(tokens.lemma_)
        
    return ' '.join(filtered_doc)

            

In [11]:
df.MESSAGE = df.MESSAGE.apply(remove_stop)

## DATA PREPROCESSING

In [12]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()

In [13]:
msg = df.MESSAGE.values.tolist()

In [14]:
tokenizer.fit_on_texts(msg)

In [15]:
len(tokenizer.word_index)

24538

In [16]:
input_text = tokenizer.texts_to_sequences(msg)

In [17]:
max_length = max(len(x) for x in input_text)
max_length

7719

In [18]:
padded_input = pad_sequences(input_text,maxlen=max_length,padding='post')

In [19]:
padded_input.shape

(5796, 7719)

In [20]:
y = df.CATEGORY
y.shape

(5796,)

In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(padded_input,y,train_size = 0.7,random_state=40)

# MODEL TRAINING

In [22]:
model = keras.Sequential([
    keras.layers.Embedding(len(tokenizer.word_index)+1,output_dim=50,input_length=max_length),
    keras.layers.Flatten(),
    keras.layers.Dense(1,activation='sigmoid')
])

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 7719, 50)          1226950   
                                                                 
 flatten (Flatten)           (None, 385950)            0         
                                                                 
 dense (Dense)               (None, 1)                 385951    
                                                                 
Total params: 1612901 (6.15 MB)
Trainable params: 1612901 (6.15 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [24]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [25]:
model.fit(X_train,y_train , epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x78cc5d06c6d0>

In [26]:
model.evaluate(X_test,y_test)



[0.032118137925863266, 0.9902242422103882]

# PREDICTION

In [27]:
def pred(text):
    input_text = tokenizer.texts_to_sequences([text])
    padded_input = pad_sequences(input_text,maxlen=max_length,padding='post')
    x = int(np.round(model.predict(padded_input)[0]))
    if x==0:
        print("Not Spam")
    else:
        print("Spam")

In [28]:
text = "free offer only for you" 
pred(text)

Spam


In [29]:
text = "we are going to watch the cricket match,are you coming or not , please tell us fast we are getting late" 
pred(text)

Not Spam
