## IMPORTING LIBRARIES:

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report,precision_score
import re
import string


from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud


## LOADING DATA SETS:

In [16]:
df = pd.read_csv(r"spam_ham_dataset.xls")
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [17]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [19]:
df.size

20684

In [20]:
df.shape

(5171, 4)

In [21]:
df.isnull().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [22]:
df.drop(["Unnamed: 0","label"],axis=1,inplace=True)

In [23]:
df.shape

(5171, 2)

In [24]:
df.head()

Unnamed: 0,text,label_num
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0


## PREPROCESSING TEXT DATA:

In [25]:
def wordopt(text): 
    #creating function to process text
    text=text.lower()
    text=re.sub('\[.*?\]','',text)
    text=re.sub("\\W"," ",text)
    text=re.sub('https?://\S+|www\.\S+','',text)
    text=re.sub('<.*?>+','',text)
    text=re.sub('[%s]'% re.escape(string.punctuation),'',text)
    text=re.sub('\n','',text)
    text=re.sub('\w*\d\w*','',text)
    return text

In [26]:
df['text']=df['text'].apply(wordopt)

In [27]:
X =df['text']
Y =df['label_num']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=18)

In [29]:
from tensorflow.keras.preprocessing.text import Tokenizer
max_vocab = 10000
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(X_train)

In [30]:
# tokenize the text into vectors 
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [31]:
import tensorflow as tf

In [32]:
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, padding='post', maxlen=256)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, padding='post', maxlen=256)

In [33]:
max_len=256
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=max_vocab, output_dim=32, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1)
])
model.build(input_shape=(None, max_len))
model.summary()



In [34]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [35]:
history = model.fit(X_train, y_train, epochs=8, batch_size=32, validation_split=0.2)

Epoch 1/8
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 468ms/step - accuracy: 0.8120 - loss: 0.4496 - val_accuracy: 0.9432 - val_loss: 0.1226
Epoch 2/8
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 432ms/step - accuracy: 0.9625 - loss: 0.1197 - val_accuracy: 0.9469 - val_loss: 0.2311
Epoch 3/8
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 443ms/step - accuracy: 0.9873 - loss: 0.0580 - val_accuracy: 0.9589 - val_loss: 0.2445
Epoch 4/8
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 446ms/step - accuracy: 0.9912 - loss: 0.0294 - val_accuracy: 0.9589 - val_loss: 0.1859
Epoch 5/8
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 423ms/step - accuracy: 0.9970 - loss: 0.0139 - val_accuracy: 0.9626 - val_loss: 0.2634
Epoch 6/8
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 438ms/step - accuracy: 0.9994 - loss: 0.0073 - val_accuracy: 0.9626 - val_loss: 0.3404
Epoch 7/8
[1m10

In [36]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)

# Print test accuracy
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 130ms/step - accuracy: 0.9643 - loss: 0.3864
Test Accuracy: 96.43%
