# IMPORT LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report,precision_score
import re
import string
import tensorflow as tf


from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud

# LOADING DATASETS

In [2]:
data=pd.read_csv("spam_ham_dataset.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [4]:
data.tail()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0
5170,4807,spam,Subject: important online banking alert\r\ndea...,1


In [5]:
data.columns

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

In [6]:
data.shape

(5171, 4)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [8]:
data.isna().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [9]:
data=data.drop(['Unnamed: 0'],axis=1)
data

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...
5166,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,ham,Subject: industrial worksheets for august 2000...,0


In [10]:
data=data.sample(frac=1)

In [11]:
data

Unnamed: 0,label,text,label_num
4922,spam,Subject: hp psc 1315 all - in - one @ $ 69 . 0...,1
1745,spam,Subject: re : start dating\r\ndaily update :\r...,1
3420,ham,Subject: upcoming sitara risk assignment relea...,0
349,ham,Subject: organizational announcement\r\nplease...,0
4627,ham,Subject: training resources and cost to attend...,0
...,...,...,...
2727,ham,Subject: buyback deals - - january 2000\r\natt...,0
1955,spam,Subject: discover you made money while you wer...,1
19,ham,Subject: additional recruiting\r\ni ' m happy ...,0
3569,spam,Subject: strong buy alert : weekly member news...,1


In [12]:
data.to_csv("news_datas")

In [13]:
data.reset_index(inplace=True)
data.drop(['index'],axis=1,inplace=True)
data

Unnamed: 0,label,text,label_num
0,spam,Subject: hp psc 1315 all - in - one @ $ 69 . 0...,1
1,spam,Subject: re : start dating\r\ndaily update :\r...,1
2,ham,Subject: upcoming sitara risk assignment relea...,0
3,ham,Subject: organizational announcement\r\nplease...,0
4,ham,Subject: training resources and cost to attend...,0
...,...,...,...
5166,ham,Subject: buyback deals - - january 2000\r\natt...,0
5167,spam,Subject: discover you made money while you wer...,1
5168,ham,Subject: additional recruiting\r\ni ' m happy ...,0
5169,spam,Subject: strong buy alert : weekly member news...,1


In [14]:
data.shape

(5171, 3)

In [15]:
data.describe()

Unnamed: 0,label_num
count,5171.0
mean,0.289886
std,0.453753
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


# Preprocessing Text Data

In [16]:
def wordopt(text): 
    #creating function to process text
    text=text.lower()
    text=re.sub('\[.*?\]','',text)
    text=re.sub("\\W"," ",text)
    text=re.sub('https?://\S+|www\.\S+','',text)
    text=re.sub('<.*?>+','',text)
    text=re.sub('[%s]'% re.escape(string.punctuation),'',text)
    text=re.sub('\n','',text)
    text=re.sub('\w*\d\w*','',text)
    return text

In [17]:
data['text']=data['text'].apply(wordopt)

In [18]:
features = data['text']
targets = data['label_num']

In [19]:
x_train, x_test, y_train, y_test = train_test_split(features, targets, test_size=0.20, random_state=18)

In [20]:
type(x_train)

pandas.core.series.Series

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer
max_vocab = 100000
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(x_train)

In [22]:
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

In [23]:
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, padding='post', maxlen=256)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, padding='post', maxlen=256)

In [24]:
max_len=256
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=max_vocab, output_dim=32, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1)
])


model.build(input_shape=(None, max_len))
model.summary()



In [25]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [26]:
type(x_train)

numpy.ndarray

In [27]:
history = model.fit(x_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

Epoch 1/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 333ms/step - accuracy: 0.5187 - loss: 5.7877 - val_accuracy: 0.3273 - val_loss: 10.7245
Epoch 2/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 305ms/step - accuracy: 0.5124 - loss: 5.2035 - val_accuracy: 0.7597 - val_loss: 0.2735
Epoch 3/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 297ms/step - accuracy: 0.9383 - loss: 0.1887 - val_accuracy: 0.9710 - val_loss: 0.1487
Epoch 4/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 282ms/step - accuracy: 0.9897 - loss: 0.0984 - val_accuracy: 0.9771 - val_loss: 0.1502
Epoch 5/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 291ms/step - accuracy: 0.9937 - loss: 0.0876 - val_accuracy: 0.9746 - val_loss: 0.2745


In [29]:
test_loss, test_accuracy = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 73ms/step - accuracy: 0.9556 - loss: 0.5672 
Test Accuracy: 95.56%
