# Spam mail detection using tensorflow

# Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Reading the Data
Read in the spam_ham_dataset.csv file and set it to a data frame called df and checking the head of df.

In [2]:
df = pd.read_csv("spam_ham_dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


**Using info and describe on df

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [4]:
df.describe()

Unnamed: 0.1,Unnamed: 0,label_num
count,5171.0,5171.0
mean,2585.0,0.289886
std,1492.883452,0.453753
min,0.0,0.0
25%,1292.5,0.0
50%,2585.0,0.0
75%,3877.5,1.0
max,5170.0,1.0


# Checking whether missing values are present or not

In [5]:
missing_values=df.isnull().sum()
missing_values

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

# Removing Unwanted Columns in dataset.

In [6]:
df = df.drop('Unnamed: 0', axis=1)
df = df.drop('label', axis=1)
df

Unnamed: 0,text,label_num
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...
5166,Subject: put the 10 on the ft\r\nthe transport...,0
5167,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,Subject: industrial worksheets for august 2000...,0


# Splitting the data into training and test sets

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
x = df['text']
y = df['label_num']
X_train, x_test, Y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Training the model by tensorflow

In [9]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [10]:
max_len = 100
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(sequences, padding='post', maxlen=max_len, truncating='post')
x_test_pad = pad_sequences(sequences, padding='post', maxlen=max_len, truncating='post')

In [11]:
vol = len(word_index) + 1
embedding_dim = 100

In [12]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vol, output_dim=embedding_dim),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [13]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Training the model

In [14]:
history = model.fit(X_train_pad, Y_train, epochs=5, validation_split=0.2)

Epoch 1/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 47ms/step - accuracy: 0.7291 - loss: 0.5039 - val_accuracy: 0.9686 - val_loss: 0.0866
Epoch 2/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 40ms/step - accuracy: 0.9853 - loss: 0.0416 - val_accuracy: 0.9734 - val_loss: 0.0606
Epoch 3/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 40ms/step - accuracy: 0.9987 - loss: 0.0081 - val_accuracy: 0.9746 - val_loss: 0.0630
Epoch 4/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 45ms/step - accuracy: 0.9993 - loss: 0.0036 - val_accuracy: 0.9698 - val_loss: 0.0708
Epoch 5/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 42ms/step - accuracy: 1.0000 - loss: 0.0012 - val_accuracy: 0.9734 - val_loss: 0.0595


In [15]:
predictions = model.predict(x_test_pad)
binary_predictions = (predictions > 0.5).astype(int)

[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [16]:
print(binary_predictions)

[[0]
 [1]
 [0]
 ...
 [0]
 [0]
 [0]]
