In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

### Loading Data

In [51]:
data = pd.read_csv('emails.csv')

In [52]:
data.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [53]:
data.shape            #5172 rows and 3002 columns

(5172, 3002)

### Checking Null values

In [54]:
print(data.isnull().sum())
data.fillna(0 , inplace=True)

Email No.     0
the           0
to            0
ect           0
and           0
             ..
military      0
allowing      0
ff            0
dry           0
Prediction    0
Length: 3002, dtype: int64


In [55]:
data.dtypes

Email No.     object
the            int64
to             int64
ect            int64
and            int64
               ...  
military       int64
allowing       int64
ff             int64
dry            int64
Prediction     int64
Length: 3002, dtype: object

In [56]:
data.describe()

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
count,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,...,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0
mean,6.640565,6.188128,5.143852,3.075599,3.12471,2.62703,55.517401,2.466551,2.024362,10.600155,...,0.005027,0.012568,0.010634,0.098028,0.004254,0.006574,0.00406,0.914733,0.006961,0.290023
std,11.745009,9.534576,14.101142,6.04597,4.680522,6.229845,87.574172,4.314444,6.967878,19.281892,...,0.105788,0.199682,0.116693,0.569532,0.096252,0.138908,0.072145,2.780203,0.098086,0.453817
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,0.0,1.0,0.0,12.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,3.0,1.0,1.0,2.0,1.0,28.0,1.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8.0,7.0,4.0,3.0,4.0,2.0,62.25,3.0,1.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
max,210.0,132.0,344.0,89.0,47.0,77.0,1898.0,70.0,167.0,223.0,...,4.0,7.0,2.0,12.0,3.0,4.0,3.0,114.0,4.0,1.0


### Shuffling Data

In [57]:
from sklearn.utils import shuffle
data = shuffle(data)
data.reset_index(inplace=True, drop=True) 

### Droping unecessary Columns

In [58]:
data = data.drop(labels = ['Email No.'] , axis = 1)
print(data)

      the  to  ect  and  for  of    a  you  hou  in  ...  connevey  jay  \
0       8   3   13    4    2   4   45    1    7   3  ...         0    0   
1       5  10    2    8    2   0   63    1    0  14  ...         0    0   
2       9   8    2    2    6   2   65    3    0   9  ...         0    0   
3       9   8    4    4    2   1   73    2    1  16  ...         0    0   
4       7   3    1    5    1  12   58   12    0   5  ...         0    0   
...   ...  ..  ...  ...  ...  ..  ...  ...  ...  ..  ...       ...  ...   
5167    4   1    1    1    0   1   17    1    0   1  ...         0    0   
5168    9   6   96    8    3   2  122    0   49  20  ...         0    0   
5169    3   2    1    8    3   2   34    1    0  13  ...         0    0   
5170    9   8    3    6    5   3   49    5    1  12  ...         0    0   
5171   11   6   17    1    1   3   66    2   10  21  ...         0    0   

      valued  lay  infrastructure  military  allowing  ff  dry  Prediction  
0          0    0     

### Changed all columns to lowercase

In [59]:
data.columns = data.columns.str.lower()

### Changed datatypes to float

In [60]:
data = data.astype('float32')

In [61]:
data.head()

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,prediction
0,8.0,3.0,13.0,4.0,2.0,4.0,45.0,1.0,7.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5.0,10.0,2.0,8.0,2.0,0.0,63.0,1.0,0.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9.0,8.0,2.0,2.0,6.0,2.0,65.0,3.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9.0,8.0,4.0,4.0,2.0,1.0,73.0,2.0,1.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7.0,3.0,1.0,5.0,1.0,12.0,58.0,12.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0


In [62]:
x = data.drop(labels = 'prediction' , axis = 1)
y = data[['prediction']]

### Spliting data into training, validation and test datasets

In [63]:
train_data , test_data , train_labels , test_labels = train_test_split(x, y, train_size = 0.7, test_size = 0.3, random_state = 42)

In [64]:
sc = StandardScaler()
train_data = sc.fit_transform(train_data)         #Standarized the input data

In [65]:
test_data = sc.fit_transform(test_data)

In [66]:
rem_train_data, val_data, rem_train_labels, val_labels = train_test_split(train_data, train_labels, 
                                                                          test_size = 0.2, random_state = 42)

In [67]:
print(rem_train_data.shape)
print(val_data.shape)
print(test_data.shape)

(2896, 3000)
(724, 3000)
(1552, 3000)


In [52]:
""""def getDuplicateColumns(df):
  
    duplicateColumnNames = set()

    for x in range(df.shape[1]):
      
        col = df.iloc[:, x]
                                                                         # CODE For finding duplicate [equal] columns
        for y in range(x + 1, df.shape[1]):
              
            otherCol = df.iloc[:, y]
              
            if col.equals(otherCol):
                duplicateColumnNames.add(df.columns.values[y])

    return list(duplicateColumnNames)"""


In [56]:
""""dup_tain = getDuplicateColumns(rem_train_data)
dup_val = getDuplicateColumns(val_data)
dup_test = getDuplicateColumns(test_data)"""

In [50]:
#rem_train_data.drop(labels = dup_tain , axis = 1 , inplace = True)

In [63]:
#val_data.drop(labels = dup_val , axis = 1 , inplace = True)

In [64]:
#test_data.drop(labels = dup_test , axis = 1 , inplace = True)

In [68]:
print(rem_train_data.shape)
print(val_data.shape)
print(test_data.shape)
print(rem_train_labels.shape)
print(val_labels.shape)
print(test_labels.shape)

(2896, 3000)
(724, 3000)
(1552, 3000)
(2896, 1)
(724, 1)
(1552, 1)


### Building Model

In [69]:
model = models.Sequential()
model.add(layers.Dense(10, activation='relu', input_shape = (rem_train_data.shape[1],)))
model.add(layers.Dense(8, activation='relu'))
model.add(layers.Dense(6, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

In [70]:
model.compile(optimizer = 'rmsprop',
             loss = 'binary_crossentropy', 
             metrics = ['accuracy'])

In [71]:
Result = model.fit(rem_train_data, rem_train_labels, epochs = 100, batch_size = 128, validation_data = (val_data , val_labels))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


### Evaluating Model

In [72]:
Evaluation = model.evaluate(test_data, test_labels, batch_size = 32)
print(Evaluation) 

[0.20103719830513, 0.9587628841400146]


In [73]:
Prediction = model.predict(test_data)

In [74]:
Prediction.shape

(1552, 1)

In [75]:
a = np.where(Prediction < 0.5 , 0 , 1)
print("Predicted Shape : ",a.shape, " Original Shape : ", test_labels.shape)
print("----------------------------------------------------------------")
print("Predicted Labels\n", a , "Original Labels\n" , test_labels)

Predicted Shape :  (1552, 1)  Original Shape :  (1552, 1)
----------------------------------------------------------------
Predicted Labels
 [[1]
 [0]
 [1]
 ...
 [1]
 [1]
 [0]] Original Labels
       prediction
1566         1.0
1988         0.0
1235         1.0
3276         0.0
3438         0.0
...          ...
3558         0.0
2298         0.0
1519         1.0
1740         1.0
1700         0.0

[1552 rows x 1 columns]


In [77]:
b = np.where(a == 1, 'Spam', 'Not-Spam')
print(b)
c = np.where (a == test_labels , 'Matched' , 'Not Matched')
print("-------------------------------------")
print(c)
print("Prediction match with original labels for :                   ",  len(c[c == 'Matched']))
print("Prediction does not match with original labels for :          ", len(c[c == 'Not Matched']))

[['Spam']
 ['Not-Spam']
 ['Spam']
 ...
 ['Spam']
 ['Spam']
 ['Not-Spam']]
-------------------------------------
[['Matched']
 ['Matched']
 ['Matched']
 ...
 ['Matched']
 ['Matched']
 ['Matched']]
Prediction match with original labels for :                    1488
Prediction does not match with original labels for :           64


In [80]:
Prediction_percentage = (len(c[c == 'Matched'])/test_labels.shape[0])*100
print("Prediction Percentage : ", Prediction_percentage)

Prediction Percentage :  95.87628865979381


## Result

##### Our model predicted 1488 emails correctly from 1552 emails whether they were spam emails or not-spam. It showed accuracy of almost 96%