## Imports

In [164]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import confusion_matrix,classification_report

import regex as re

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import pad_sequences
from tensorflow.keras.layers import Embedding, Input, LSTM, Dense
from keras.models import Model
from tensorflow.keras.models import Sequential

## Load Data

In [165]:
# Load the data
df = pd.read_csv('input/emails.csv')
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [166]:
# Dropping the duplicate columns
df.drop_duplicates(inplace = True)

In [167]:
# Checking counts of spams and non-spams
df['spam'].value_counts()
df.text.head()

0    Subject: naturally irresistible your corporate...
1    Subject: the stock trading gunslinger  fanny i...
2    Subject: unbelievable new homes made easy  im ...
3    Subject: 4 color printing special  request add...
4    Subject: do not have money , get software cds ...
Name: text, dtype: object

## Preprocessing data (stopwords, punctuation & lemmatization)

In [168]:
def text_process(text):
  STOPWORDS = set(stopwords.words('english'))
  STOPWORDS.add('subject')
  lemmatizer = WordNetLemmatizer()
  
  text = text.translate(str.maketrans('', '', string.punctuation))
  text = [word for word in text.split() if word.lower() not in STOPWORDS]
  return " ".join(text)

In [169]:
def tokenize(text):
    split=re.split("\W+",text)
    return split

In [170]:
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [171]:
#Show the processed data
df.text = df.text.apply(text_process)
df.text = df.text.apply(tokenize)
df.text = df.text.apply(lemmatize_words)
df.text.head()

0    naturally irresistible corporate identity lt r...
1    stock trading gunslinger fanny merrill muzo co...
2    unbelievable new home made easy im wanting sho...
3    4 color printing special request additional in...
4    money get software cd software compatibility g...
Name: text, dtype: object

## Vectorization of the text data

In [172]:
vocab_size = 10000
max_len = 250

tok = Tokenizer(num_words=vocab_size)
tok.fit_on_texts(df.text)
sequences = tok.texts_to_sequences(df.text)
sequences_matrix = pad_sequences(sequences, maxlen=max_len)

In [173]:
sequences_matrix[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0, 5188,  439, 1523, 3516,  379,
        808, 3947,   27,   29,  328, 5622,   26, 82

## TTS

In [174]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(sequences_matrix, df.spam, test_size = 0.2, random_state = 1)

## Creating model

In [175]:
model = Sequential()
model.add(Embedding(vocab_size, 200, input_length=max_len))
model.add(LSTM(64))
model.add(Dense(1,activation='sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 250, 200)          2000000   
                                                                 
 lstm_7 (LSTM)               (None, 64)                67840     
                                                                 
 dense_7 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2,067,905
Trainable params: 2,067,905
Non-trainable params: 0
_________________________________________________________________


## Fit

In [176]:
model.fit(X_train,y_train,validation_data=(X_test,y_test), epochs=5, batch_size=128, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x25b0ad77550>

## Show scores

In [177]:
scores = model.evaluate(X_test, y_test, verbose=0)
predict_x=model.predict(X_test) 
y_pred=np.argmax(predict_x,axis=1)

print('Test loss:', scores[0])
print('Test accuracy:', scores[1])
print('confusion matrix:\n', confusion_matrix(y_pred,y_test))
print(classification_report(y_test, y_pred))

Test loss: 0.05070885643362999
Test accuracy: 0.9850746393203735
confusion matrix:
 [[866 273]
 [  0   0]]
              precision    recall  f1-score   support

           0       0.76      1.00      0.86       866
           1       0.00      0.00      0.00       273

    accuracy                           0.76      1139
   macro avg       0.38      0.50      0.43      1139
weighted avg       0.58      0.76      0.66      1139

