In [6]:
# Import libraries
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# The private `sklearn.feature_extraction.stop_words` module is gone from current
# scikit-learn releases; `ENGLISH_STOP_WORDS` is exposed from `.text`.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

In [8]:
# Load the e-mail corpus straight from GitHub into a DataFrame.
emails = pd.read_csv(
    "https://raw.githubusercontent.com/musakanneh/spam_classifier/main/spam_or_not_spam.csv",
    encoding='latin',
)

In [9]:
# Preview the first rows to confirm the `email` and `label` columns loaded as expected.
emails.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [10]:
# Coerce every entry of the "email" column to a string so the text helpers
# below can be applied to it.
# NOTE(review): any missing value would presumably become the literal text "nan" — confirm.
emails['email'] = emails['email'].astype(str)

    # Data Split

In [11]:
from sklearn.model_selection import train_test_split

# Features are the e-mail bodies; the target is the label column.
X, y = emails['email'], emails['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

    # Data Preprocessing

In [12]:
# Helper functions that strip hyperlinks, digits, punctuation, surrounding whitespace and newlines, and lower-case the text.

def remove_hyperlink(word):
    """Return ``word`` with every ``http...`` run (up to the next whitespace) deleted."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", word)

def to_lower(word):
    """Return a lower-cased copy of ``word``."""
    return word.lower()

def remove_number(word):
    """Return ``word`` with every run of digits removed."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', word)

def remove_punctuation(word):
    """Return ``word`` with every character listed in ``string.punctuation`` deleted."""
    # Three-argument maketrans: the third argument lists characters to drop.
    drop_table = str.maketrans('', '', string.punctuation)
    return word.translate(drop_table)

def remove_whitespace(word):
    """Return ``word`` without leading or trailing whitespace; interior spacing is kept."""
    return word.strip()

def replace_newline(word):
    """Return ``word`` with each line break replaced by a single space.

    Deleting the line break outright would glue the last word of one line to
    the first word of the next ("hello\\nworld" -> "helloworld"), producing
    tokens that never occurred in the text. Substituting a space keeps the
    words separate. Windows-style ``\\r\\n`` breaks are handled as well.

    Args:
        word: Text that may contain line breaks.

    Returns:
        The text as a single line.
    """
    return re.sub(r'\r?\n', ' ', word)

def clean_up_pipeline(sentence):
    """Normalise one e-mail body by running it through the cleaning helpers in order.

    Steps applied: ``to_lower``, ``remove_hyperlink``, ``replace_newline``,
    ``remove_number``, ``remove_punctuation``, ``remove_whitespace``.

    Args:
        sentence: Raw text of a single e-mail.

    Returns:
        The cleaned text.
    """
    # Order matters:
    #  * lower-case first, because the hyperlink pattern only matches lower-case "http";
    #  * strip hyperlinks before newlines are touched, since a URL ends at the
    #    first whitespace character and a line break is one;
    #  * strip hyperlinks before punctuation, so "://" and "." are still part
    #    of the URL when it is matched.
    cleaning_steps = (
        to_lower,
        remove_hyperlink,
        replace_newline,
        remove_number,
        remove_punctuation,
        remove_whitespace,
    )
    for step in cleaning_steps:
        sentence = step(sentence)
    return sentence

In [13]:
# Clean every training and test e-mail.
# Note the lower-case names: `x_train`/`x_test` hold the cleaned text, while
# `X_train`/`X_test` remain the untouched originals.
x_train = list(map(clean_up_pipeline, X_train))
x_test = list(map(clean_up_pipeline, X_test))

        Tokenizing the emails

In [14]:
# Configuration values
embed_size = 100       # size of each word vector
max_feature = 50_000   # vocabulary cap: number of unique words the tokenizer keeps
max_len = 2_000        # sequence length: tokens kept per e-mail when padding/truncating

In [15]:
from keras.preprocessing.text import Tokenizer

# Build the vocabulary from the *cleaned* training text only, so nothing from
# the test split leaks into the word index. Previously the raw `X_train` was
# used here and the cleaned `x_train`/`x_test` were never consumed.
tokenizer = Tokenizer(num_words=max_feature)
tokenizer.fit_on_texts(x_train)

# Encode each e-mail as a list of integer word ids. The results stay as plain
# lists of lists: e-mails differ in length, and wrapping ragged sequences in
# `np.array` raises ValueError on NumPy >= 1.24. `pad_sequences` accepts
# lists directly.
X_train_features = tokenizer.texts_to_sequences(x_train)
X_test_features = tokenizer.texts_to_sequences(x_test)

        Padding

In [16]:
from keras.preprocessing.sequence import pad_sequences

# Bring every encoded e-mail to exactly `max_len` tokens, train and test alike.
X_train_features, X_test_features = [
    pad_sequences(encoded, maxlen=max_len)
    for encoded in (X_train_features, X_test_features)
]

In [17]:
# Inspect the first padded training sequence as a sanity check.
X_train_features[0]

array([    0,     0,     0, ...,   347, 11438,   230], dtype=int32)

    # Model Training

In [18]:
# Necessary Imports for Keras
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional
from keras.models import Model

# create the model
import tensorflow as tf

# Width of the learned word embedding.
# NOTE(review): the config value `embed_size` (100) is not used here — confirm which one is intended.
embedding_vecor_length = 32

# Architecture: embedding -> bidirectional LSTM (64 units per direction)
# -> small dense layer with light dropout -> single sigmoid output.
model = tf.keras.Sequential()
model.add(Embedding(max_feature, embedding_vecor_length, input_length=max_len))
model.add(Bidirectional(tf.keras.layers.LSTM(64)))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# `summary()` prints the table itself and returns None, so wrapping it in
# `print()` only appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2000, 32)          1600000   
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               49664     
_________________________________________________________________
dense (Dense)                (None, 16)                2064      
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 1,651,745
Trainable params: 1,651,745
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Train the network and keep the per-epoch metrics in `history` for plotting.
# NOTE(review): the held-out test split is also passed as validation data, so
# the validation curve and the later test metrics come from the same rows.
history = model.fit(
    X_train_features,
    y_train,
    batch_size=512,
    epochs=20,
    validation_data=(X_test_features, y_test),
)

Epoch 1/20
1/5 [=====>........................] - ETA: 0s - loss: 0.6888 - accuracy: 0.8340

      Visualizing Model Accuracy

In [None]:
# Plot training vs. validation accuracy per epoch, as recorded by `model.fit`.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'])
ax.plot(history.history['val_accuracy'])
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train', 'test'], loc='upper left')
ax.grid()
plt.show()

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 class labels,
# flattened to a plain Python list with one entry per test e-mail.
preds = (model.predict(X_test_features) > 0.5).astype(int).ravel().tolist()

In [None]:
# Confusion matrix of true labels (rows) vs. predicted labels (columns).
confn_matrix = confusion_matrix(y_test, preds)

# Reuse the matrix computed above instead of calling confusion_matrix() again.
# For a two-class problem, ravel() yields the cells in the order tn, fp, fn, tp.
tn, fp, fn, tp = confn_matrix.ravel()

# Report the headline metrics as percentages.
print("Precision: {:.2f}%".format(100 * precision_score(y_test, preds)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, preds)))
print("F1 Score: {:.2f}%".format(100 * f1_score(y_test, preds)))

In [None]:
# Display the raw F1 score (a 0-1 fraction, not scaled to a percentage).
f1_score(y_test, preds)

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

# Draw the confusion matrix as an annotated heatmap.
ax = plt.subplot()
sns.heatmap(confn_matrix, annot=True, ax=ax, cmap='Blues', fmt='')  # annot=True writes the value in each cell

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
class_names = ['Not Spam', 'Spam']
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names);  # trailing ';' suppresses the repr of the returned tick labels