# The Spam Classification Problem

In this notebook, we will build an artificial neural network (ANN) on top of TF-IDF text features to create a spam filter that classifies SMS messages as spam or non-spam (ham).

To train the model, we will use a dataset of 5,572 SMS messages that have already been classified by humans.

Main goal: Build an ANN model that recognizes spam messages.

> Feature Variables
> - SMS message

> Target class: Message type
> - Spam or Ham


In [None]:
# Importing libraries
import os
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale


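# One-time setup: on a fresh environment, uncomment and run the downloads below
# to fetch the NLTK data used by the stop-word filter, tokenizer and lemmatizer.
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')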


In [None]:
## Input preprocessing
#
# Planned pipeline for turning raw SMS text into model input:
# 1. Cleansing
# 2. Stop word removal
# 3. Lemmatization
# 4. Numeric representation ->  TF-IDF vs word embeddings
#    (the cells below use TfidfVectorizer, i.e. the TF-IDF option)

# Shared WordNet lemmatizer instance; custom_tokenize() further down calls
# lemmatizer.lemmatize() on every token that survives stop-word removal.
lemmatizer = WordNetLemmatizer()

In [None]:
# Read the labelled SMS corpus from the CSV file (path is relative to the
# notebook's working directory)
spam_data = pd.read_csv("Spam-Classification.csv")

# Pull out the two columns used downstream:
#   "CLASS" -> target label of each message (raw, not yet encoded)
#   "SMS"   -> message text, the single input feature
spam_classes_raw, spam_messages = spam_data["CLASS"], spam_data["SMS"]

In [None]:
#Custom tokenizer to remove stopwords and use lemmatization
@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def custom_tokenize(string):
    """Tokenize a message, drop English stop words and lemmatize the rest.

    Parameters
    ----------
    string : str
        Text to tokenize.

    Returns
    -------
    list of str
        Lemmatized tokens, in their original order, excluding any token that
        appears in NLTK's English stop-word list.
    """
    #Split string as tokens
    tokens = nltk.word_tokenize(string)
    # The stop-word list used to be re-read for every single token and scanned
    # linearly; fetch it once (cached across calls) and test membership on a set.
    stop_words = _english_stopwords()
    #Filter for stopwords, then perform lemmatization on the survivors
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

In [None]:
from sklearn import preprocessing
from tensorflow.keras import utils as k_utils

# --- Features: SMS text -> TF-IDF weights ---
# The vectorizer delegates tokenization to custom_tokenize.
vectorizer = TfidfVectorizer(tokenizer=custom_tokenize)

# Fit the vectorizer on all messages and vectorize them in one step
tfidf = vectorizer.fit_transform(spam_messages)
# NumPy array form of the TF-IDF matrix (used for the train/test split)
tfidf_array = tfidf.toarray()

# --- Target: class strings -> integer ids -> one-hot rows ---
label_encoder = preprocessing.LabelEncoder()
spam_classes = k_utils.to_categorical(
    label_encoder.fit_transform(spam_classes_raw),
    2,
)

print("TF-IDF Matrix Shape : ", tfidf.shape)
print("One-hot Encoding Shape : ", spam_classes.shape)

In [None]:
# Split data into training (90%) and test (10%) sets.
# A fixed random_state makes the shuffle, and therefore the split, identical
# on every Restart & Run All, so reported metrics are comparable between runs.
SPLIT_SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    tfidf_array,
    spam_classes,
    test_size=0.10,
    random_state=SPLIT_SEED,
)


In [None]:
# Building the ANN Model
#
# Architecture: two fully connected ReLU hidden layers followed by a softmax
# output with one unit per class.
# NOTE: the original cell referenced `tf` and a bare `keras` name that were
# never imported; everything now goes through `tf.keras`, with `tf` provided by
# `import tensorflow as tf` in the notebook's import cell.

NB_CLASSES=2    # number of output units (one per target class)
N_HIDDEN=32     # units in each hidden layer

model = tf.keras.models.Sequential()

# The first layer declares the input width: one input per column of X_train
model.add(tf.keras.layers.Dense(N_HIDDEN,
                                input_shape=(X_train.shape[1],),
                                name='Hidden-Layer-1',
                                activation='relu'))

model.add(tf.keras.layers.Dense(N_HIDDEN,
                                name='Hidden-Layer-2',
                                activation='relu'))

model.add(tf.keras.layers.Dense(NB_CLASSES,
                                name='Output-Layer',
                                activation='softmax'))

# 'rmsprop' is the optimizer Keras falls back to when none is given; it is
# spelled out here so the training setup is visible in the notebook.
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

In [None]:
# Training the Model
import matplotlib.pyplot as plt

# Hyper-parameters for training
VERBOSE = 1              # print progress for every epoch
BATCH_SIZE = 256
EPOCHS = 10
VALIDATION_SPLIT = 0.2   # fraction of the training data held out for validation

print("\nTraining Progress:\n------------------------------------")

history = model.fit(
    X_train,
    Y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=VERBOSE,
    validation_split=VALIDATION_SPLIT,
)

print("\nAccuracy during Training :\n------------------------------------")

# Training accuracy recorded per epoch, drawn as a line chart
training_accuracy = pd.DataFrame(history.history)["accuracy"]
training_accuracy.plot(figsize=(8, 5))
plt.title("Accuracy improvements with Epoch")
plt.show()

print("\nEvaluation against Test Dataset :\n------------------------------------")
# Left as the cell's final expression so the evaluation result is displayed
model.evaluate(X_test,Y_test)

In [None]:
# Model Evaluation (predict for text)

# Sample messages, scored together as a single batch
sample_messages = ["FREE entry to a fun contest",
                   "Yup I will come over"]

# Convert the input into TF-IDF vectors using the same, already fitted, vectorizer
predict_tfidf = vectorizer.transform(sample_messages).toarray()

print(predict_tfidf.shape)

# Model output per message; argmax over axis 1 picks one class index per message
class_scores = model.predict(predict_tfidf)
prediction = np.argmax(class_scores, axis=1)
print("Prediction Output:" , prediction)

# Map the class indices back to the original label strings
print("Prediction Classes are ", label_encoder.inverse_transform(prediction))