# Imports

In [2]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Loading the dataset

In [3]:
# Read the spam dataset from the CSV file in the working directory
spam_data = pd.read_csv("Spam-Classification.csv")

# Show a banner followed by the first rows so the CLASS / SMS columns can be checked
print("\nLoaded Data :\n------------------------------------", spam_data.head(), sep="\n")


Loaded Data :
------------------------------------
  CLASS                                                SMS
0   ham   said kiss, kiss, i can't do the sound effects...
1   ham      &lt;#&gt; ISH MINUTES WAS 5 MINUTES AGO. WTF.
2  spam  (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3  spam  * FREE* POLYPHONIC RINGTONE Text SUPER to 8713...
4  spam  **FREE MESSAGE**Thanks for using the Auction S...


# Separate input features and output

In [4]:
# Pull the label column (target) and the message text column (feature) out of the frame
spam_classes_raw, spam_messages = spam_data["CLASS"], spam_data["SMS"]

# Preprocessing

In [8]:
# Fetch the NLTK resources used for tokenization, stop-word removal and lemmatization
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# nltk.word_tokenize loads 'punkt_tab' on recent NLTK releases and 'punkt' on
# older ones; requesting both keeps the notebook runnable on a fresh environment
# with either version. 'wordnet' and 'omw-1.4' back the WordNet lemmatizer.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'omw-1.4', 'wordnet'):
    nltk.download(resource)

# Shared lemmatizer instance used by the custom tokenizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mirza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mirza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mirza\AppData\Roaming\nltk_data...
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mirza\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Custom tokenizer that removes stop words and applies lemmatization
_stopword_cache = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if 'english' not in _stopword_cache:
        _stopword_cache['english'] = frozenset(stopwords.words('english'))
    return _stopword_cache['english']


def customtokenize(str):
    """Tokenize a message, drop English stop words and lemmatize the rest.

    Parameters
    ----------
    str : str
        Raw text of one message. (The parameter name shadows the builtin; it is
        kept unchanged so existing callers are not affected.)

    Returns
    -------
    list of str
        Lemmatized tokens, in their original order, with stop words removed.
    """
    # Converting the string into tokens
    tokens = nltk.word_tokenize(str)
    # The stop-word list is fetched once and cached: the previous version
    # re-evaluated stopwords.words('english') for every token and then did a
    # linear list lookup. Matching stays case-sensitive, exactly as before.
    stop_words = _english_stopwords()
    # Filter stop words and lemmatize in a single pass
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    

In [10]:
# TF-IDF vectorizer (an NLP weighting technique) that delegates tokenization
# to customtokenize
vectorizer = TfidfVectorizer(tokenizer=customtokenize)

# Learn the vocabulary and IDF weights from the messages and vectorize them.
# NOTE(review): the vectorizer is fitted on every message before the train/test
# split is made, so the IDF weights also reflect the test rows - consider
# fitting on the training split only.
tfidf = vectorizer.fit_transform(raw_documents=spam_messages)

# Densify the sparse TF-IDF matrix into an array, the preferred input for keras
tfidf_array = tfidf.toarray()

In [11]:
# Encoder that maps the raw class labels to a numeric representation
label_encoder = LabelEncoder()

# Encode the labels, then expand them into 2-column one-hot vectors for keras
spam_classes = tf.keras.utils.to_categorical(
    label_encoder.fit_transform(spam_classes_raw),
    num_classes=2,
)

In [12]:
# Splitting the dataset into training (90%) and test (10%) data.
# A fixed random_state makes the shuffle deterministic, so the same rows land in
# the same split on every run and downstream results can be compared.
SPLIT_RANDOM_STATE = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    tfidf_array,
    spam_classes,
    test_size=0.10,
    random_state=SPLIT_RANDOM_STATE,
)