In [None]:
import nltk
import inflect
import contractions
import re 
import string 
import unicodedata
import joblib
import warnings
import pickle

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
warnings.filterwarnings("ignore")

In [None]:
nltk.download('wordnet')
nltk.download('stopword')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Guhan.B\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Error loading stopword: Package 'stopword' not found in
[nltk_data]     index
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Guhan.B\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def removeNoise(text):
    parser = BeautifulSoup(text, "html.parser")
    text = parser.get_text()
    text = contractions.fix(text)
    return text

def tokenize(text):
    return nltk.word_tokenize(text)

def removeNonaASCII(words):
    newWords = []
    for word in words:
        newWord = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        newWords.append(newWord)
    return newWords

def toLowercase(words):
    newWords = []
    for word in words:
        newWord = word.lower()
        newWords.append(newWord)
    return newWords

def removePunctuation(words):
    newWords = []
    for word in words:
        newWord = re.sub(r'[^\w\s]', '', word)
        if newWord != '':
            newWords.append(newWord)
    return newWords

def replaceNumbers(words):
    engine = inflect.engine()
    newWords = []
    for word in words:
        if word.isdigit():
            newWord = engine.number_to_words(word)
            newWords.append(newWord)
        else:
            newWords.append(word)
    return newWords

def removeStopwords(words):
    newWords = []
    for word in words:
        if word not in stopwords.words('english'):
            newWords.append(word)
    return newWords

def stemWords(words):
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatizeVerbs(words):
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalizeText(words):
    words = removeNonaASCII(words)
    words = toLowercase(words)
    words = removePunctuation(words)
    words = removeStopwords(words)
    words = stemWords(words)
    words = lemmatizeVerbs(words)
    return words

def processText(text):
    text = removeNoise(text)
    text = ' '.join(normalizeText(tokenize(text)))
    return text

In [None]:
data = pd.read_csv("./dataset/spam_train.csv")

print("Number of rows in data:", data.shape[0])
print("Number of columns in data:", data.shape[1])

data.head()

Number of rows in data: 159571
Number of columns in data: 7


Unnamed: 0,Message,SC_1,SC_2,SC_3,SC_4,SC_5,SC_6
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# Label => 0 = HAM, 1 = SPAM

categories = ["SC_1", "SC_2", "SC_3", "SC_4", "SC_5", "SC_6"]
data["Label"] = 0

for category in categories:
    data["Label"] += data[category]

data.head()

Unnamed: 0,Message,SC_1,SC_2,SC_3,SC_4,SC_5,SC_6,Label
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0


In [None]:
ham_rows  = data[data["Label"] == 0]
spam_rows = data[data["Label"] == 1]

drop_indices = np.random.choice(ham_rows.index, ham_rows.shape[0] - spam_rows.shape[0] , replace=False)

ham_rows = ham_rows.drop(drop_indices)

data_balanced = pd.concat([ham_rows, spam_rows], axis=0)
data_balanced  = data_balanced.sample(frac=1).reset_index()

In [None]:
print("Number of rows in data:", data_balanced.shape[0])
print("Number of columns in data:", data_balanced.shape[1])

data_balanced.head()

Number of rows in data: 12720
Number of columns in data: 9


Unnamed: 0,index,Message,SC_1,SC_2,SC_3,SC_4,SC_5,SC_6,Label
0,152385,There's that Liberal self-righteousness I knew...,1,0,0,0,0,0,1
1,71647,"In your recent edit, you say that you are a re...",0,0,0,0,0,0,0
2,38875,"Up yours, you authoritarian little Hitlers.",1,0,0,0,0,0,1
3,120659,No that is not the objection. Nor is that comm...,0,0,0,0,0,0,0
4,55556,"needs categories \n\nLike the header says, thi...",0,0,0,0,0,0,0


In [None]:
data_balanced["Message"].apply(processText)
data_balanced.head()

Unnamed: 0,index,Message,SC_1,SC_2,SC_3,SC_4,SC_5,SC_6,Label
0,152385,There's that Liberal self-righteousness I knew...,1,0,0,0,0,0,1
1,71647,"In your recent edit, you say that you are a re...",0,0,0,0,0,0,0
2,38875,"Up yours, you authoritarian little Hitlers.",1,0,0,0,0,0,1
3,120659,No that is not the objection. Nor is that comm...,0,0,0,0,0,0,0
4,55556,"needs categories \n\nLike the header says, thi...",0,0,0,0,0,0,0


In [None]:
data_balanced.to_csv("./dataset/spam_train_balanced.csv")

In [None]:
vectorizer = TfidfVectorizer(
    encoding="utf-8", 
    strip_accents="unicode", 
    stop_words="english", 
    lowercase=True, 
    max_features=10000
)
tfidf_result = vectorizer.fit_transform(data_balanced["Message"])
tfidf_result.shape

(12720, 10000)

In [None]:
pickle.dump(vectorizer, open('./models/message_vectorizer.pkl', 'wb'))

In [None]:
SDC_X = tfidf_result.toarray()
SDC_Y = data_balanced["Label"].values

np.save("./dataset/SDC_X.npy", SDC_X)
np.save("./dataset/SDC_Y.npy", SDC_Y)

In [None]:
embeddings = {}
glove = open("./models/glove.6B.50d.txt", encoding="utf8")
for line in glove:
    values = line.split()
    word = values[0]
    try:
        C = np.asarray(values[1:], dtype='float32')
    except:
        pass
    embeddings[word] = C  
glove.close()
pickle.dump(embeddings, open('./dataset/glove_embeddings.pkl', 'wb'))
print(f'{len(embeddings)} Word vectors')

400000 Word vectors


In [None]:
tokenizer = Tokenizer(num_words=75000)
tokenizer.fit_on_texts(data_balanced["Message"])
sequences = tokenizer.texts_to_sequences(data_balanced["Message"])
paddedSequence = pad_sequences(sequences, maxlen=500)
wordIndex = tokenizer.word_index
print(f'{len(wordIndex)} Unique tokens')
pickle.dump(tokenizer, open('./models/message_tokenizer.pkl', 'wb'))

41866 Unique tokens


In [None]:
SCC_X = paddedSequence
SCC_Y = data_balanced[["SC_1", "SC_2", "SC_3", "SC_4", "SC_5", "SC_6"]].values

np.save("./dataset/SCC_X.npy", SCC_X)
np.save("./dataset/SCC_Y.npy", SCC_Y)