In [17]:
# https://medium.com/ai-techsystems/spam-text-classification-on-cainvas-c0861db9393b
import pandas as pd
import numpy as np
import nltk


df = pd.read_csv('datasets/spam.csv')

In [18]:
print(df.shape)
df.head(10)

(5572, 2)


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [19]:
# Distribution of score values
df['Category'].value_counts()

df = df.drop_duplicates()

# Distribution of score values
df['Category'].value_counts()

ham     4516
spam     641
Name: Category, dtype: int64

In [20]:
# Labels as 1 - spam or 0 - ham
df['Category'] = df['Category'].apply(lambda x : 1 if x == 'spam' else 0)
df.head(10)

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [21]:
# Remove html tags
def removeHTML(sentence):
    regex = re.compile('<.*?>')
    return re.sub(regex, ' ', sentence)

# Remove URLs
def removeURL(sentence):
    regex = re.compile('http[s]?://\S+')
    return re.sub(regex, ' ', sentence)

# remove numbers, punctuation and any special characters (keep only alphabets)
def onlyAlphabets(sentence):
    regex = re.compile('[^a-zA-Z]')
    return re.sub(regex, ' ', sentence)

def removeRecurring(sentence):
    return re.sub(r'(.)\1{2,}', r'\1', sentence)

# Defining stopwords
stop = nltk.corpus.stopwords.words('english')

In [22]:
import re
from nltk.tokenize import RegexpTokenizer

sno = nltk.stem.SnowballStemmer('english')    # Initializing stemmer
spam = []    # All words in positive reviews
ham  = []    # All words in negative reviews
all_sentences = []    # All cleaned sentences


for x in range(len(df['Message'].values)):
    review = df['Message'].values[x]
    rating = df['Category'].values[x]

    cleaned_sentence = []
    sentence = removeURL(review) 
    sentence = removeHTML(sentence)
    sentence = onlyAlphabets(sentence)
    sentence = sentence.lower()   

    sentence = removeRecurring(sentence)  

    for word in sentence.split():
        #if word not in stop:
            stemmed = sno.stem(word)
            cleaned_sentence.append(stemmed)
            
            if rating == 1 :
                spam.append(stemmed)
            else:
                ham.append(stemmed)

    all_sentences.append(' '.join(cleaned_sentence))

# add as column in dataframe
df['Cleaned'] = all_sentences

In [23]:
df.head(10)

Unnamed: 0,Category,Message,Cleaned
0,0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazi avail onli in bugi...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri in a wkli comp to win fa cup final ...
3,0,U dun say so early hor... U c already then say...,u dun say so earli hor u c alreadi then say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goe to usf he live around...
5,1,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey there darl it s been week s now an...
6,0,Even my brother is not like to speak with me. ...,even my brother is not like to speak with me t...
7,0,As per your request 'Melle Melle (Oru Minnamin...,as per your request mell mell oru minnaminungi...
8,1,WINNER!! As a valued network customer you have...,winner as a valu network custom you have been ...
9,1,Had your mobile 11 months or more? U R entitle...,had your mobil month or more u r entitl to upd...


In [24]:
from sklearn.model_selection import train_test_split
# Splitting into train, val and test set -- 80-10-10 split

# First, an 80-20 split
train_df, val_test_df = train_test_split(df, test_size = 0.2, random_state = 113)

# Then split the 20% into half
val_df, test_df = train_test_split(val_test_df, test_size = 0.5, random_state = 113)

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range = (1,1), max_features=20000)

train_bow = cv.fit_transform(train_df['Cleaned'])
val_bow = cv.transform(val_df['Cleaned'])
test_bow = cv.transform(test_df['Cleaned'])

In [26]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer().fit(train_bow)

train_tf = tfidf.transform(train_bow)
val_tf = tfidf.transform(val_bow)
test_tf = tfidf.transform(test_bow)

In [27]:
Xtrain = train_tf.toarray()
ytrain = train_df['Category']

Xval = val_tf.toarray()
yval = val_df['Category']

ytest = test_df['Category']
Xtest = test_tf.toarray()

In [28]:
from tensorflow import keras
from keras import layers
import tensorflow as tf
from keras.models import Model
from keras.models import Sequential

model = tf.keras.Sequential([
    layers.Dense(16, activation = 'relu', input_shape = Xtrain[0].shape),   
    layers.Dense(4, activation = 'relu'),
    layers.Dense(1, activation = 'sigmoid')
])

cb = [tf.keras.callbacks.EarlyStopping(patience = 5, restore_best_weights = True)]

model.compile(optimizer = tf.keras.optimizers.Adam(0.0001), loss = tf.keras.losses.BinaryCrossentropy(), metrics = ['accuracy'])

history = model.fit(Xtrain, ytrain, validation_data = (Xval, yval), epochs = 64, callbacks = cb)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64


In [29]:
model.evaluate(Xtest,ytest)



[0.12064392864704132, 0.9689922332763672]

In [30]:
x = np.random.randint(0, Xtest.shape[0] - 1)

sentence = test_df['Message'].values[x]
print("Sentence: ", sentence)

cleaned_sentence = []
sentence = removeURL(sentence) 
sentence = removeHTML(sentence)
sentence = onlyAlphabets(sentence) 
sentence = sentence.lower() 
sentence = removeRecurring(sentence)

for word in sentence.split():
    #if word not in stop:
        stemmed = sno.stem(word)
        cleaned_sentence.append(stemmed)

sentence = [' '.join(cleaned_sentence)]
print("\nCleaned sentence: ", sentence[0])

sentence = cv.transform(sentence)
sentence = tfidf.transform(sentence)

print("\nTrue value: ", test_df['Category'].values[x])

pred = model.predict(sentence.toarray())[0][0]
print("\nPredicted value: ", int(pred>0.5), "(", pred, "-->", (pred>0.5).astype('int'), ")")

Sentence:  Reverse is cheating. That is not mathematics.

Cleaned sentence:  revers is cheat that is not mathemat

True value:  0

Predicted value:  0 ( 0.011027836 --> 0 )


In [31]:
model.save('models/spam.model.h5')