Import necessary libraries

In [1]:
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer

import tensorflow as tf
import pandas as pd
import numpy as np

Read the data

In [2]:
data = pd.read_csv('SPAM text message 20170820 - Data.csv')

data.head(20)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


Create a function called "encode" which tokenizes each message to feed in the model and a function "decode" to reverse encode function

In [8]:
MAX_LEN = 150
tokenizer = Tokenizer()
def encode(text, column=True):
    if column:
        words = [w for w in text]
    else:
        words = [text]   
    tokenizer.fit_on_texts(words)
    words_to_sequence = tokenizer.texts_to_sequences(words)
    padded_sequence = tf.keras.utils.pad_sequences(words_to_sequence, maxlen=MAX_LEN)


    return padded_sequence


def decode(sequence):
    sentence = tokenizer.sequences_to_texts(sequence)

    return sentence

In [4]:
data["Category"] = data['Category'].map({'ham' : 0, 'spam': 1})

In [9]:
X = encode(data['Message'])
y = data['Category'].values

#split the data for training and evaluation
from sklearn.model_selection import train_test_split

train_features, test_features, train_label, test_label = train_test_split(X, y, random_state=42, shuffle=True, test_size=.2)


Initialize the Sequential model

In [10]:
VOCAB_SIZE = 10000

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, input_length=MAX_LEN, output_dim=6),
    tf.keras.layers.LSTM(128),
    
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(.2),    
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 6)            60000     
                                                                 
 lstm (LSTM)                 (None, 128)               69120     
                                                                 
 flatten (Flatten)           (None, 128)               0         
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 133,281
Trainable params: 133,281
Non-trai

Model Training

In [11]:
history = model.fit(train_features, train_label, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
result = model.evaluate(test_features, test_label)
print(result)

[0.05086665600538254, 0.9883407950401306]


In [13]:
def predict(text):
    encoded_text = encode(text, False)
    pred = (model.predict(encoded_text) > 0.005).astype("int32")
    pred = pred.flatten()[0]
    
    return "SPAM" if pred == 1 else "HAM"

Test model for prediction

In [14]:
text = "WINNER!! As a valued network customer you have been selected to receivea �900 prize reward! claim"
pred = (model.predict(encode(text, False)) > .005)
print(pred)

[[False]]


In [16]:
text = "Get free service and win a prize just call this number 12103239293"

predict(text)



'SPAM'

In [17]:
text = "hi i would like to ask your name"

predict(text)



'HAM'

In [18]:
text = "Free free, you have a chance to win prize"

predict(text)




'SPAM'

In [19]:
text = 'Hi, what time are you available?'

predict(text)



'HAM'

In [21]:
text = 'can you arrange my clothes tonight, i have some work to do'

predict(text)



'HAM'

In [20]:
text = "Get free service and win a prize just call this number 12103239293"

predict(text)



'SPAM'