In [1]:
# RNN with LSTM for Spam/Ham Email Classification
import pandas as pd
import numpy as np
from keras.models import Model,Sequential
from keras.layers import LSTM,Dense,Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

In [2]:
# Dataset source:
# https://raw.githubusercontent.com/TrainingByPackt/Deep-Learning-for-Natural-Language-Processing/master/Lesson%2007/exercise/spam.csv
# This cell uploads a local copy of that file into the Colab session. The recorded
# output shows it was saved as 'spam.csv.txt', which is the name the next cell reads.
# The import lives here (not in the imports cell) because google.colab only exists on Colab.
from google.colab import files
uploaded = files.upload()

Saving spam.csv.txt to spam.csv.txt


In [3]:
# The uploaded copy is named 'spam.csv.txt'. Decode it with the 'latin' codec and
# re-save it as 'spam.csv' without writing the row index.
pd.read_csv('spam.csv.txt', sep=',', encoding='latin').to_csv('spam.csv', index=None)
# Work from the re-saved copy from here on.
df = pd.read_csv('spam.csv', sep=',')
# Preview the first rows
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Keep only the label column (v1) and the message text column (v2);
# the remaining 'Unnamed' columns are dropped.
df = df.loc[:, ['v1', 'v2']]
# Class balance of the labels
df['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [5]:
# Encode the labels as integers: ham -> 0, spam -> 1
mapping = {'ham':0,'spam':1}
labels = df['v1'].map(mapping)
# Series.map turns every label that is missing from `mapping` into NaN. Fail loudly
# here instead of silently handing NaN targets to the model.
if labels.isna().any():
    unexpected = sorted(df.loc[labels.isna(), 'v1'].astype(str).unique())
    raise ValueError('Unexpected labels in column v1: {}'.format(unexpected))
# Targets (0/1) and raw message texts as NumPy arrays
Y = labels.values
X = df['v2'].values

In [6]:
# Vocabulary cap for the tokeniser. Keras keeps only the num_words-1 most frequent
# words, so with max_words = 100 the 99 most frequent words receive an index.
max_words = 100
tokeniser = Tokenizer(num_words=max_words, lower=True, split=' ')
# Build the word index from the message texts
tokeniser.fit_on_texts(X)
# Each message becomes a list of word indices; words outside the kept vocabulary are dropped
text_tokenised = tokeniser.texts_to_sequences(X)
# First three encoded messages (the first one keeps 6 indices)
text_tokenised[:3]

[[50, 64, 8, 89, 67, 58], [46, 6], [47, 8, 19, 4, 2, 71, 2, 2, 73]]

In [7]:
# Bring every encoded message to exactly max_len indices. Shorter sequences are
# padded at the front and longer ones lose their leading entries; 'pre' is the
# Keras default for both options and is only spelled out here for clarity.
max_len = 50
sequences = sequence.pad_sequences(
    text_tokenised,
    maxlen=max_len,
    padding='pre',
    truncating='pre',
)

In [9]:
# Binary spam classifier: embedding -> LSTM -> single sigmoid output
model = Sequential([
    # Turn each of the max_len word indices into a 20-dimensional vector
    Embedding(max_words, 20, input_length=max_len),
    # Recurrent layer with 64 hidden units
    LSTM(64),
    # One sigmoid unit: the target is 0 (ham) or 1 (spam)
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train for 10 epochs in batches of 128, holding out 20% of the data for validation.
# NOTE(review): Keras is documented to take the validation slice from the end of the
# arrays before shuffling - confirm the CSV rows are not ordered by class.
model.fit(sequences, Y, batch_size=128, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f78ff23f630>

In [10]:
# Sanity check on a spam-like message: encode it with the fitted tokeniser,
# pad it the same way as the training data, then score it with the model.
test_email = 'WINNER! U win a 700 reward & free FA CUp final tickets! Text FA to 34212 now!'
test_sequences = tokeniser.texts_to_sequences([test_email])
test_sequences_matrix = sequence.pad_sequences(test_sequences, maxlen=max_len)
# Sigmoid output: values close to 1 indicate spam
model.predict(test_sequences_matrix)

array([[0.9360942]], dtype=float32)

In [11]:
# Sanity check on an ordinary (ham) message: encode it with the fitted tokeniser,
# pad it the same way as the training data, then score it with the model.
test_email = 'Hello, are you free for Golf tomorrow?'
test_sequences = tokeniser.texts_to_sequences([test_email])
test_sequences_matrix = sequence.pad_sequences(test_sequences, maxlen=max_len)
# Sigmoid output: values close to 0 indicate ham
model.predict(test_sequences_matrix)

array([[0.27068532]], dtype=float32)