In [132]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np

from keras.layers import SimpleRNN, Embedding, Dense, LSTM, Activation, Flatten, Dropout
from keras.models import Sequential
#import matplotlib.pyplot as plt


import numpy as np
from keras.utils import to_categorical
import os

In [94]:
embeddings_index = {} # We create a dictionary of word -> embedding
f = open('./glove.6B.100d.txt', encoding="utf8") # Open file

# In the dataset, each line represents a new word embedding
# The line starts with the word and the embedding values follow
for line in f:
    values = line.split()
    word = values[0] # The first value is the word, the rest are the values of the embedding
    embedding = np.asarray(values[1:], dtype='float32') # Load embedding
    
    = embedding # Add embedding to our embedding dictionary
f.close()

In [95]:
data = pd.read_csv("./SPAMData.csv")
texts = []
labels = []
for i, label in enumerate(data['Category']):
    texts.append(data['Message'][i])
    if label == 'ham':
        labels.append(0)
    else:
        labels.append(1)

texts = np.asarray(texts)
labels = np.asarray(labels)

In [96]:
vocab_size = 100000

tokenizer = Tokenizer(num_words=vocab_size) # Setup tokenizer
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts) # Generate sequences

In [97]:
inv_index = {v: k for k, v in tokenizer.word_index.items()}

max_length = 100
data = pad_sequences(sequences, maxlen=max_length)

In [98]:
labels = to_categorical(np.asarray(labels))
print('Shape of data:', data.shape)
print('Shape of labels:', labels.shape)


Shape of data: (5572, 100)
Shape of labels: (5572, 2)


In [99]:
# Create a matrix of all embeddings
all_embs = np.stack(embeddings_index.values())
emb_mean = all_embs.mean() # Calculate mean
emb_std = all_embs.std() # Calculate standard deviation



embedding_dim = 100

word_index = tokenizer.word_index
nb_words = min(vocab_size, len(word_index)) # How many words are there actually

# Create a random matrix with the same mean and std as the embeddings
embedding_matrix = np.random.normal(emb_mean, emb_std, (vocab_size, embedding_dim))

# The vectors need to be in the same position as their index. 
# Meaning a word with token 1 needs to be in the second row (rows start with zero) and so on

#print(len(word_index))

# Loop over all words in the word index
for word, i in word_index.items():
    # If we are above the amount of words we want to use we do nothing
    if i > vocab_size: 
        continue
    # Get the embedding vector for the word
    #print(i)
    embedding_vector = embeddings_index.get(word)
    # If there is an embedding vector, put it in the embedding matrix
    if embedding_vector is not None: 
        embedding_matrix[i] = embedding_vector

In [136]:
model = Sequential()
model.add(Embedding(vocab_size, 
                    embedding_dim, 
                    input_length=max_length, 
                    weights = [embedding_matrix], 
                    trainable = False))

model.add(LSTM(128,return_sequences=True))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(2))
model.add(Activation('sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 100, 100)          10000000  
_________________________________________________________________
lstm_14 (LSTM)               (None, 100, 128)          117248    
_________________________________________________________________
lstm_15 (LSTM)               (None, 64)                49408     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 130       
_________________________________________________________________
activation_11 (Activation)   (None, 2)                 0         
Total params: 10,166,786
Trainable params: 166,786
Non-trainable params: 10,000,000
__________________________________________________________

In [137]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

model.fit(data,labels,validation_split=0.2,epochs=2)

Train on 4457 samples, validate on 1115 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2187e1275c0>

In [179]:
guess = ["WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."]
guess = np.asarray(guess[:100])
#guess = tokenizer.fit_on_texts(guess)
guess = tokenizer.texts_to_sequences(guess)
#guess = pad_sequences(guess, maxlen=100)
#guess = tokenizer.texts_to_sequences_generator

#print(guess)
print(data[8])


predict = model.predict(guess)
print(predict)

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0  719   72    4  842  440  236    3   17  109  441
    2 2998 1330  154  962    2  129   16 2999  129  414 3000  516  963
  581   65]
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0  689   73    4  803  434  234    3   17  113  435
     2 2341 1208  154  907    2  130   16 2342  130  396 23