## Keras LSTM

### Read data:

In [2]:
import pandas as pd

# The file is tab-separated with no header row, so the column names are supplied here.
# NOTE(review): the raw file may contain unbalanced double quotes; confirm that the
# default quoting does not merge rows (consider quoting=csv.QUOTE_NONE) - TODO verify.
messages = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=["label", "message"],
)
messages.head(5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Summary statistics (count / unique / top / freq) for each column of `messages`.
messages.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [4]:
# Same summary statistics, computed separately for each value of `label`.
messages.groupby('label').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [5]:
# Character count of every message. The vectorised `.str.len()` accessor replaces
# `.apply(len)`: it avoids a Python-level call per row and yields NaN (instead of
# raising TypeError) if a message is ever missing.
messages['length'] = messages['message'].str.len()
messages.head()

Unnamed: 0,label,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


### Train-test split:

In [7]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `train_test_split` lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation.
# - random_state pins the shuffle so a re-run reproduces the same split.
# - stratify keeps the ham/spam ratio the same in both partitions (the classes are
#   imbalanced: 4825 ham vs 747 spam).
msg_train, msg_test, label_train, label_test = train_test_split(
    messages['message'],
    messages['label'],
    test_size=0.2,
    random_state=42,
    stratify=messages['label'],
)

print('Train_size:', len(msg_train), 'Test_size:', len(msg_test))

Train_size: 4457 Test_size: 1115


### Data Prep:

In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [8]:
# Upper bound on the number of distinct word indices the tokenizer will emit.
vocabulary_size = 10000

# Fit the word index on the training messages only; the test messages are encoded
# with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(msg_train)

# Training set: texts -> integer sequences -> fixed-length (50) matrix.
sequences = tokenizer.texts_to_sequences(msg_train)
data = pad_sequences(sequences, maxlen=50)

# Test set: same encoding and the same fixed length.
test_sequence_ = tokenizer.texts_to_sequences(msg_test)
data_test = pad_sequences(test_sequence_, maxlen=50)

### Simple LSTM:

In [9]:
# Baseline network: embedding -> single LSTM -> sigmoid output.
model = Sequential()
# The embedding's input dimension is tied to `vocabulary_size` (the tokenizer's
# `num_words`) instead of a second hard-coded 10000, so the two cannot drift apart.
# input_length=50 matches the `maxlen` used when padding the sequences.
model.add(Embedding(vocabulary_size, 100, input_length=50))
# Dropout is applied to both the layer inputs and the recurrent connections.
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# One sigmoid unit: the model outputs a single probability per message.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [10]:
# Targets are encoded on the fly: 1 where the label is 'ham', 0 otherwise.
# The held-out set is passed as validation data and evaluated after every epoch.
model.fit(
    x=data,
    y=(label_train == 'ham') * 1,
    validation_data=(data_test, (label_test == 'ham') * 1),
    epochs=3,
)

Train on 4457 samples, validate on 1115 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1eef7789f60>

In [11]:
from sklearn.metrics import roc_auc_score

# `Sequential.predict_proba` is deprecated and has been removed from recent
# Keras/TensorFlow releases. With a sigmoid output layer `predict` returns the same
# probabilities and works on both old and new versions.
y_pred = model.predict(data_test)
# ROC AUC on the held-out set, with 'ham' encoded as the positive class (1).
roc_auc_score((label_test == 'ham')*1, y_pred)

0.9938514680483592

### Conv+LSTM:

In [12]:
# Convolutional front-end feeding an LSTM: the Conv1D + max-pooling stage shortens
# the sequence before it reaches the recurrent layer.
model_conv = Sequential([
    Embedding(vocabulary_size, 100, input_length=50),
    Dropout(0.2),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=4),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
model_conv.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [13]:
# Same training setup as the simple LSTM: 'ham' -> 1, everything else -> 0,
# three epochs, with the held-out set evaluated after each epoch.
model_conv.fit(
    x=data,
    y=(label_train == 'ham') * 1,
    validation_data=(data_test, (label_test == 'ham') * 1),
    epochs=3,
)

Train on 4457 samples, validate on 1115 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1eefbd87c18>

In [14]:
from sklearn.metrics import roc_auc_score

# `Sequential.predict_proba` is deprecated and has been removed from recent
# Keras/TensorFlow releases. With a sigmoid output layer `predict` returns the same
# probabilities and works on both old and new versions.
y_pred = model_conv.predict(data_test)
# ROC AUC on the held-out set, with 'ham' encoded as the positive class (1).
roc_auc_score((label_test == 'ham')*1, y_pred)

0.9913298791018998