In this notebook, we adapt the text categorization problem outlined in the [Keras tutorial documentation](https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html).  We begin by simply changing the input data source to the data from the Kaggle contest [Spooky Author Identification](https://www.kaggle.com/c/spooky-author-identification/kernels).

In [1]:
# Definitions

from __future__ import print_function

import io
import os
import sys

import numpy as np

# TensorFlow GPU settings; applied here so they are in the environment
# before the keras imports that follow.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "0",
})

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.optimizers import RMSprop

from tensorflow.python.client import device_lib
# Print the list of local devices reported by TensorFlow, so the cell output
# shows whether a GPU entry is present alongside the CPU.
print(device_lib.list_local_devices())


# Paths and hyper-parameters shared by the cells below.
BASE_DIR = '../data'
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')  # directory the GloVe vector file is read from
TEXT_DATA_DIR = os.path.join(BASE_DIR, 'SpookyData')  # directory train.csv is read from
MAX_SEQUENCE_LENGTH = 1000  # maxlen passed to pad_sequences and input length of the model
MAX_NB_WORDS = 20000  # num_words passed to the Tokenizer; also caps the embedding matrix rows
EMBEDDING_DIM = 300  # number of columns in the embedding matrix (the glove.6B.300d file is loaded)
VALIDATION_SPLIT = 0.2  # fraction of the samples held out as the validation set

import tensorflow as tf

# Sanity check: multiply two small constant matrices and log the device each
# op is placed on, so the cell output shows whether the GPU is being used.
a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
c = tf.matmul(a, b)
# Run the op inside a context manager so this throw-away session is closed as
# soon as the check finishes instead of staying open for the kernel's lifetime.
with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    print(sess.run(c))

Using TensorFlow backend.


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6727931944819527565
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 10904823399
locality {
  bus_id: 1
}
incarnation: 368414887088364895
physical_device_desc: "device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1"
]
[[ 22.  28.]
 [ 49.  64.]]


We need to get the data into a form where we can use it for training and prediction.  For that we use the pandas library.

In [2]:
import pandas as pd

# Load the training CSV into a DataFrame and preview its first rows.
train_csv_path = os.path.join(TEXT_DATA_DIR, 'train.csv')
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
# Give every distinct author label an integer class id (its position in
# df['author'].unique()) and attach that id to the frame as `author_id`.
authors = {auth: idx for idx, auth in enumerate(df['author'].unique())}
print(authors)
df['author_id'] = df['author'].map(lambda name: authors[name])

df.head()

{'EAP': 0, 'MWS': 2, 'HPL': 1}


Unnamed: 0,id,text,author,author_id
0,id26305,"This process, however, afforded me no means of...",EAP,0
1,id17569,It never once occurred to me that the fumbling...,HPL,1
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,0
3,id27763,How lovely is spring As we looked from Windsor...,MWS,2
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,1


In [4]:
# English stop-word list, used below to filter the vocabulary.
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))

# now we will use the text and author_id fields to train a classifier.
#  We have to:
#  1. Get the sentences and their class ids,
sents = df['text'].tolist()
labels = df['author_id'].tolist()

#  2. Tokenize each sentence into a list of integer word indices,
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(sents)
sequences = tokenizer.texts_to_sequences(sents)
print(len(sequences))
print(sequences[0])

# Build the word -> index vocabulary without stop words.
# NOTE: only this mapping is filtered; `sequences` above was produced from the
# tokenizer's full vocabulary and still contains the stop-word indices.
print('Found %s unique tokens before stopwords removal.' % len(tokenizer.word_index))
print(list(tokenizer.word_index.items())[:5])
word_index = {w: i for w, i in tokenizer.word_index.items() if w not in stops}
print('Found %s unique tokens after stopwords removal.' % len(word_index))


# Pad the sequences to a common length and one-hot encode the class ids.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
# A dedicated, seeded RandomState makes the shuffle (and therefore the split)
# identical on every run without touching numpy's global random state.
SPLIT_SEED = 42
indices = np.arange(data.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Slice from an explicit split point: a negative-offset slice such as
# data[:-0] would return an EMPTY training set when the validation count is 0.
num_training_samples = data.shape[0] - num_validation_samples

x_train = data[:num_training_samples]
y_train = labels[:num_training_samples]
x_val = data[num_training_samples:]
y_val = labels[num_training_samples:]
y_val[:5]

19579
[26, 2945, 143, 1372, 22, 36, 294, 2, 7451, 1, 2440, 2, 10, 4556, 16, 6, 79, 179, 48, 4245, 3, 295, 4, 1, 249, 1943, 6, 326, 74, 134, 123, 891, 2, 1, 313, 39, 1438, 4928, 98, 1, 430]
Found 25943 unique tokens before stopwords removal.
[('superhuman', 7725), ('ripples', 7813), ('singleness', 12350), ('dig', 8374), ('reasons', 2213)]
Found 25808 unique tokens after stopwords removal.
Shape of data tensor: (19579, 1000)
Shape of label tensor: (19579, 3)


array([[ 0.,  1.,  0.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.]])

In [5]:
#  3. Load embeddings
# Build a word -> vector lookup from the GloVe text file.  Each line is split
# on whitespace: the first field is the word, the remaining fields are the
# vector coefficients.
embeddings_index = {}
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.300d.txt')
# io.open with an explicit encoding reads the file the same way regardless of
# the platform's default locale, and the `with` block closes the file even if
# a line fails to parse.
with io.open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))


Found 400000 word vectors.


In [6]:
#  4. Create the Embedding matrix for the training set
# The matrix needs one row per token index that can appear in the sequences,
# i.e. (largest tokenizer index + 1), capped at MAX_NB_WORDS.  It must NOT be
# sized by len(word_index): that dict has had stop words removed, so its
# length is smaller than the largest index it contains, and for a vocabulary
# below MAX_NB_WORDS the matrix would be too small (IndexError in the loop
# below / out-of-range lookups in the Embedding layer).
num_words = min(MAX_NB_WORDS, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
unk = []  # in-range words that have no pre-trained vector
for word, i in word_index.items():
    if i >= num_words:
        # index falls outside the matrix; no row to fill
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        # words not found in embedding index will be all-zeros.
        unk.append(word)
print(len(unk))

2092


In [7]:
# Initialise an Embedding layer from the pre-trained embedding matrix.
# trainable=True: the embedding weights are NOT frozen; they are updated
# together with the rest of the model during training.
embedding_layer = Embedding(input_dim=num_words,
                            output_dim=EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

print('Training model.')

# 1D convnet: embedding -> conv -> global max pooling -> dense -> dropout -> softmax
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(filters=128, kernel_size=5, activation='relu')(embedded_sequences)
# (disabled) optional extra stage: MaxPooling1D() followed by a second
# Conv1D(128, 5, activation='relu')
x = GlobalMaxPooling1D()(x)
x = Dense(units=128, activation='relu')(x)
x = Dropout(rate=.5)(x)
# one softmax output per author class
preds = Dense(units=len(authors), activation='softmax')(x)

rms = RMSprop(lr=0.003)
model = Model(inputs=sequence_input, outputs=preds)
# (disabled) alternative loss with the same optimizer and metrics:
# 'categorical_crossentropy'
model.compile(optimizer=rms,
              loss='mean_squared_logarithmic_error',
              metrics=['acc'])

Training model.


In [8]:
# Fit the model on the training split, passing the held-out split as
# validation data; the returned History object is the cell's output.
fit_options = dict(batch_size=100,
                   epochs=50,
                   validation_data=(x_val, y_val))
model.fit(x_train, y_train, **fit_options)

Train on 15664 samples, validate on 3915 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f7c954ff8d0>

In [9]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 300)         6000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 996, 128)          192128    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 387       
Total para