In [161]:
import tensorflow as tf
#gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3333)
#sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

import os
os.environ["CUDA_DEVICE_ORDER"]='PCI_BUS_ID'
os.environ["CUDA_VISIBLE_DEVICES"]='2'

import keras
from keras.models import Model, Sequential
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Dense, Lambda, Flatten, Dropout
from keras.layers.convolutional import Convolution1D
from keras.layers.convolutional import MaxPooling1D
from keras.optimizers import Adam, sgd
from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
from keras.layers.embeddings import Embedding
from keras.regularizers import l1, l2
from keras.preprocessing import image, sequence

import pandas as pd
import numpy as np
import _pickle as pickle
import bcolz

#if 'session' in locals() and session is not None:
#    print('Close interactive session')
#    session.close()

## Get IMDB reviews with labels, prepare for our model

In [24]:
#get imdb dataset and sort the word index list by index/rank (already ranked for us!)
from keras.datasets import imdb
idx = imdb.get_word_index()
idx_arr = sorted(idx, key=idx.get)
idx_arr[:10]

['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']

In [105]:
#import reviews directly as Jeremy doesn't like what Keras does with it when it imports it
from keras.utils.data_utils import get_file
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
f = open(path, 'rb')
(x_train, labels_train), (x_test, labels_test) = pickle.load(f)

1. Build map from id to word
3. Check out first review in id format and word format
4. Check out training labels
5. Reduce vocab size to 5000 most common words (replace rest with 5000th)
6. Check out the distribution of word length
7. Pad shorter reviews (keras - sequence.pad_sequences) with zeros or truncate longer reviews to bring all to 500 words long
8. Training shape should now be (25000, 500)

In [49]:
idx2word = {idx[k]: k for k in idx.keys()}

In [61]:
", ".join([str(idx) for idx in x_train[0]])

'23022, 309, 6, 3, 1069, 209, 9, 2175, 30, 1, 169, 55, 14, 46, 82, 5869, 41, 393, 110, 138, 14, 5359, 58, 4477, 150, 8, 1, 5032, 5948, 482, 69, 5, 261, 12, 23022, 73935, 2003, 6, 73, 2436, 5, 632, 71, 6, 5359, 1, 25279, 5, 2004, 10471, 1, 5941, 1534, 34, 67, 64, 205, 140, 65, 1232, 63526, 21145, 1, 49265, 4, 1, 223, 901, 29, 3024, 69, 4, 1, 5863, 10, 694, 2, 65, 1534, 51, 10, 216, 1, 387, 8, 60, 3, 1472, 3724, 802, 5, 3521, 177, 1, 393, 10, 1238, 14030, 30, 309, 3, 353, 344, 2989, 143, 130, 5, 7804, 28, 4, 126, 5359, 1472, 2375, 5, 23022, 309, 10, 532, 12, 108, 1470, 4, 58, 556, 101, 12, 23022, 309, 6, 227, 4187, 48, 3, 2237, 12, 9, 215'

In [55]:
" ".join([idx2word[idx] for idx in x_train[0]])

"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i'm here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't"

In [62]:
labels_train[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [63]:
len(idx) #number of unique words

88584

In [112]:
vocab_size = 5000
trn = [np.array(x) for x in x_train]
for i in range(len(trn)): trn[i][trn[i]>vocab_size] = vocab_size
test = [np.array(x) for x in x_test]
for i in range(len(test)): test[i][test[i]>vocab_size] = vocab_size

In [113]:
lens = np.array([len(x) for x in trn])
(lens.max(), lens.min(), lens.mean()) #review length stats

(2493, 10, 237.71364)

In [114]:
seq_len = 500
trn = sequence.pad_sequences(trn, maxlen=seq_len)
test = sequence.pad_sequences(test, maxlen=seq_len)

In [115]:
(trn.shape, test.shape)

((25000, 500), (25000, 500))

## The "simple" model!

First build a simple single hidden layer NN
1. create a sequential model with layers: embedding, flatten, dense(100, relu), dropout(0.7), dense(1, sigmoid)
2. compile model (binary crossentropy loss and adam optimizer with accuracy metric) and check summary
3. fit model with batch size 64 and 2 epochs

In [138]:
latent_factors = 32
model = Sequential([Embedding(input_dim=vocab_size, output_dim=latent_factors, input_length=seq_len),
                   Flatten(),
                   Dense(output_dim=100, activation="relu"),
                   Dropout(0.9), #Model wants to overfit data after only 1 epoch so bumped this up with similar accuracy resulting
                   Dense(output_dim=1, activation="sigmoid")])

In [139]:
model.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["accuracy"])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_8 (Embedding)          (None, 500, 32)       160000      embedding_input_7[0][0]          
____________________________________________________________________________________________________
flatten_8 (Flatten)              (None, 16000)         0           embedding_8[0][0]                
____________________________________________________________________________________________________
dense_15 (Dense)                 (None, 100)           1600100     flatten_8[0][0]                  
____________________________________________________________________________________________________
dropout_8 (Dropout)              (None, 100)           0           dense_15[0][0]                   
___________________________________________________________________________________________

In [141]:
model.fit(trn, labels_train, batch_size=64, nb_epoch=2, validation_data=(test, labels_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x22ec3b9ca90>

## Convolutional NN!

Implement a 1D CNN
1. Sequential model of: embedding, dropout(0.2), conv1d(64,5), dropout(0.2), maxpooling1d, flatten, dense(100), dropout(0.7), dense(1)
2. Compile (same as before)
3. fit (4 epochs)

Dropout in Embedding removes some of the latent factors, dropout afterwards removes some of the words

In [153]:
conv1 = Sequential([Embedding(input_dim=vocab_size, output_dim=latent_factors, input_length=seq_len, dropout=0.2),
                   Dropout(0.3),
                   Convolution1D(nb_filter=64, filter_length=5, border_mode="valid", activation="relu"),
                   Dropout(0.3),
                   MaxPooling1D(),
                   Flatten(),
                   Dense(100, activation="relu"),
                   Dropout(0.9),
                   Dense(1, activation="sigmoid")])

In [154]:
conv1.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["accuracy"])
conv1.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_11 (Embedding)         (None, 500, 32)       160000      embedding_input_10[0][0]         
____________________________________________________________________________________________________
dropout_14 (Dropout)             (None, 500, 32)       0           embedding_11[0][0]               
____________________________________________________________________________________________________
convolution1d_3 (Convolution1D)  (None, 496, 64)       10304       dropout_14[0][0]                 
____________________________________________________________________________________________________
dropout_15 (Dropout)             (None, 496, 64)       0           convolution1d_3[0][0]            
___________________________________________________________________________________________

In [155]:
conv1.fit(trn, labels_train, batch_size=64, nb_epoch=4, validation_data=(test, labels_test))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x22f25518ac8>

In [156]:
conv1.optimizer.lr = 0.001

In [157]:
conv1.fit(trn, labels_train, batch_size=64, nb_epoch=4, validation_data=(test, labels_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x22f29841198>

## Using pre-trained vectors

Now we use pre-trained embedding values from glove with the previous CNN, and BOOM, accuracy increase!
1. get the weights
2. unpack weights
3. glove uses different idx2word dict than imdb, so need to match 

In [166]:
def get_glove_dataset(dataset):
    """Download the requested glove dataset from files.fast.ai
    and return a location that can be passed to load_vectors.
    """
    # see wordvectors.ipynb for info on how these files were
    # generated from the original glove data.
    md5sums = {'6B.50d': '8e1557d1228decbda7db6dfd81cd9909',
               '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',
               '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',
               '6B.300d': '30290210376887dcc6d0a5a6374d8255'}
    glove_path = os.path.abspath('data/glove/results')
    %mkdir -p $glove_path
    return get_file(dataset,
                    'http://files.fast.ai/models/glove/' + dataset + '.tgz',
                    cache_subdir=glove_path,
                    md5_hash=md5sums.get(dataset, None),
                    untar=True)

def load_vectors(loc):
    return (bcolz.open(loc+'.dat'),
        pickle.load(open(loc+'_words.pkl','rb')),
        pickle.load(open(loc+'_idx.pkl','rb')))

In [167]:
vecs, words, wordidx = load_vectors(get_glove_dataset('6B.50d'))

A subdirectory or file -p already exists.
Error occurred while processing: -p.
A subdirectory or file D:\jtownend\fast.ai\data\glove\results already exists.
Error occurred while processing: D:\jtownend\fast.ai\data\glove\results.


Untaring file...


UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 0: ordinal not in range(128)

In [169]:
vecs = bcolz.open(get_glove_dataset('6B.50d')+'.dat')

A subdirectory or file -p already exists.
Error occurred while processing: -p.
A subdirectory or file D:\jtownend\fast.ai\data\glove\results already exists.
Error occurred while processing: D:\jtownend\fast.ai\data\glove\results.


Untaring file...


In [187]:
words = pickle.load(open(get_glove_dataset('6B.50d')+'_words.pkl', encoding='bytes'))

A subdirectory or file -p already exists.
Error occurred while processing: -p.
A subdirectory or file D:\jtownend\fast.ai\data\glove\results already exists.
Error occurred while processing: D:\jtownend\fast.ai\data\glove\results.


Untaring file...


LookupError: unknown encoding: bytes

In [171]:
wordidx = pickle.load(open(get_glove_dataset('6B.50d')+'_idx.pkl','rb'))

A subdirectory or file -p already exists.
Error occurred while processing: -p.
A subdirectory or file D:\jtownend\fast.ai\data\glove\results already exists.
Error occurred while processing: D:\jtownend\fast.ai\data\glove\results.


Untaring file...


UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 3: ordinal not in range(128)