In [46]:
import pandas as pd
from Utils.misc import batch
from nltk.corpus import gutenberg
from spacy.en import English
from Utils.indexer import build_index
from DataLoader import GloVe
from keras.preprocessing.sequence import pad_sequences
from Utils.generator import random_access
import numpy as np
from keras_tqdm import TQDMNotebookCallback
from keras.callbacks import Callback, ModelCheckpoint
from keras.layers import Activation, dot, add, MaxPooling1D, MaxPooling2D, Bidirectional, Input, GRU, LSTM, SimpleRNN, Conv1D, Conv2D, Conv2DTranspose, Dense, Flatten, Dropout, Reshape, Embedding, Concatenate
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.constraints import unit_norm
from keras.initializers import Identity
from keras import backend as K
from keras.engine.topology import Layer
import tensorflow as tf

In [2]:
# Instantiate the spaCy English pipeline; it is applied below to the quotes
# file and to the Gutenberg corpus, whose tokens and sentences are then used.
nlp = English()

In [3]:
# Parse every quote as one spaCy document. The file is read as tab-separated
# with no header row, and column 1 is joined line by line
# (column 0 is presumably the author -- TODO confirm against the file).
# `.values` is used instead of `.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0; `.values` exists in all versions.
quotes = nlp('\n'.join(
    pd.read_csv('./download/author-quote.txt', header=None, delimiter='\t')[1].values
))

In [22]:
# Run the same pipeline over the raw NLTK Gutenberg text. `raw()` is called
# without file ids, so presumably this is the whole corpus -- TODO confirm.
# Its sentences are later labelled 0 (non-quote) when `data` is built.
corpus = nlp(gutenberg.raw())

In [5]:
# Sanity check: display the first element of the parsed quotes document.
quotes[0]

If

In [31]:
def words_generator():
    """Yield the lower-cased text of every token, quotes first, then the corpus.

    Reads the module-level ``quotes`` and ``corpus`` documents; each is only
    looked up when iteration reaches it.
    """
    def _lowered(document):
        # Lower-case the surface text of each token in the given document.
        for token in document:
            yield token.text.lower()

    for text in _lowered(quotes):
        yield text
    for text in _lowered(corpus):
        yield text

In [32]:
# Index every lower-cased token produced by words_generator().
# build_index hands back a callable word -> index lookup (o2i), its
# counterpart (i2o, presumably index -> word -- TODO confirm) and the
# vocabulary size.
o2i, i2o, size = build_index(words_generator())
print(size)
# Spot-check the indices assigned to a few very common words, one per line.
print('\n'.join(str(o2i(probe)) for probe in ('a', 'and', 'the')))

60984
5
29
69


In [33]:
# Width of the pre-trained GloVe vectors; it also selects which
# glove.6B.<dim>d.txt file is read.
WORD_EMB_DIM = 300

# Load the GloVe vectors for the indexed vocabulary. `glove` is later used as
# the frozen Embedding weight matrix; `orig_glove` is presumably the
# unfiltered table -- TODO confirm against DataLoader.GloVe.
# NOTE: `size` must still be the integer vocabulary size here; a later cell
# rebinds `size` to a dict.
glove, orig_glove = GloVe.selective_load(
    './data/GloVe/glove.6B.{}d.txt'.format(WORD_EMB_DIM),
    WORD_EMB_DIM,
    o2i,
    i2o,
    size,
)

Start: Loading Glove Model
End: Loaded 400000 rows.


In [34]:
# Labelled sentence pool: sentences from the quotes document get label 1,
# sentences from the Gutenberg document get label 0.
data = [(sentence, 1) for sentence in quotes.sents]
data.extend((sentence, 0) for sentence in corpus.sents)
print(len(data))

155297


In [35]:
# Fixed number of token indices per sample: used as `maxlen` when padding
# batches and as the model's input length.
SEQ_LENGTH = 128

In [36]:

# Split the labelled sentences into train/test streams.
# random_access returns four values, unpacked here as the train generator,
# the test generator, the train size and the test size.
gen = {}
# NOTE(review): this rebinds `size`, which until now held the integer
# vocabulary size returned by build_index. Re-running the GloVe loading cell
# after this one would pass this dict instead -- consider a distinct name.
size = {}
gen['train'], gen['test'], size['train'], size['test'] = random_access(data)
# Peek at one sample from each split (each next() advances that generator).
print(next(gen['train']))
print(next(gen['test']))

Training data size: 124237
Testing data size: 31060
(Raise your voice above a
whisper, and I run you through the body."

, 0)
(What do they expect?, 1)


In [40]:


def sample_generator(gen, batch_size = 64):
    """Group rows from ``gen`` into padded ``(inputs, labels)`` batches.

    Args:
        gen: iterable of ``(tokens, label)`` rows; every element of ``tokens``
            must expose a ``.text`` attribute.
        batch_size: number of rows collected before a batch is yielded.

    Yields:
        A tuple ``(inputs, labels)``: ``inputs`` is the output of
        ``pad_sequences`` with ``maxlen=SEQ_LENGTH`` over the batch's index
        sequences, and ``labels`` is an array with one ``[label]`` row per
        sample.

    Rows left over when ``gen`` is exhausted before a batch fills up are not
    yielded.
    """
    data = []
    label = []
    for row in gen:
        # Lower-case each token's text, then map it to its vocabulary index.
        # `batch(f)` presumably applies f element-wise -- TODO confirm
        # against Utils.misc.
        lowered = batch(lambda x: x.text.lower())(row[0])
        data.append(batch(o2i)(lowered))
        label.append([row[1]])
        if len(data) == batch_size:
            # Hand the plain list of sequences to pad_sequences. Wrapping it
            # in np.array() first builds a ragged array, which raises
            # ValueError on NumPy >= 1.24 and adds nothing otherwise, since
            # pad_sequences iterates over the sequences either way.
            yield (pad_sequences(data, maxlen=SEQ_LENGTH), np.array(label))
            data = []
            label = []

In [43]:
# Smoke-test the batching with a batch size of 2 on each split.
# Each call wraps the shared underlying generator in a fresh sample_generator,
# so the rows pulled here are taken from gen['train'] / gen['test'].
print(next(sample_generator(gen['train'], 2)))
print(next(sample_generator(gen['test'], 2)))

(array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     8,  2232,    65,   555,
         2422,   185,   237,   279,  5304,   116,  2109,  3071,   279,
     

In [47]:
def create_baseline():
    """Assemble and compile the baseline binary sentence classifier.

    Architecture: frozen GloVe embedding -> bidirectional GRU returning the
    full sequence -> bidirectional GRU returning its final state -> one
    sigmoid unit. Reads the module-level ``glove`` matrix and ``SEQ_LENGTH``.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    total_width = 512
    # Each direction gets half of the total recurrent width.
    units_per_direction = total_width // 2

    token_ids = Input(shape=(SEQ_LENGTH,))

    vocab_rows = glove.shape[0]
    emb_width = glove.shape[1]
    embedding_layer = Embedding(
        vocab_rows,
        emb_width,
        weights=[glove],
        input_length=SEQ_LENGTH,
        trainable=False,
    )
    embedded = embedding_layer(token_ids)

    sequence_encoder = Bidirectional(
        GRU(units_per_direction, activation='selu', return_sequences=True)
    )
    encoded_steps = sequence_encoder(embedded)

    summary_encoder = Bidirectional(GRU(units_per_direction, activation='selu'))
    sentence_vector = summary_encoder(encoded_steps)

    probability = Dense(1, activation='sigmoid')(sentence_vector)

    classifier = Model(token_ids, probability)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['acc'],
    )
    return classifier

In [48]:
# Build the compiled baseline network and print its layer-by-layer summary.
model = create_baseline()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 128)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 128, 300)          18295200  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128, 512)          855552    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 512)               1181184   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 513       
Total params: 20,332,449
Trainable params: 2,037,249
Non-trainable params: 18,295,200
_________________________________________________________________


In [49]:
# Checkpoint callback: monitors val_loss and, with save_best_only=True, keeps
# only the best model seen so far at ./model/quotes_model.hdf5. The full model
# is saved rather than just the weights (save_weights_only=False).
mc = ModelCheckpoint('./model/quotes_model.hdf5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)
class Metrics(Callback):
    """Keras callback that stores an F1 score for the validation data.

    After every epoch the model predicts on ``self.validation_data[0]`` and the
    score against ``self.validation_data[1]`` is stored on ``self.f1s``,
    overwriting the previous epoch's value.
    """

    def on_epoch_end(self, batch, logs=None):
        """Compute and store the validation F1 score at the end of an epoch.

        Args:
            batch: first positional argument supplied by Keras to this hook
                (the epoch index); unused. Inside this method the name shadows
                the ``batch`` helper imported from ``Utils.misc``.
            logs: metrics dict supplied by Keras; unused. Defaults to ``None``
                rather than a shared mutable ``{}``.
        """
        # NOTE(review): Keras may leave `validation_data` unset when validation
        # comes from a generator, as in this notebook's fit_generator call --
        # confirm before adding this callback to the callbacks list.
        predict = np.asarray(self.model.predict(self.validation_data[0]))
        targ = self.validation_data[1]
        # NOTE(review): `f1` is not defined or imported in the visible part of
        # this notebook -- confirm where it comes from and which argument
        # order and thresholding it expects.
        self.f1s = f1(targ, predict)
# Instantiated for later use; note that it is NOT part of the callbacks list
# passed to fit_generator below.
metrics = Metrics()

BATCH_SIZE = 32

# Train from the streaming generators. Steps per epoch are the split sizes in
# whole batches. `epochs` is set very high, so the run is effectively stopped
# manually; `mc` keeps the best checkpoint in the meantime.
# verbose=0 silences the built-in progress bar in favour of the
# TQDM notebook callback.
model.fit_generator(
    sample_generator(gen['train'], BATCH_SIZE),
    steps_per_epoch=size['train'] // BATCH_SIZE,
    validation_data=sample_generator(gen['test'], BATCH_SIZE),
    validation_steps=size['test'] // BATCH_SIZE,
    epochs=200000,
    verbose=0,
    callbacks=[TQDMNotebookCallback(), mc]
)

KeyboardInterrupt: 