## Notebook 2 :Training

In this notebook, we are going to build a sequence to sequence model(with and without attention) using Keras using the preprocessed data from the earlier notebook.

In [262]:
import pickle
import pandas as pd
import gensim as gs
import pandas as pd
import numpy as np
import scipy as sc
import nltk
import os
from nltk.tokenize import word_tokenize as wt
from nltk.tokenize import sent_tokenize as st
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from keras.preprocessing.sequence import pad_sequences
import logging
import re
import sys
import random
from collections import Counter
from tensorflow.contrib import keras
from keras.preprocessing import sequence
from keras.utils import np_utils
from sklearn.cross_validation import train_test_split
from keras.layers import Dense,LSTM,Input,Activation,Add,TimeDistributed,\
Permute,Flatten,RepeatVector,merge,Lambda,Multiply,Reshape
from keras.layers.wrappers import TimeDistributed
from keras.layers.embeddings import Embedding
from keras.models import Sequential,Model
from keras.optimizers import RMSprop
from keras import backend as K
import tensorflow as tf

#### Read embeddings and preprocessed data

In [86]:
pickle_path = 'E:\\Spring-19\\Workshop\\saved_models'
FN = 'embeddings.pkl'

with open(os.path.join(pickle_path,FN), 'rb') as fp:
    embedding, idx2word, word2idx, glove_idx2idx = pickle.load(fp)
vocab_size, embedding_size = embedding.shape

FN = 'data.pkl'
with open(os.path.join(pickle_path,FN), 'rb') as fp:
    X, Y = pickle.load(fp)

#### Set Hyperparams. We will visit these parameters when we use them during training.

In [87]:
seed=42
p_W, p_U, p_dense, p_emb, weight_decay = 0, 0, 0, 0, 0
LR = 1e-4
batch_size=64

maxlend=25 
maxlenh=25
maxlen = maxlend + maxlenh
batch_norm=False

val_samples = 3000
emb_size = 100
hidden_units= emb_size

#### Create multiple out of vocabulary words.

In [88]:
empty = 0
eos = 1
idx2word[empty] = '_'
idx2word[eos] = '~'

nb_unknown_words = 10
for i in range(nb_unknown_words):
    idx2word[vocab_size-1-i] = '<%d>'%i

oov0 = vocab_size-nb_unknown_words

for i in range(oov0, len(idx2word)):
    idx2word[i] = idx2word[i]+'^'

#### Divide into train and val samples preferably in multiples of batch size.

In [164]:
num_train_batches = len(X) // batch_size
num_val_samples = val_samples + len(X) - batch_size*num_train_batches
num_val_batches = num_val_samples // batch_size
total_entries = (num_train_batches + num_val_batches)*batch_size
X, Y = X[:total_entries], Y[:total_entries]
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size= num_val_samples , random_state=seed)
len(X), len(X_train), len(Y_train), len(X_val), len(Y_val)

(35170, 32136, 32136, 3034, 3034)

### Couple of helper functions

##### Attach EOS and pre-pad  : 
left (pre) pad a description to maxlend and then add eos. The eos is the input to predicting the first word in the summary.

In [263]:
def lpadd(x, maxlend=maxlend, eos=eos):
    assert maxlend >= 0
    if maxlend == 0:
        return [eos]
    n = len(x)
    if n > maxlend:
        x = x[-maxlend:]
        n = maxlend
    return [empty]*(maxlend-n) + x + [eos]

##### Assign unknown words
1. Convert list of word indexes that may contain words outside vocab_size to words inside.
2. If a word is outside, try first to use glove_idx2idx to find a similar word inside.
3. If none exist then replace all accurancies of the same unknown word with <0>, <1>, ...


In [None]:
def vocab_fold(xs):
    xs = [x if x < oov0 else glove_idx2idx.get(x,x) for x in xs]
    # the more popular word is <0> and so on
    outside = sorted([x for x in xs if x >= oov0])
    # if there are more than nb_unknown_words oov words then put them all in nb_unknown_words-1
    outside = dict((x,vocab_size-1-min(i, nb_unknown_words-1)) for i, x in enumerate(outside))
    xs = [outside.get(x,x) for x in xs]
    return xs

#### CREATE DATA BATCH GENERATOR

We do use teacher forcing(https://machinelearningmastery.com/teacher-forcing-for-recurrent-neural-networks/) in our model and hence the input sentences to the decoder will be lagging the true sentence to be deocded by one string.

In [267]:
def gen(Xd, Xh, batch_size=batch_size, nb_batches=None, model=None, seed=seed):
    c = nb_batches if nb_batches else 0
    while True:
        xds = []
        xhs = []
        if nb_batches and c >= nb_batches:
            c = 0
        new_seed = random.randint(0, sys.maxsize)
        random.seed(c+123456789+seed)
        for b in range(batch_size):
            t = random.randint(0,len(Xd)-1)
            
            #random shuffling of data
            xd = Xd[t]
            s = random.randint(min(maxlend,len(xd)), max(maxlend,len(xd)))
            xds.append(xd[:s])
            
            xh = Xh[t]
            s = random.randint(min(maxlenh,len(xh)), max(maxlenh,len(xh)))
            xhs.append(xh[:s])

        # undo the seeding before we yield inorder not to affect the caller
        c+= 1
        random.seed(new_seed)

        yield conv_seq_labels(xds, xhs)

def conv_seq_labels(xds, xhs, nflips=None, model=None):
    """description and hedlines are converted to padded input vectors. headlines are one-hot to label"""
    batch_size = len(xhs)
    
    x = [vocab_fold(lpadd(xd)+xh) for xd,xh in zip(xds,xhs)]  # the input does not have 2nd eos
    x = sequence.pad_sequences(x, maxlen=maxlen, value=empty, padding='post', truncating='post')
        
    y = np.zeros((batch_size, maxlenh, vocab_size))
    for i, xh in enumerate(xhs):
        xh = vocab_fold(xh) + [eos] + [empty]*maxlenh  # output does have a eos at end
        xh = xh[:maxlenh]
        y[i,:,:] = np_utils.to_categorical(xh, vocab_size)
        
    #The 3 inputs are description, summary starting with eos and a one-hot encoding of the summary categorical variables.
    return [x[:,:maxlend],x[:,maxlend:]], y




##### Some sanity check to see what the processing yields us. See that the summary input into the model is beginning with a '~'trigger always.

In [273]:
r = next(gen(X_train, Y_train, batch_size=batch_size))
print(r[0][0].shape,r[0][1].shape,r[1].shape)
print("Description  : ", [idx2word[k] for k in r[0][0][5]])
print("Summary  : ", [idx2word[k] for k in r[0][1][5]])


(64, 25) (64, 25) (64, 25, 40000)
Description  :  ['of', 'their', 'poppers', 'and', 'popcorn.', 'Shipping', 'was', 'quick', 'and', 'the', 'popcorn', 'was', 'awesome.', 'Love', 'how', 'simple', 'to', 'make', 'the', 'popcorn', 'with', 'all', 'the', '<0>^', 'in']
Summary  :  ['~', 'Happy', 'with', 'their', 'products', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_']


In [269]:
train_gen = gen(X_train, Y_train, batch_size=batch_size)
val_gen =  gen(X_val, Y_val, nb_batches=val_samples//batch_size, batch_size=batch_size)

In [174]:
learning_rate = 0.002
clip_norm = 1.0
regularizer = l2(weight_decay) if weight_decay else None

In [243]:
def encoder_decoder(train_gen, val_gen, mode = 'fit', num_epochs=1,en_shape=maxlend,de_shape=maxlenh):
    
    print('Encoder_Decoder LSTM...')
   
    """__encoder___"""
    encoder_inputs = Input(shape=(en_shape,))
    print(encoder_inputs)
    
    #APPLY EMBEDDING LAYER.        
    input_emb = Embedding(vocab_size, embedding_size,
                    input_length=maxlend,
                    W_regularizer=regularizer, dropout=p_emb, weights=[embedding], mask_zero=True,
                    name='embedding_1')
    
    #ENCODER LSTM - FORWARD     
    encoder_LSTM = LSTM(hidden_units, dropout_U = 0.2, dropout_W = 0.2 ,return_state=True)
    encoder_LSTM_rev=LSTM(hidden_units,return_state=True,go_backwards=True)
    
    #ENCODER LSTM - REVERSE
    encoder_outputsR, state_hR, state_cR = encoder_LSTM_rev(input_emb(encoder_inputs))
    encoder_outputs, state_h, state_c = encoder_LSTM(input_emb(encoder_inputs))
        
    state_hfinal=Add()([state_h,state_hR])
    state_cfinal=Add()([state_c,state_cR])
    
    encoder_states = [state_hfinal,state_cfinal]
    
    """____decoder___"""
    #Input to the decoder would be the summary(headline) sequence starting from ~ character.
    decoder_inputs = Input(shape=(de_shape,))
    
    print(decoder_inputs)
      
    decoder_LSTM = LSTM(hidden_units,return_sequences=True,return_state=True)
    decoder_outputs, _, _ = decoder_LSTM(input_emb(decoder_inputs),initial_state=encoder_states) 
    decoder_dense = Dense(de_shape,activation='linear')
    
    # Apply a dense layer that has vocab_size(40000) outputs which learns probability of each word when softmax is applied.
    # TimeDistributed is a wrapper for applying the same function over all the time step outputs. 
    # Refer https://keras.io/layers/wrappers/
    time_distributed = TimeDistributed(Dense(vocab_size,
                                W_regularizer=regularizer, b_regularizer=regularizer,
                                name = 'timedistributed_1'))
    activation = Activation('softmax', name='activation_1')
    decoder_outputs = activation(time_distributed(decoder_outputs))
    
    #Model groups layers into an object with training and inference features.
    #https://www.tensorflow.org/api_docs/python/tf/keras/models/Model        
    model= Model(inputs=[encoder_inputs,decoder_inputs], outputs=decoder_outputs)
    
    rmsprop = RMSprop(lr=learning_rate,clipnorm=clip_norm)
    
    model.compile(loss='categorical_crossentropy',optimizer=rmsprop)
    
    for epoch in range(num_epochs):
        model.fit_generator(train_gen,
                  steps_per_epoch = num_train_batches,
                  epochs=5,  #Try different epochs as hyperparameter                
                  validation_data = val_gen,
                  validation_steps = num_val_batches
                           )
    
    #_________________________INFERENCE MODE______________________________#  
    
    encoder_model_inf = Model(encoder_inputs,encoder_states)
    
    decoder_state_input_H = Input(shape=(hidden_units,))
    decoder_state_input_C = Input(shape=(hidden_units,)) 
    decoder_state_inputs = [decoder_state_input_H, decoder_state_input_C]
    decoder_outputs, decoder_state_h, decoder_state_c = decoder_LSTM(input_emb(decoder_inputs),
                                                                     initial_state=decoder_state_inputs)
    decoder_states = [decoder_state_h, decoder_state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    
    decoder_model_inf= Model([decoder_inputs]+decoder_state_inputs,
                         [decoder_outputs]+decoder_states)
    
    return model,encoder_model_inf,decoder_model_inf

##### Note

Implement multiple layers in LSTM. Right now, we have implemented single layer LSTMs only. For appropriate performance, implement 2/3 layers LSTMS with 128/256 size.

In [244]:
def saveModels(models,names):
    path = 'E:\\Spring-19\\Workshop\\saved_models'
    for i in range(len(names)):
        models[i].save(os.path.join(path,names[i]))    

In [257]:
'''model, encoder, decoder = encoder_decoder(train_gen, val_gen)
saveModels([model,encoder,decoder],["init_model","encoder","decoder"])'''

'model, encoder, decoder = encoder_decoder(train_gen, val_gen)\nsaveModels([model,encoder,decoder],["init_model","encoder","decoder"])'

In [254]:
def encoder_decoder_with_attention(train_gen, val_gen, mode = 'fit', num_epochs=1,en_shape=maxlend,de_shape=maxlenh):
    encoder_inputs = Input(shape=(en_shape,))
    print(encoder_inputs)
        
    input_emb = Embedding(vocab_size, embedding_size,
                    input_length=maxlend,
                    W_regularizer=regularizer, dropout=p_emb, weights=[embedding], mask_zero=False,
                    name='embedding_1')
    
    encoder_LSTM = LSTM(hidden_units,dropout_U=0.2,dropout_W=0.2,return_sequences=True,return_state=True)
    encoder_LSTM_rev=LSTM(hidden_units,return_state=True,return_sequences=True,dropout_U=0.05,dropout_W=0.05,go_backwards=True)
    
    encoder_outputs, state_h, state_c = encoder_LSTM(input_emb(encoder_inputs))
    encoder_outputsR, state_hR, state_cR = encoder_LSTM_rev(input_emb(encoder_inputs))
    
    state_hfinal=Add()([state_h,state_hR])
    state_cfinal=Add()([state_c,state_cR])
    encoder_outputs_final = Add()([encoder_outputs,encoder_outputsR])
    
    encoder_states = [state_hfinal,state_cfinal]
    
    decoder_inputs = Input(shape=(de_shape,))
    print(decoder_inputs)
    decoder_LSTM = LSTM(hidden_units,return_sequences=True,dropout_U=0.2,dropout_W=0.2,return_state=True)
    decoder_outputs, _, _ = decoder_LSTM(input_emb(decoder_inputs),initial_state=encoder_states)
    
    ######################ATTENTION####################################################
    
    attention = TimeDistributed(Dense(1, activation = 'tanh'))(encoder_outputs_final)
    attention = Multiply()([attention,decoder_outputs])
    attention = Activation('softmax')(attention)
    attention = Permute([2, 1])(attention)
    
    ####################################################################################
    time_distributed = TimeDistributed(Dense(vocab_size,
                                W_regularizer=regularizer, b_regularizer=regularizer,
                                name = 'timedistributed_1'))
    activation = Activation('softmax', name='activation_1')
    decoder_outputs = activation(time_distributed(decoder_outputs))
    
    model= Model(inputs=[encoder_inputs,decoder_inputs], outputs=decoder_outputs)
    
    rmsprop = RMSprop(lr=learning_rate,clipnorm=clip_norm)
    
    model.compile(loss='categorical_crossentropy',optimizer=rmsprop)
    
    for epoch in range(num_epochs):
        model.fit_generator(train_gen,
                  steps_per_epoch = num_train_batches,
                  epochs=1,                  
                  validation_data = val_gen,
                  validation_steps = num_val_batches
                           )
        
    #########################INFERENCE###################################    
    encoder_model_inf = Model(encoder_inputs,encoder_states)
    
    decoder_state_input_H = Input(shape=(en_shape,))
    decoder_state_input_C = Input(shape=(en_shape,)) 
    decoder_state_inputs = [decoder_state_input_H, decoder_state_input_C]
    decoder_outputs, decoder_state_h, decoder_state_c = decoder_LSTM(input_emb(decoder_inputs),
                                                                     initial_state=decoder_state_inputs)
    decoder_states = [decoder_state_h, decoder_state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    
    decoder_model_inf= Model([decoder_inputs]+decoder_state_inputs,
                         [decoder_outputs]+decoder_states)
    
    return model, encoder_model_inf, decoder_model_inf


In [258]:
'''model, encoder, decoder = encoder_decoder_with_attention(train_gen,val_gen)
saveModels([model,encoder,decoder],['att_model','att_encoder','att_decoder'])'''

"model, encoder, decoder = encoder_decoder_with_attention(train_gen,val_gen)\nsaveModels([model,encoder,decoder],['att_model','att_encoder','att_decoder'])"

#### HOW TO DECODE FOR INFERENCE ?

There are two standard options to decode while doing inference. While doing inference, remember that you don't have access to ground truth summaries and you have to build your own summaries based on the words that the decoder predicted already. While training, we used teacher forcing where we fed in the ground truth word from the previous time-stamp - not feasible during inference.

1. Be Greedy - Feed the previously predicted word into the decoder as the input for the next time step. Remember we used teacher forcing during training where we used the ground truth label of the previous time step.

2. Use Beam Search of size K - Retain top (in terms of log likelihood) K alternatives whenever you decode each new word and append to existing sequences. Better performing since it allows the model to recover from errors. More costly to run inference.

Example

#### TRUE SUMMARY 
Treat for pet lovers.

#### GREEDY PREDICTIONS

Treat

Treat yourself 

Treat yourself for

Treat yourself for dog

#### BEAM SEARCH (size 4 beam)

Treat, Yummy, Select, Best

Treat for, Yummy flavor, Treat yourself, Yummy !

Treat for dog, Treat for pet, Treat for overweight, Yummy flavor for

Treat for dog callers, Treat for dog owners, Treat for pet diseases, Treat for dog collars

##### Test time inference decoder example.

##### Note for Later : Try writing your own decoder both using greedy search and beam search.

In [None]:
#Assuming article is input as a pre-processed X (padded etc. using earlier functions). Sample code for decoding.
def summarize(article,encoder,decoder):
    summary = []
    article = np.reshape()
    article =  np.reshape(article,(1,en_shape))
    
    encoded_state_val = encoder.predict(article)
    target_seq = np.zeros((1,1,vocab_size))
    
    while len(summary) < de_shape:
        decoder_out,decoder_h,decoder_c= decoder.predict(x=[target_seq]+init_state_val)
        summary.append(decoder_out)
        init_state_val= [decoder_h,decoder_c]
        #get most similar word and put in line to be input in next timestep
        #target_seq=np.reshape(model.wv[getWord(decoder_out)[0]],(1,1,emb_size_all))
        target_seq=np.reshape(decoder_out,(1,1,de_shape[1]))    
    
    return summary    

### SOME SUMMARIZATION EXAMPLES

<small>These are examples collected from the model run on the reviews dataset (Remember we gave you a trimmed version) using multi-layer LSTMS on K80 GPUs. Here we present some results since they make for interesting discussions and viewing.</small>