In [0]:
from keras.models import Model
from keras.layers.recurrent import LSTM
from keras.layers import Dense, Input, Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
from collections import Counter
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
from keras.utils import plot_model
from IPython.display import Image

import keras
import nltk
nltk.download('punkt')
import numpy
import sklearn
!pip install -q pydot


#Load  the glove twitter zip file
#! curl -O http://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip 


RAND_STATE=np.random.seed(42)
BATCH_SIZE = 32
NUM_EPOCHS = 10 
GLOVE_EMBEDDING_SIZE = 100
HIDDEN_UNITS = 256
MAX_INPUT_SEQ_LENGTH = 40
MAX_TARGET_SEQ_LENGTH = 40
MAX_VOCAB_SIZE = 10000



DATA_SET_NAME = '/content/drive/My Drive/Colab Notebooks/cornell'
DATA_PATH = '/content/drive/My Drive/Colab Notebooks/cornell/movie_lines_cleaned.txt'
GLOVE_MODEL = "/content/drive/My Drive/Colab Notebooks/cornell/glovedataset/glove.twitter.27B.100d.txt"
WHITELIST = 'abcdefghijklmnopqrstuvwxyz1234567890?.,'
WEIGHT_FILE_PATH =  DATA_SET_NAME + '/word-glove-weights.h5'


'''Check if the characters in the words are whitelisted'''
def in_white_list(_word):
  for char in _word:
        if char in WHITELIST:
            return True
        else:
          return False

#Prepare input data with embedding
def load_glove_vector():
    _word2embedding = {}
    file = open(GLOVE_MODEL, mode='rt', encoding='utf8')
    for line in file:
        words = line.strip().split()
        word = words[0]
        embeds = np.array(words[1:], dtype=np.float32)
        _word2embedding[word] = embeds
    file.close()
    return _word2embedding

word2embedding = load_glove_vector()     
print(WHITELIST)          

print(str(len(word2embedding)))

assert len(word2embedding.keys())==1193514
for key in word2embedding.keys():
    try:
        assert len(word2embedding[key])==100
        #print("A")
    except AssertionError:
        print (key,len(word2embedding[key]))
        #print("B") 

input_counter = Counter()
target_counter = Counter()



# read the data
with open(DATA_PATH, 'r', encoding="utf8") as f:
    df = f.read()
print(df[:100])

rows = df.split('\n')
#lines = open(DATA_PATH, 'rt', encoding='utf8').read().split('\n')
lines = [row.split(' +++$+++ ')[-1] for row in rows]
print(df[:100])
input_texts = []
target_texts = []
prev_words = []



for line in lines:
    next_words = [w.lower() for w in nltk.word_tokenize(line)]
    if len(next_words) > MAX_TARGET_SEQ_LENGTH:
        next_words = next_words[0:MAX_TARGET_SEQ_LENGTH]

    if len(prev_words) > 0:
        input_texts.append(prev_words)
        for w in prev_words:
            input_counter[w] += 1
        target_words = next_words[:]
        target_words.insert(0, 'start')
        target_words.append('end')
        for w in target_words:
            target_counter[w] += 1
        target_texts.append(target_words)
    prev_words = next_words

for idx, (input_words, target_words) in enumerate(zip(input_texts, target_texts)):
    if idx > 10:
        break
    print([input_words, target_words])   


#'''create a target word to id dictionary called target_word2idx.
#'''create a target to id dictionary called input_word2idx . Approx ~1 line'''

# encode the data
input_word2idx = dict()
target_word2idx = dict()


if 'unk' not in target_word2idx:
    target_word2idx['unk'] = 0


input_word2idx['PAD'] = 0
input_word2idx['UNK'] = 1
target_word2idx['UNK'] = 0

for idx, word in enumerate(input_counter.most_common(MAX_VOCAB_SIZE)):
    input_word2idx[word[0]] = idx + 2
for idx, word in enumerate(target_counter.most_common(MAX_VOCAB_SIZE)):
    target_word2idx[word[0]] = idx + 1


input_idx2word = dict([(idx, word) for word, idx in input_word2idx.items()])
target_idx2word = dict([(idx, word) for word, idx in target_word2idx.items()])

num_encoder_tokens = len(input_idx2word)
num_decoder_tokens = len(target_idx2word)

np.save( DATA_SET_NAME + '/word-glove-target-word2idx.npy', target_word2idx)
np.save( DATA_SET_NAME + '/word-glove-target-idx2word.npy', input_word2idx)


print(num_encoder_tokens)


encoder_input_data = []

encoder_max_seq_length = 0
decoder_max_seq_length = 0

for input_words, target_words in zip(input_texts, target_texts):
    encoder_input_wids = []
    for w in input_words:
        w2idx = 1
        if w in input_word2idx:
            w2idx = input_word2idx[w]
        encoder_input_wids.append(w2idx)

    encoder_input_data.append(encoder_input_wids)
    encoder_max_seq_length = max(len(encoder_input_wids), encoder_max_seq_length)
    decoder_max_seq_length = max(len(target_words), decoder_max_seq_length)


context = dict()
context['num_encoder_tokens'] = num_encoder_tokens
context['num_decoder_tokens'] = num_decoder_tokens
context['encoder_max_seq_length'] = encoder_max_seq_length
context['decoder_max_seq_length'] = decoder_max_seq_length

np.save( DATA_SET_NAME + '/word-context.npy', context)

for input_text,input_text_embed in zip (input_texts,range(len(encoder_input_data))):
    assert (len(input_text)==len(encoder_input_data[input_text_embed]))

# custom function to generate batches

def generate_batch(input_data, output_text_data):
    num_batches = len(input_data) // BATCH_SIZE
    while True:
        for batchIdx in range(0, num_batches):
            start = batchIdx * BATCH_SIZE
            end = (batchIdx + 1) * BATCH_SIZE
            encoder_input_data_batch = pad_sequences(input_data[start:end], encoder_max_seq_length)
            decoder_target_data_batch = np.zeros(shape=(BATCH_SIZE, decoder_max_seq_length, num_decoder_tokens))
            decoder_input_data_batch = np.zeros(shape=(BATCH_SIZE, decoder_max_seq_length, num_decoder_tokens))
            for lineIdx, target_words in enumerate(output_text_data[start:end]):
                for idx, w in enumerate(target_words):
                    w2idx = 0
                    if w in target_word2idx:
                        w2idx = target_word2idx[w]
                    decoder_input_data_batch[lineIdx, idx, w2idx] = 1
                    if idx > 0:
                        decoder_target_data_batch[lineIdx, idx - 1, w2idx] = 1
            yield [encoder_input_data_batch, decoder_input_data_batch], decoder_target_data_batch


# Compiling and training

encoder_inputs = Input(shape=(None,), name='encoder_inputs')
encoder_embedding = Embedding(input_dim=num_encoder_tokens, output_dim=HIDDEN_UNITS,
                              input_length=encoder_max_seq_length, name='encoder_embedding')
encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm')
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding(encoder_inputs))
encoder_states = [encoder_state_h, encoder_state_c]

decoder_inputs = Input(shape=(None, num_decoder_tokens), name='decoder_inputs')
decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm')
decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs,
                                                                 initial_state=encoder_states)
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()

#Model Architechture

plot_model(model, to_file='model.png')

Image(filename='model.png',height=400,width=400)
#print(Image)


model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

json = model.to_json()
open(DATA_SET_NAME + 'word-architecture.json', 'w').write(json)

#Create Train and Test data
X_train, X_test, y_train, y_test = train_test_split(encoder_input_data, target_texts, test_size=0.2, random_state=42)

train_gen = generate_batch(X_train, y_train)
test_gen = generate_batch(X_test, y_test)

train_num_batches = len(X_train) // BATCH_SIZE
test_num_batches = len(X_test) // BATCH_SIZE


model.fit_generator(generator=train_gen,
                    steps_per_epoch=train_num_batches,
                    epochs=NUM_EPOCHS,
                    verbose=1,
                    validation_data=test_gen,
                    validation_steps=test_num_batches)

model.save_weights(WEIGHT_FILE_PATH)








[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
abcdefghijklmnopqrstuvwxyz1234567890?.,
1193514
-0.32053 99
They do not!
They do to!
I hope so.
She okay?
Let's go.
Wow
Okay -- you're gonna need to learn how t
They do not!
They do to!
I hope so.
She okay?
Let's go.
Wow
Okay -- you're gonna need to learn how t
[['they', 'do', 'not', '!'], ['start', 'they', 'do', 'to', '!', 'end']]
[['they', 'do', 'to', '!'], ['start', 'i', 'hope', 'so', '.', 'end']]
[['i', 'hope', 'so', '.'], ['start', 'she', 'okay', '?', 'end']]
[['she', 'okay', '?'], ['start', 'let', "'s", 'go', '.', 'end']]
[['let', "'s", 'go', '.'], ['start', 'wow', 'end']]
[['wow'], ['start', 'okay', '--', 'you', "'re", 'gon', 'na', 'need', 'to', 'learn', 'how', 'to', 'lie', '.', 'end']]
[['okay', '--', 'you', "'re", 'gon', 'na', 'need', 'to', 'learn', 'how', 'to', 'lie', '.'], ['start', 'no', 'end']]
[['no'], ['start', 'i', "'m", 'kidding', '.', 'you', 'know', 'how', 's

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1
