In [1]:
!pip install stop_words
from __future__ import print_function
from keras.models import Sequential, Model
from keras.layers import Embedding
from keras.layers import LSTM, Activation, Dense, Permute, Dropout, add, dot, concatenate, Bidirectional, GRU
import tarfile
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences

from functools import reduce
import re
import numpy as np
from keras.preprocessing.text import Tokenizer
import pandas as pd
from stop_words import get_stop_words
from unicodedata import category
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
!unzip '/content/715041_1245709_bundle_archive.zip'

Collecting stop_words
  Downloading https://files.pythonhosted.org/packages/1c/cb/d58290804b7a4c5daa42abbbe2a93c477ae53e45541b1825e86f0dfaaf63/stop-words-2018.7.23.tar.gz
Building wheels for collected packages: stop-words
  Building wheel for stop-words (setup.py) ... [?25l[?25hdone
  Created wheel for stop-words: filename=stop_words-2018.7.23-cp37-none-any.whl size=32913 sha256=444e332e536ffc475a6df704b0080825865602acde07be28ab9ca54c9102a9da
  Stored in directory: /root/.cache/pip/wheels/75/37/6a/2b295e03bd07290f0da95c3adb9a74ba95fbc333aa8b0c7c78
Successfully built stop-words
Installing collected packages: stop-words
Successfully installed stop-words-2018.7.23
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Archive:  /content/715041_1245709_bundle_archive.zip
  inflating: dialogs.txt             


In [2]:
df = pd.read_table('/content/dialogs.txt')

df

Unnamed: 0,"hi, how are you doing?",i'm fine. how about yourself?
0,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
1,i'm pretty good. thanks for asking.,no problem. so how have you been?
2,no problem. so how have you been?,i've been great. what about you?
3,i've been great. what about you?,i've been good. i'm in school right now.
4,i've been good. i'm in school right now.,what school do you go to?
...,...,...
3719,that's a good question. maybe it's not old age.,are you right-handed?
3720,are you right-handed?,yes. all my life.
3721,yes. all my life.,you're wearing out your right hand. stop using...
3722,you're wearing out your right hand. stop using...,but i do all my writing with my right hand.


In [3]:
questions = df['hi, how are you doing?']
answers = df["i'm fine. how about yourself?"]

def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|]", "", text)
#     text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    text = " ".join(text.split())
    return text


clean_sentences = []
for item in questions:
  clean_sentence = clean_text(item)
  clean_sentences.append(clean_sentence)

clean_sentences[0]
df

Unnamed: 0,"hi, how are you doing?",i'm fine. how about yourself?
0,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
1,i'm pretty good. thanks for asking.,no problem. so how have you been?
2,no problem. so how have you been?,i've been great. what about you?
3,i've been great. what about you?,i've been good. i'm in school right now.
4,i've been good. i'm in school right now.,what school do you go to?
...,...,...
3719,that's a good question. maybe it's not old age.,are you right-handed?
3720,are you right-handed?,yes. all my life.
3721,yes. all my life.,you're wearing out your right hand. stop using...
3722,you're wearing out your right hand. stop using...,but i do all my writing with my right hand.


In [4]:
train_sentences = clean_sentences[:7000]
validation_sentences = clean_sentences[7000:]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)
tokenizer.fit_on_texts(validation_sentences)

train_sequences = tokenizer.texts_to_sequences(
    train_sentences
)
validation_sequences = tokenizer.texts_to_sequences(
    validation_sentences
)

train_padded = pad_sequences(train_sequences, maxlen=120, padding='pre', truncating='pre')
validation_padded = pad_sequences(validation_sequences, maxlen=120, padding='pre', truncating='pre')

vocab_size = len(tokenizer.word_index)
print(vocab_size)
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

y_train = []
y_validate = []
for item in range(3724):
  y_train.append(item)
for item in range(199):
  y_validate.append(item)

print(len(y_train))
print(len(y_validate))

2359
3724
199


In [5]:
model = Sequential([
    Embedding(vocab_size+1, 50),
    GRU(256, return_sequences=True),
    GRU(512, return_sequences=False),
    Dense(100, activation='relu'),
    Dense(vocab_size, activation='softmax')
])
model.summary()
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_padded, np.array(y_train), batch_size=32, epochs=100)


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 50)          118000    
_________________________________________________________________
gru (GRU)                    (None, None, 256)         236544    
_________________________________________________________________
gru_1 (GRU)                  (None, 512)               1182720   
_________________________________________________________________
dense (Dense)                (None, 100)               51300     
_________________________________________________________________
dense_1 (Dense)              (None, 2359)              238259    
Total params: 1,826,823
Trainable params: 1,826,823
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoc

<keras.callbacks.History at 0x7f89a0295550>

In [8]:
def gen(model, seq, max_len = 20):
    ''' Generates a sequence given a string seq using specified model until the total sequence length
    reaches max_len'''
    # Tokenize the input string
    tokenized_sent = tokenizer.texts_to_sequences([seq])
    max_len = max_len+len(tokenized_sent[0])
    # If sentence is not as long as the desired sentence length, we need to 'pad sequence' so that
    # the array input shape is correct going into our LSTM. the `pad_sequences` function adds 
    # zeroes to the left side of our sequence until it becomes 19 long, the number of input features.
    while len(tokenized_sent[0]) < max_len:
        padded_sentence = pad_sequences(tokenized_sent[-19:],maxlen=19)
        op = model.predict(np.asarray(padded_sentence).reshape(1,-1))
        tokenized_sent[0].append(op.argmax()+1)
        
    return " ".join(map(lambda x : reverse_word_map[x],tokenized_sent[0]))

In [12]:
gen(model, "Hi", max_len=20)

'hi i i i i i i i i i i i i i i i i i i i i'