In [1]:
import json
import csv
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Dropout, Embedding

## Loading dataset and preprocessing

In [2]:
def load_data(json_file):
    f = open(json_file)
    data = json.load(f)
    
    texts = []
    for i in range(len(data['stories'])):
        text = data['stories'][i]['story']
        text.append('|')
        texts += text

    texts = ' '.join(texts).strip()
    return texts

In [3]:
filename = '../input/aesop-fables-dataset/Aesop Fables.json'
texts = load_data(filename)
texts

"There was once a little Kid whose growing horns made him think he was a grown-up Billy Goat and able to take care of himself. So one evening when the flock started home from the pasture and his mother called, the Kid paid no heed and kept right on nibbling the tender grass. A little later when he lifted his head, the flock was gone. He was all alone. The sun was sinking. Long shadows came creeping over the ground. A chilly little wind came creeping with them making scary noises in the grass. The Kid shivered as he thought of the terrible Wolf. Then he started wildly over the field, bleating for his mother. But not half-way, near a clump of trees, there was the Wolf! The Kid knew there was little hope for him. Please, Mr. Wolf, he said trembling, I know you are going to eat me. But first please pipe me a tune, for I want to dance and be merry as long as I can. The Wolf liked the idea of a little music before eating, so he struck up a merry tune and the Kid leaped and frisked gaily. Mea

## Converting all words to lowercase

In [4]:
def preprocess(texts):
    # Convert all words into lowercase
    texts = texts.lower()
    # Separating punctuations
    texts = word_tokenize(texts)  
    # Punctuations are thought as a word (written with space)
    texts = ' '.join(texts)  
    return texts

In [5]:
texts = preprocess(texts)
texts

"there was once a little kid whose growing horns made him think he was a grown-up billy goat and able to take care of himself . so one evening when the flock started home from the pasture and his mother called , the kid paid no heed and kept right on nibbling the tender grass . a little later when he lifted his head , the flock was gone . he was all alone . the sun was sinking . long shadows came creeping over the ground . a chilly little wind came creeping with them making scary noises in the grass . the kid shivered as he thought of the terrible wolf . then he started wildly over the field , bleating for his mother . but not half-way , near a clump of trees , there was the wolf ! the kid knew there was little hope for him . please , mr. wolf , he said trembling , i know you are going to eat me . but first please pipe me a tune , for i want to dance and be merry as long as i can . the wolf liked the idea of a little music before eating , so he struck up a merry tune and the kid leaped

## Converting all words to integer

In [6]:
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts([texts])
tokenized_texts = tokenizer.texts_to_sequences([texts])

In [7]:
tokenized_texts

[[48,
  11,
  102,
  4,
  93,
  221,
  450,
  1134,
  257,
  79,
  24,
  172,
  7,
  11,
  4,
  1676,
  1677,
  167,
  5,
  349,
  6,
  125,
  280,
  8,
  86,
  3,
  26,
  28,
  222,
  32,
  1,
  237,
  401,
  126,
  44,
  1,
  281,
  5,
  9,
  152,
  186,
  2,
  1,
  221,
  856,
  63,
  1135,
  5,
  223,
  224,
  29,
  857,
  1,
  585,
  503,
  3,
  4,
  93,
  350,
  32,
  7,
  1136,
  9,
  98,
  2,
  1,
  237,
  11,
  312,
  3,
  7,
  11,
  33,
  858,
  3,
  1,
  203,
  11,
  1137,
  3,
  107,
  692,
  84,
  859,
  153,
  1,
  258,
  3,
  4,
  1678,
  93,
  282,
  84,
  859,
  18,
  42,
  283,
  1679,
  1680,
  10,
  1,
  503,
  3,
  1,
  221,
  1681,
  19,
  7,
  154,
  8,
  1,
  586,
  38,
  3,
  51,
  7,
  401,
  1138,
  153,
  1,
  181,
  2,
  1682,
  21,
  9,
  152,
  3,
  17,
  23,
  1683,
  2,
  127,
  4,
  1684,
  8,
  1139,
  2,
  48,
  11,
  1,
  38,
  31,
  1,
  221,
  187,
  48,
  11,
  93,
  1140,
  21,
  24,
  3,
  168,
  2,
  451,
  38,
  2,
  7,
  30,
  860,
  2,
  16

In [8]:
tokenized_texts = np.array(tokenized_texts).flatten()
tokenized_texts

array([ 48,  11, 102, ..., 596,   3,  27])

In [9]:
tokenizer.word_index

{'the': 1,
 ',': 2,
 '.': 3,
 'a': 4,
 'and': 5,
 'to': 6,
 'he': 7,
 'of': 8,
 'his': 9,
 'in': 10,
 'was': 11,
 'you': 12,
 'that': 13,
 'it': 14,
 'had': 15,
 'i': 16,
 'but': 17,
 'with': 18,
 'as': 19,
 'they': 20,
 'for': 21,
 'at': 22,
 'not': 23,
 'him': 24,
 'very': 25,
 'so': 26,
 '|': 27,
 'one': 28,
 'on': 29,
 'said': 30,
 '!': 31,
 'when': 32,
 'all': 33,
 'up': 34,
 'out': 35,
 'were': 36,
 'have': 37,
 'wolf': 38,
 'fox': 39,
 "'s": 40,
 'be': 41,
 'them': 42,
 'is': 43,
 'from': 44,
 'do': 45,
 'would': 46,
 'could': 47,
 'there': 48,
 'what': 49,
 'me': 50,
 'then': 51,
 'their': 52,
 '?': 53,
 'her': 54,
 'an': 55,
 'lion': 56,
 'by': 57,
 'who': 58,
 'into': 59,
 'about': 60,
 'if': 61,
 'ass': 62,
 'no': 63,
 'day': 64,
 'your': 65,
 'away': 66,
 'down': 67,
 'much': 68,
 'saw': 69,
 'this': 70,
 'she': 71,
 'mouse': 72,
 'get': 73,
 'did': 74,
 'now': 75,
 'been': 76,
 'my': 77,
 'time': 78,
 'made': 79,
 'are': 80,
 'how': 81,
 'will': 82,
 'soon': 83,
 'came': 8

In [10]:
vocab_size = len(tokenizer.word_index)+1
vocab_size

3076

## Loading GloVe and creating embedding matrix

In [11]:
PATH = '../input/glove-global-vectors-for-'\
        'word-representation/glove.6B.100d.txt'

glove = pd.read_table(PATH, sep=" ", 
                      index_col=0, header=None, 
                      quoting=csv.QUOTE_NONE)

In [12]:
glove.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the,-0.038194,-0.24487,0.72812,-0.39961,0.083172,0.043953,-0.39141,0.3344,-0.57545,0.087459,...,0.016215,-0.017099,-0.38984,0.87424,-0.72569,-0.51058,-0.52028,-0.1459,0.8278,0.27062
",",-0.10767,0.11053,0.59812,-0.54361,0.67396,0.10663,0.038867,0.35481,0.06351,-0.094189,...,0.34951,-0.7226,0.37549,0.4441,-0.99059,0.61214,-0.35111,-0.83155,0.45293,0.082577
.,-0.33979,0.20941,0.46348,-0.64792,-0.38377,0.038034,0.17127,0.15978,0.46619,-0.019169,...,-0.063351,-0.67412,-0.068895,0.53604,-0.87773,0.31802,-0.39242,-0.23394,0.47298,-0.028803
of,-0.1529,-0.24279,0.89837,0.16996,0.53516,0.48784,-0.58826,-0.17982,-1.3581,0.42541,...,0.18712,-0.018488,-0.26757,0.727,-0.59363,-0.34839,-0.56094,-0.591,1.0039,0.20664
to,-0.1897,0.050024,0.19084,-0.049184,-0.089737,0.21006,-0.54952,0.098377,-0.20135,0.34241,...,-0.13134,0.058617,-0.31869,-0.61419,-0.62393,-0.41548,-0.038175,-0.39804,0.47647,-0.15983


In [13]:
glove.shape

(400000, 100)

In [14]:
embedding_matrix = np.zeros((vocab_size,100))
for word, i in tqdm(tokenizer.word_index.items()):
    try:
        embedding_vector = glove.loc[word].values
    except KeyError:
        pass
    embedding_matrix[i] = embedding_vector
    
    # e.g. The word "the" will be placed at index 1

100%|██████████| 3075/3075 [01:00<00:00, 50.80it/s]


## Determine predictor variables (X) and the target (y)

In [15]:
X = []
y = []
for i in range(len(tokenized_texts)):
    if i+20 < len(tokenized_texts):
        start = i
        end = i+20

        X.append(tokenized_texts[start:end])
        y.append(tokenized_texts[end])
        
X = np.array(X)
y = np.array(y)

## Convert target to categorical data (one-hot encoding)

In [16]:
y_one_hot = to_categorical(y, num_classes=vocab_size)
y_one_hot.shape

(27415, 3076)

## Train test split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y_one_hot, 
                                                    test_size=0.2, 
                                                    random_state=99)

In [18]:
print(X_train.shape)
print(X_test.shape)

(21932, 20)
(5483, 20)


## Construct the LSTM-based neural network model

In [19]:
lstm_input = Input(shape=(None,))
lstm = Embedding(input_dim=vocab_size, 
                 output_dim=100, 
                 weights=[embedding_matrix],
                 trainable=True)(lstm_input)
lstm = LSTM(256)(lstm)
lstm_output = Dense(vocab_size, activation='softmax')(lstm)

In [20]:
model = Model(lstm_input, lstm_output)

In [21]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 100)         307600    
_________________________________________________________________
lstm (LSTM)                  (None, 256)               365568    
_________________________________________________________________
dense (Dense)                (None, 3076)              790532    
Total params: 1,463,700
Trainable params: 1,463,700
Non-trainable params: 0
_________________________________________________________________


In [22]:
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', metrics=['acc'])

In [23]:
history = model.fit(X_train, y_train, 
                    epochs=20, batch_size=32, 
                    validation_data=(X_test,y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Function to tell the neural network to write

In [24]:
def write(text):
    while text[-1] != '|':
        # Preprocessing
        text = preprocess(text)
        tokenized_text = tokenizer.texts_to_sequences([text])
        tokenized_text = np.array(tokenized_text).flatten()
        
        # Predicting the next word
        pred_one_hot = model.predict(np.array([tokenized_text]))
        pred_token = np.argmax(pred_one_hot)
        
        # Decoding the predicted word
        pred_text = tokenizer.sequences_to_texts([[pred_token]])
        
        # Decoding the previous words
        text = tokenizer.sequences_to_texts([tokenized_text])

        # Concatenating the previous words and the new word
        text = text[0]+' '+pred_text[0]
        
    return text

In [29]:
sentence = 'I am so hungry'
write(sentence)

'i am so hungry ! last he asked with a laugh when he had hired the ass and had made him had to eat himself , and his head and his son came down down the stag , and rising on his hind legs , and began a wild dance . first he whirled round and saw that all his strength were in a different sort of silver . |'