In [19]:
# import necessary libraries

import pandas as pd
import os
import glob
from tqdm import tqdm
import re
import numpy as np
import tensorflow as tf

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
import sys
from tensorflow.keras.callbacks import EarlyStopping
import pickle

In [2]:
# import train data

with open('../data/output_train.txt','r',encoding='utf-8') as f:
    train_data = f.read()

In [3]:
# select subset of train data

raw_text = train_data[:150000]

In [21]:
# this can be used for encoding and decoding
# create mapping of unique chars to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

# dump chars list into pickle
with open('../data/chars.pkl', 'wb') as f:
    pickle.dump(chars, f)
    

In [5]:
# find stats for encoding
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  1711
Total Vocab:  61


In [6]:
# create a basic preprocessing steps
# https://stackoverflow.com/questions/43018030/replace-apostrophe-short-words-in-python
# https://stackoverflow.com/questions/49073673/include-punctuation-in-keras-tokenizer

def nlp_preprocessing(total_text):
    if type(total_text) is not int:
    
        # replace multiple spaces with single space
        total_text = re.sub('\s+', ' ', total_text)
#         total_text = re.sub('\s+',' ', total_text)
        
        #This is to be done because I want to Include Period and Question mark in keras tokenizer.
        #I do not want the Tokenizer API to remove them
        total_text = total_text.replace(".", " .")
        total_text = total_text.replace("?", " ?")
        
        # specific
        total_text = re.sub(r"won\'t", "will not", total_text)
        total_text = re.sub(r"can\'t", "can not", total_text)
        total_text = re.sub(r"\x00", "", total_text)
        # general
        total_text = re.sub(r"n\'t", " not", total_text)
        total_text = re.sub(r"\'re", " are", total_text)
        total_text = re.sub(r"\'s", " is", total_text)
        total_text = re.sub(r"\'d", " would", total_text)
        total_text = re.sub(r"\'ll", " will", total_text)
        total_text = re.sub(r"\'t", " not", total_text)
        total_text = re.sub(r"\'ve", " have", total_text)
        total_text = re.sub(r"\'m", " am", total_text)
        # converting all the chars into lower-case.
        total_text = total_text.lower().strip()
        
        
        return total_text
    
raw_text = nlp_preprocessing(raw_text)

In [11]:

...
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  1611


In [12]:

# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_patterns, seq_length, 1))
# normalize
X = X / float(n_vocab)
# one hot encode the output variable
y = to_categorical(dataY)

In [13]:

# define the LSTM model
# define model with one layer and 256 nodes of lstm


model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 256)               264192    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 61)                15677     
                                                                 
Total params: 279,869
Trainable params: 279,869
Non-trainable params: 0
_________________________________________________________________


In [26]:
if not os.path.exists('../model/model_train.h5'):
    es = EarlyStopping(monitor='val_loss', patience=5)
    history = model.fit(X,y, batch_size=128, epochs=20 , validation_split=0.20, callbacks=[es])
    model.save('../model/model_train.h5')

In [None]:
# decode the characters
int_to_char = dict((i, c) for i, c in enumerate(chars))

In [None]:
# pick a random seed
# to check inference of the model

while True:
    seq = input("Enter a word : ")
    seq = nlp_preprocessing(seq)
    pattern = [char_to_int[char] for char in seq]
#     seed_ = np.pad(pattern, (0, seq_length - len(pattern)))
#     pattern = np.reshape(pattern, (1, seq_length, 1))
    print("Seed:")
    print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
    # generate characters
    for i in range(100):
        x = np.reshape(pattern, (1, len(pattern), 1))
        x = x / float(n_vocab)
        prediction = model.predict(x, verbose=0)
        index = np.argmax(prediction)
        result = int_to_char[index]
        seq_in = [int_to_char[value] for value in pattern]
        sys.stdout.write(result)
        pattern.append(index)
        pattern = pattern[1:len(pattern)]
print("\nDone.")