<a href="https://colab.research.google.com/github/Justabhi96/NLP/blob/master/11_Text_Generation_with_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
def read_file(filepath):
  with open(filepath) as f:
    txt = f.read()
  return txt

In [0]:
moby_text = read_file("moby_dick_four_chapters.txt")
print(moby_text)

In [0]:
import spacy
nlp = spacy.load("en", disable = ["parser", "tagger", "ner"])

In [0]:
nlp.max_length = 1198623

In [0]:
def remove_punc(doc_text):
  return [token.text.lower() for token in nlp(doc_text) if 
          token.text not in '"\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [0]:
tokens = remove_punc(moby_text)
tokens[:20]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and']

In [0]:
len(tokens)

11338

###We will pass 25 words and have the network predict 26th word

In [0]:
train_len = 25+1

train_sequences = []
for i in range(train_len, len(tokens)):
  seq = tokens[i-train_len:i]
  train_sequences.append(seq)

In [0]:
" ".join(train_sequences[0])

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

Notice all the sequences are shifted one step right

In [0]:
" ".join(train_sequences[1])

'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

###Tokenize the word vectors

In [0]:
from keras.preprocessing.text import Tokenizer

In [0]:
tokenizer =Tokenizer()

tokenizer.fit_on_texts(train_sequences)

In [0]:
# notice the shift in the values
sequences = tokenizer.texts_to_sequences(train_sequences)
list(zip(sequences[0], sequences[1]))

[(956, 14),
 (14, 263),
 (263, 51),
 (51, 261),
 (261, 408),
 (408, 87),
 (87, 219),
 (219, 129),
 (129, 111),
 (111, 954),
 (954, 260),
 (260, 50),
 (50, 43),
 (43, 38),
 (38, 315),
 (315, 7),
 (7, 23),
 (23, 546),
 (546, 3),
 (3, 150),
 (150, 259),
 (259, 6),
 (6, 2712),
 (2712, 14),
 (14, 24),
 (24, 957)]

In [0]:
tokenizer.index_word

{1: 'the',
 2: 'a',
 3: 'and',
 4: 'of',
 5: 'i',
 6: 'to',
 7: 'in',
 8: 'it',
 9: 'that',
 10: 'he',
 11: 'his',
 12: 'was',
 13: 'but',
 14: 'me',
 15: 'with',
 16: 'as',
 17: 'at',
 18: 'this',
 19: 'you',
 20: 'is',
 21: 'all',
 22: 'for',
 23: 'my',
 24: 'on',
 25: 'be',
 26: "'s",
 27: 'not',
 28: 'from',
 29: 'there',
 30: 'one',
 31: 'up',
 32: 'what',
 33: 'him',
 34: 'so',
 35: 'bed',
 36: 'now',
 37: 'about',
 38: 'no',
 39: 'into',
 40: 'by',
 41: 'were',
 42: 'out',
 43: 'or',
 44: 'harpooneer',
 45: 'had',
 46: 'then',
 47: 'have',
 48: 'an',
 49: 'upon',
 50: 'little',
 51: 'some',
 52: 'old',
 53: 'like',
 54: 'if',
 55: 'they',
 56: 'would',
 57: 'do',
 58: 'over',
 59: 'landlord',
 60: 'thought',
 61: 'room',
 62: 'when',
 63: 'could',
 64: "n't",
 65: 'night',
 66: 'here',
 67: 'head',
 68: 'such',
 69: 'which',
 70: 'man',
 71: 'did',
 72: 'sea',
 73: 'time',
 74: 'other',
 75: 'very',
 76: 'go',
 77: 'these',
 78: 'more',
 79: 'though',
 80: 'first',
 81: 'sort',


In [0]:
# tokenizer.word_counts
vocab_size = len(tokenizer.word_counts)
vocab_size

2717

In [0]:
import numpy as np
sequences = np.array(sequences)
sequences

array([[ 956,   14,  263, ..., 2712,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2711, ...,   53,    2, 2717],
       [ 166, 2711,    3, ...,    2, 2717,   26]])

###Create Model

In [0]:
from keras.utils import to_categorical

In [0]:
X = sequences[:, :-1]
y = sequences[:, -1]

In [0]:
y = to_categorical(y, num_classes=vocab_size+1)

In [0]:
N, seq_len = X.shape
N, seq_len

(11312, 25)

In [0]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [0]:
def create_model(vocab_size, seq_len):
  model = Sequential()
  model.add(Embedding(vocab_size, seq_len, input_length=seq_len))
  model.add(LSTM(50, return_sequences=True))
  model.add(LSTM(50))
  model.add(Dense(50, activation = "relu"))

  model.add(Dense(vocab_size, activation = "softmax"))
  model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])

  model.summary()
  return model

In [0]:
model = create_model(vocab_size+1, seq_len)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 25, 25)            67950     
_________________________________________________________________
lstm_3 (LSTM)                (None, 25, 50)            15200     
_________________________________________________________________
lstm_4 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_3 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_4 (Dense)              (None, 2718)              138618    
Total params: 244,518
Trainable params: 244,518
Non-trainable params: 0
_________________________________________________________________


In [0]:
model.fit(X, y, batch_size=128, epochs = 30)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Epoch 1/30





Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fc566ddadd8>

In [0]:
from pickle import dump, load

In [0]:
model.save("moby_model.h5")

In [0]:
dump(tokenizer, open("moby_tokenizer", "wb"))

In [0]:
from keras.preprocessing.sequence import pad_sequences

In [0]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_text):
  output_text = []
  input_text = seed_text

  for i in range(num_gen_text):
    encoded_text = tokenizer.texts_to_sequences([input_text])[0]
    pad_encoded = pad_sequences([encoded_text], maxlen = seq_len, truncating = "pre")

    pred_word_index = model.predict_classes(pad_encoded, verbose = 0)[0]

    pred_word = tokenizer.index_word[pred_word_index]
    input_text += " "+pred_word

    output_text.append(pred_word)

  return " ".join(output_text)

In [0]:
import random
rand_seed_text = train_sequences[random.randint(0, len(train_sequences))]
seed_text = " ".join(rand_seed_text)
seed_text

'considered the matter a moment and then up stairs we went and i was ushered into a small room cold as a clam and furnished sure'

In [0]:
generate_text(model, tokenizer, seq_len, seed_text, 25)

'was a room and a room and a room and a room and a room and a room and a room and a room and'

####It is not a good model I suppose. 

####So let's load a good trained model with more accuracy

In [0]:
from keras.models import load_model

In [0]:
loaded_model = load_model("epochBIG.h5")
loaded_tokenizer = load(open("epochBIG", "rb"))

In [0]:
loaded_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 25)            431400    
_________________________________________________________________
lstm_1 (LSTM)                (None, 25, 150)           105600    
_________________________________________________________________
lstm_2 (LSTM)                (None, 150)               180600    
_________________________________________________________________
dense_1 (Dense)              (None, 150)               22650     
_________________________________________________________________
dense_2 (Dense)              (None, 17256)             2605656   
Total params: 3,345,906
Trainable params: 3,345,906
Non-trainable params: 0
_________________________________________________________________


In [0]:
generate_text(loaded_model, loaded_tokenizer, seq_len, seed_text, 25)

"enough to lift her sense of bulkington it 's the devil and what 's the matter with him i am strongly than before his own"

In [0]:
import random
rand_seed_text = train_sequences[random.randint(0, len(train_sequences))]
seed_text = " ".join(rand_seed_text)
print("================= Seed Text ==========================\n")
print(seed_text)
print("\n================= Predicted Text ==========================\n")
print(generate_text(loaded_model, loaded_tokenizer, seq_len, seed_text, 25))


pot you have been lording it as a country schoolmaster making the tallest boys stand in awe of you the transition is a keen one i


assure you it is monsieur consider the more compasses we 'll not he ever propose into board to ringbolts to construct final hand on the
