In [None]:
import pandas as pd    
import numpy as np    
import string    
import os    
      
from keras.preprocessing.sequence import pad_sequences    
from keras.layers import Embedding, LSTM, Dense, Dropout    
from keras.preprocessing.text import Tokenizer    
from keras.callbacks import EarlyStopping    
from keras.models import Sequential    
import keras.utils as ku    



In [None]:
def preprocess(path="way_station.txt"):
    """Read a text file and split it into lower-cased, period-delimited pieces.

    The file's lines are joined with single spaces, newline characters and
    double quotes are removed, every "..." is collapsed to ".", the text is
    lower-cased, runs of whitespace are squeezed to one space, and the result
    is split on ".".

    Args:
        path: File to read. Defaults to "way_station.txt" so existing
            zero-argument calls behave as before.

    Returns:
        list[str]: The pieces between periods. Pieces are not stripped, so a
        piece that followed ". " starts with a space, and text ending in "."
        produces a trailing empty string.
    """
    # Context manager guarantees the handle is closed, even if reading fails.
    with open(path, "r") as f:
        raw_lines = f.readlines()

    proc_line = (
        " ".join(raw_lines)
        .replace("\n", "")
        .replace("\"", "")
        .replace("...", ".")
        .lower()
    )
    # split()/join collapses any run of whitespace into a single space.
    no_white_line = " ".join(proc_line.split())
    return no_white_line.split(".")

In [None]:
def get_sequence_of_tokens(tokenizer, corpus):
    """Fit the tokenizer on the corpus and build growing token prefixes.

    For every line, the encoded token list is expanded into all of its
    prefixes of length 2 and up, e.g. [a, b, c] -> [a, b], [a, b, c].
    Lines that encode to fewer than two tokens contribute nothing.

    Args:
        tokenizer: Object exposing ``fit_on_texts``, ``word_index`` and
            ``texts_to_sequences``; it is fitted in place by this call.
        corpus: Iterable of text lines.

    Returns:
        tuple: ``(prefixes, vocab_size)`` where ``prefixes`` is a list of
        token-id lists and ``vocab_size`` is ``len(word_index) + 1``
        (presumably the extra slot is the padding index 0 -- confirm).
    """
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index) + 1

    prefixes = []
    for sentence in corpus:
        encoded = tokenizer.texts_to_sequences([sentence])[0]
        # Every prefix with at least two tokens, shortest first.
        prefixes.extend(encoded[:stop] for stop in range(2, len(encoded) + 1))
    return prefixes, vocab_size


In [None]:
def generate_padded_sequences(input_sequences, total_words):
    """Pad the sequences to a common length and split them into inputs/targets.

    All sequences are padded (``padding='pre'``) to the length of the longest
    one. The last column of the padded matrix becomes the target, encoded with
    ``to_categorical`` over ``total_words`` classes; the remaining columns are
    the predictors.

    Args:
        input_sequences: Non-empty list of token-id sequences.
        total_words: Number of classes for the categorical targets.

    Returns:
        tuple: ``(predictors, label, max_sequence_len)``.
    """
    longest = max(map(len, input_sequences))
    padded = pad_sequences(input_sequences, maxlen=longest, padding='pre')
    padded = np.array(padded)

    # Everything but the final token is the input; the final token is the target.
    features = padded[:, :-1]
    targets = ku.to_categorical(padded[:, -1], num_classes=total_words)
    return features, targets, longest


In [None]:
def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile the Embedding -> LSTM -> Dropout -> Dense model.

    Args:
        max_sequence_len: Length of the padded sequences including the target
            token; the model input length is ``max_sequence_len - 1``.
        total_words: Vocabulary size, used as the embedding input dimension
            and as the width of the softmax output layer.
        embedding_dim: Size of each embedding vector (default 10).
        lstm_units: Number of units in the LSTM layer (default 100).
        dropout_rate: Rate passed to the Dropout layer (default 0.1).

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer).
    """
    # The last token of each padded sequence is the label, so inputs are one shorter.
    input_len = max_sequence_len - 1

    model = Sequential()
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model


In [None]:
def generate_text(tokenizer, seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by repeatedly appending the model's most likely next word.

    Args:
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index`` (word -> integer id).
        seed_text: Starting text; it is re-encoded in full on every step.
        next_words: Number of words to append.
        model: Trained model whose ``predict`` returns one score per
            vocabulary index for each input row.
        max_sequence_len: Padded training length including the target token;
            inputs are padded/truncated to ``max_sequence_len - 1``.

    Returns:
        str: ``seed_text`` followed by the generated words, space-separated.
        If a predicted index has no entry in ``word_index``, an empty word
        (i.e. just a space) is appended for that step.
    """
    # Invert the vocabulary once rather than scanning word_index for every word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    input_len = max_sequence_len - 1

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        padded = pad_sequences([token_list], maxlen=input_len, padding='pre')

        # Sequential.predict_classes no longer exists in recent TensorFlow/Keras;
        # taking the argmax over predict() output is the documented replacement.
        scores = model.predict(padded, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])

        seed_text += " " + index_to_word.get(predicted, "")
    return seed_text

In [None]:
def main():
    """Run the pipeline end to end: preprocess, train, then print samples.

    The corpus is tokenized into padded n-gram prefixes, a model is built,
    summarized and fitted for 50 epochs, and each seed prompt is then
    continued twice: first with 20 generated words, then with a shorter,
    prompt-specific number of words.
    """
    corpus = preprocess()
    tokenizer = Tokenizer()
    seqs, words = get_sequence_of_tokens(tokenizer, corpus)
    predictors, label, max_sequence_len = generate_padded_sequences(seqs, words)

    model = create_model(max_sequence_len, words)
    model.summary()
    model.fit(predictors, label, epochs=50)

    prompts = (
        "He loved messing around with",
        "He went down the hill and",
        "Enoch Wallace fired and reloaded",
        "Somewhere in the distance was the sound",
        "The postman was not coming early today, because",
        "The dawn was early today",
    )
    # First pass: 20 words for every prompt; second pass: per-prompt lengths.
    passes = (
        (20,) * len(prompts),
        (10, 8, 6, 6, 3, 5),
    )
    for lengths in passes:
        for prompt, n_words in zip(prompts, lengths):
            print(generate_text(tokenizer, prompt, n_words, model, max_sequence_len))

# Running this cell executes the whole pipeline, including model training inside main().
main()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 161, 10)           43150     
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 4315)              435815    
Total params: 523,365
Trainable params: 523,365
Non-trainable params: 0
_________________________________________________________________
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch

In [None]:
# IPython line magic (not plain Python); presumably the Colab switch that selects TensorFlow 2.x -- confirm.
# NOTE(review): in notebook order this check appears after the cell that calls main(),
# so training has already run by the time it executes.
%tensorflow_version 2.x
import tensorflow as tf
# Device name reported by TensorFlow for the GPU.
device_name = tf.test.gpu_device_name()
# Abort unless the reported device is exactly the first GPU.
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0
