In [0]:
import tensorflow as tf
import numpy as np
import os
import time

In [0]:
def token_lookup():
    """Return the punctuation-to-token table used to tokenize the corpus.

    Each key is a punctuation mark (or newline) and each value is the
    placeholder word that replaces it, so that punctuation survives
    whitespace splitting as a vocabulary entry of its own.

    Straight and curly double quotes all map to the same lower-case
    ``||quotationmark||`` token. The straight-quote token used to be
    misspelled ``||uotationmark||``, which did not match the token that text
    generation converts back into a quote character.

    NOTE(review): if the corpus contains straight double quotes, correcting
    the spelling changes the vocabulary, so weights trained with the old
    spelling may no longer match the model's vocabulary size -- confirm
    before reusing old checkpoints.

    Returns:
        dict: punctuation mark -> placeholder token, in replacement order.
    """
    return {
        '.': "||period||",
        ',': "||comma||",
        '"': "||quotationmark||",
        ';': "||semicolon||",
        '!': "||exclamationmark||",
        '?': "||questionmark||",
        '(': "||leftparentheses||",
        ')': "||rightparentheses||",
        '--': "||dash||",
        '\n': "||return||",
        # Opening and closing curly quotes share the straight-quote token.
        '“': "||quotationmark||",
        '”': "||quotationmark||",
    }

In [0]:
def create_lookup_tables(text):
    """Build the word <-> id lookup tables for a tokenized corpus.

    Args:
        text: iterable of word tokens (the corpus already split into words).

    Returns:
        tuple: ``(vocab_to_int, int_to_vocab)`` where ``vocab_to_int`` maps
        each distinct word to its position in the sorted vocabulary and
        ``int_to_vocab`` is a NumPy array holding the word at each position.
    """
    # Sorting the distinct words makes the id assignment deterministic.
    vocabulary = list(set(text))
    vocabulary.sort()

    int_to_vocab = np.array(vocabulary)
    vocab_to_int = dict(zip(vocabulary, range(len(vocabulary))))
    return vocab_to_int, int_to_vocab

In [0]:
def preprocess_data(token_lookup,create_lookup_tables,
                    paths=("got1.txt","got2.txt","got3.txt","got4.txt","got5.txt")):
    """Read the corpus files and turn them into a sequence of word ids.

    The files are read in order and concatenated, each followed by a blank
    line. Punctuation is then replaced by placeholder tokens, the text is
    lower-cased and split on whitespace, and every word is mapped to its id.

    Args:
        token_lookup: callable returning a dict of punctuation mark ->
            placeholder token.
        create_lookup_tables: callable taking the list of words and returning
            ``(word -> id mapping, id -> word sequence)``.
        paths: corpus files to read, in order. Defaults to the five files the
            notebook originally hard-coded.

    Returns:
        tuple: ``(words_as_int, word2idx, idx2word, token_dict, words)`` where
        ``words`` is the tokenized corpus and ``words_as_int`` the same
        sequence expressed as ids.
    """
    #Reading all the scripts
    parts = []
    for path in paths:
        # A context manager closes each handle promptly. The corpus contains
        # curly quotes, so decode explicitly instead of relying on the
        # platform default encoding (assumes the files are UTF-8).
        with open(path,'r',encoding="utf-8") as corpus_file:
            parts.append(corpus_file.read())

    # Every file, including the last one, is followed by a blank line.
    text = "".join(part+"\n\n" for part in parts)

    token_dict = token_lookup()

    # Pad each token with spaces so it splits into a word of its own.
    for key,token in token_dict.items():
        text = text.replace(key,' {} '.format(token))

    # Lower-casing happens after substitution, so tokens are lower-cased too.
    text = text.lower()
    text = text.split()

    word2idx,idx2word = create_lookup_tables(text)

    words_as_int = [word2idx[word] for word in text]

    return words_as_int,word2idx,idx2word,token_dict,text

In [0]:
# Run the preprocessing pipeline once; the results are module-level names
# that the cells below read.
(words_as_int, word2idx, idx2word, token_dict, words) = preprocess_data(
    token_lookup,
    create_lookup_tables,
)

In [6]:
# Vocabulary size: number of distinct tokens (words and punctuation placeholders).
print("Total Number of Unique Words {}".format(len(word2idx)))

Total Number of Unique Words 26687


In [7]:
#Cross Check
# Compare a raw-text preview with the first tokenized words.
# The file is read through a context manager so the handle is closed, and it
# is decoded explicitly (assumes UTF-8; the corpus contains curly quotes).
# NOTE(review): `words` starts with the first file read by preprocess_data
# (got1.txt), while this previews got5.txt, so the two printed snippets show
# different passages -- confirm which file was intended here.
with open("got5.txt",'r',encoding="utf-8") as corpus_file:
    text = corpus_file.read()
print(text[:35])
print(words[:7])

“We should start back,” Gared urged
['||return||', 'the', 'comet’s', 'tail', 'spread', 'across', 'the']


In [0]:
# Number of input words in each training example.
seq_length = 50

# One dataset element per word id, in corpus order.
words_dataset = tf.data.Dataset.from_tensor_slices(words_as_int)

In [9]:
# Peek at the first five ids, decoded back to words.
for word_id in words_dataset.take(5):
  print(idx2word[word_id])

||return||
the
comet’s
tail
spread


In [10]:
# Group the id stream into chunks of seq_length + 1 words: the extra word lets
# each chunk be split into an input and a one-step-shifted target.
sequences = words_dataset.batch(seq_length + 1, drop_remainder=True)

# Show the first two chunks as text.
for seq in sequences.take(2):
  chunk_words = idx2word[seq.numpy()]
  print(repr(' '.join(chunk_words)))

'||return|| the comet’s tail spread across the dawn ||comma|| a red slash that bled above the crags of dragonstone like a wound in the pink and purple sky ||period|| ||return|| ||return|| the maester stood on the windswept balcony outside his chambers ||period|| it was here the ravens came ||comma|| after long'
'flight ||period|| their droppings speckled the gargoyles that rose twelve feet tall on either side of him ||comma|| a hellhound and a wyvern ||comma|| two of the thousand that brooded over the walls of the ancient fortress ||period|| when first he came to dragonstone ||comma|| the army of stone grotesques had'


In [0]:
def split_input_target(chunk):
  """Split a chunk into an (input, target) pair shifted by one position.

  The input is every element except the last and the target is every element
  except the first, so target[i] is the element that follows input[i].
  """
  return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
batches = sequences.map(split_input_target)

In [12]:
# Print the first (input, target) pair as text; the target is the input
# shifted one word to the left.
for input_example, target_example in batches.take(1):
  for label, example in (("Input data:", input_example),
                         ("Target data:", target_example)):
    print(label, repr(" ".join(idx2word[example.numpy()])))

Input data: '||return|| the comet’s tail spread across the dawn ||comma|| a red slash that bled above the crags of dragonstone like a wound in the pink and purple sky ||period|| ||return|| ||return|| the maester stood on the windswept balcony outside his chambers ||period|| it was here the ravens came ||comma|| after'
Target data: 'the comet’s tail spread across the dawn ||comma|| a red slash that bled above the crags of dragonstone like a wound in the pink and purple sky ||period|| ||return|| ||return|| the maester stood on the windswept balcony outside his chambers ||period|| it was here the ravens came ||comma|| after long'


In [13]:
# Examples per training batch, and the size of the shuffle buffer.
Batch_Size = 128
Buffer_Size = 1000

# Shuffle the (input, target) pairs, then group them into fixed-size batches;
# the incomplete final batch is dropped.
dataset = (
    batches
    .shuffle(Buffer_Size)
    .batch(Batch_Size, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((128, 50), (128, 50)), types: (tf.int32, tf.int32)>

In [0]:
# Model hyper-parameters.
vocab_size = len(word2idx)   # one output logit per vocabulary entry
embedding_size = 400         # width of the word embedding vectors
rnn_units = [512, 256]       # units of the first and second LSTM layer

In [0]:
def build_model(vocab_Size,embedding_size,rnn_units,batch_size):
  """Build the word-level language model.

  The network is an embedding layer, two stacked stateful LSTM layers and a
  dense layer producing one logit per vocabulary entry at every time step.

  Args:
    vocab_Size: number of entries in the vocabulary; used for both the
      embedding input dimension and the width of the output layer.
    embedding_size: dimension of the word embeddings.
    rnn_units: sequence whose first two items are the unit counts of the
      first and second LSTM layer.
    batch_size: fixed batch size of the model input (the LSTM layers are
      stateful, so the batch dimension is part of the input shape).

  Returns:
    The `tf.keras.Sequential` model.
  """
  model = tf.keras.Sequential([
      tf.keras.layers.Embedding(vocab_Size,embedding_size,
                                batch_input_shape=[batch_size,None]),
      tf.keras.layers.LSTM(rnn_units[0],recurrent_initializer='glorot_uniform',
                           return_sequences=True,stateful=True,recurrent_dropout=0.15),
      tf.keras.layers.LSTM(rnn_units[1],recurrent_initializer='glorot_uniform',
                           return_sequences=True,stateful=True,recurrent_dropout=0.1),
      # Size the output layer from the argument rather than a module-level
      # global, so the function honours the vocabulary size it is given.
      tf.keras.layers.Dense(vocab_Size)
  ])

  return model

In [136]:
# Training model. The batch size is baked into the model's input shape, so
# 128 has to match the batch size used when batching `dataset`.
model = build_model(
    vocab_Size=vocab_size,
    embedding_size=embedding_size,
    rnn_units=rnn_units,
    batch_size=128,
)



In [137]:
# Smoke test: run one batch through the model and print the output shape
# (the recorded run shows (128, 50, 26687)).
for input_example_batch,input_target_batch in dataset.take(1):
  example_batch_prediction = model(input_example_batch)
  print(example_batch_prediction.shape)

(128, 50, 26687)


In [138]:
# Print the layer-by-layer summary of the training model.
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (128, None, 400)          10674800  
_________________________________________________________________
lstm_14 (LSTM)               (128, None, 512)          1869824   
_________________________________________________________________
lstm_15 (LSTM)               (128, None, 256)          787456    
_________________________________________________________________
dense_7 (Dense)              (128, None, 26687)        6858559   
Total params: 20,190,639
Trainable params: 20,190,639
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Draw one word id per time step from the logits of the first example in the
# batch, then drop the trailing sample axis and convert to a NumPy array.
sample_indices = tf.random.categorical(
    logits=example_batch_prediction[0],
    num_samples=1,
)
sampled_indices = tf.squeeze(sample_indices, axis=-1).numpy()

In [140]:
# Display the sampled word ids.
sampled_indices

array([   96, 12538, 11171, 18879,  6488, 11466, 13348, 14843,  2813,
       21572, 24122, 16380, 19012, 13688, 25938, 25486, 17907,  5281,
       25145,  1481, 20780,  7958,  4901,  4979, 19757, 19865,  9757,
       18637,  6060, 19399, 10834, 17920,   303,  1653, 12822,  4265,
        7479,  4734,  9686,  4311, 19131,  5974, 26537,  7181, 21722,
       13422, 20936,   406, 18909,  1644])

In [141]:
print(" ".join(idx2word[sampled_indices]))
# As the output shows, the model only produces gibberish at this stage
# (no training has happened in the cells above).

access laggardly hugged rookery dripped impulse loutish namesake bryen—didn’t squish tyrion petting rue— manes wine-sodden weighs ravine dare waits basted sluggishly firmly crewed crossbowman senses seven-times-damned gritted riddle dissemble scapegoat hisself rayder affirmation been—she legitimacy compounded fact cower greyguard’s condolences s-s-simple dishes —to escorted statuary lumpier snarling— aisles rosby’s beech


In [0]:
# Checkpoints go under ./model; `{epoch}` is filled in when weights are saved.
checkpoint_prefix = os.path.join("model", 'ckpt_{epoch}')

# Keras checkpoint callback (weights only, verbose logging).
chkp = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
    verbose=1,
)

In [0]:
# Adam optimizer used by train_step to apply the gradients.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [0]:
def train_step(inp, target):
  """Run one optimization step on a single batch and return its loss.

  Uses the module-level `model` and `optimizer`.

  Args:
    inp: batch of input word ids.
    target: batch of target word ids (the inputs shifted by one word).

  Returns:
    The mean sparse categorical cross-entropy over the batch.
  """
  with tf.GradientTape() as tape:
    # training=True so the recurrent dropout configured on the LSTM layers is
    # actually applied during optimization.
    predictions = model(inp, training=True)
    loss = tf.reduce_mean(
        tf.keras.losses.sparse_categorical_crossentropy(
            target, predictions, from_logits=True))

  # Only the forward pass needs to be recorded; gradients are computed and
  # applied after leaving the tape context.
  variables = model.trainable_variables
  grads = tape.gradient(loss, variables)

  # Pair each gradient with its variable *before* dropping missing gradients.
  # Filtering the gradient list on its own would shift the remaining
  # gradients onto the wrong variables.
  clipped_grads_and_vars = [
      (tf.clip_by_value(grad,-2.,2.), variable)
      for grad, variable in zip(grads, variables)
      if grad is not None
  ]
  optimizer.apply_gradients(clipped_grads_and_vars)

  return loss

In [0]:
# Resume from the most recent checkpoint when there is one. On a fresh run
# tf.train.latest_checkpoint returns None, and passing None to load_weights
# would fail, so in that case training starts from the initial weights.
latest_checkpoint = tf.train.latest_checkpoint("model")
if latest_checkpoint is not None:
  model.load_weights(latest_checkpoint)

EPOCHS=200
for epoch in range(EPOCHS):
  start=time.time()
  # Clear the LSTM states at the start of every epoch. reset_states() has no
  # useful return value, so its result is not kept.
  model.reset_states()

  for (batch_n,(inp,target)) in enumerate(dataset):

    loss = train_step(inp,target)

    if batch_n % 100 == 0:
        template = 'Epoch {} Batch {} Loss {}'
        print(template.format(epoch+1, batch_n, loss))

  # Save every fifth epoch; file names use the zero-based epoch index.
  if (epoch + 1) % 5 == 0:
      model.save_weights(checkpoint_prefix.format(epoch=epoch))
  print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss))
  print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

# Always save the final weights once training finishes.
model.save_weights(checkpoint_prefix.format(epoch=epoch))

In [0]:
# Rebuild the model for generation with a batch size of one and load the most
# recently saved weights into it.
# NOTE: this rebinds Batch_Size and model, so the training cells above must
# not be re-run after this cell without resetting them.
Batch_Size = 1

model = build_model(vocab_size, embedding_size, rnn_units, batch_size=Batch_Size)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir='model'))
model.build(tf.TensorShape([Batch_Size, None]))

In [79]:
# Print the layer-by-layer summary of the rebuilt (batch size 1) model.
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (1, None, 400)            10674800  
_________________________________________________________________
lstm_10 (LSTM)               (1, None, 512)            1869824   
_________________________________________________________________
lstm_11 (LSTM)               (1, None, 256)            787456    
_________________________________________________________________
dense_5 (Dense)              (1, None, 26687)          6858559   
Total params: 20,190,639
Trainable params: 20,190,639
Non-trainable params: 0
_________________________________________________________________


In [0]:
def generate_text(model,start_string,num_generate=40,temperature=1.0):
  """Generate text by sampling words from the model one at a time.

  Uses the module-level `word2idx` and `idx2word` lookup tables.

  Args:
    model: callable model returning per-step logits over the vocabulary; it
      must provide `reset_states()`. Presumably the stateful model rebuilt
      with a batch size of one -- confirm against the caller.
    start_string: seed text. It is split on whitespace and lower-cased for
      the vocabulary lookup (the vocabulary is lower-case).
    num_generate: number of words to generate.
    temperature: divisor applied to the logits before sampling; lower values
      make the sampling more predictable, higher values more surprising.

  Returns:
    The seed followed by the generated words, with punctuation placeholder
    tokens converted back to their characters.

  Raises:
    ValueError: if `start_string` contains no words or `temperature` is not
      positive.
    KeyError: if a seed word is not in the vocabulary.
  """
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  #Converting start string to numbers
  seed_words = start_string.lower().split()
  if not seed_words:
    raise ValueError("start_string must contain at least one word")
  unknown_words = [word for word in seed_words if word not in word2idx]
  if unknown_words:
    raise KeyError(
        "start_string contains words outside the vocabulary: {}".format(unknown_words))

  input_eval = tf.expand_dims([word2idx[word] for word in seed_words], 0)
  text_generated = []

  model.reset_states()

  for _ in range(num_generate):
    prediction = model(input_eval)

    # Drop the batch dimension, then scale the logits by the temperature.
    prediction = tf.squeeze(prediction,0)
    prediction = prediction/temperature

    # Sample from the distribution at the last time step.
    predicted_id = tf.random.categorical(prediction,num_samples=1)[-1,0].numpy()

    # Feed only the sampled word back in; the model keeps its own state.
    input_eval = tf.expand_dims([predicted_id],0)

    text_generated.append(idx2word[predicted_id])

  # Separate the seed from the first generated word, unless the caller
  # already ended the seed with whitespace.
  prefix = start_string if start_string[-1:].isspace() else start_string + " "
  text = prefix + " ".join(text_generated)

  #Converting the tokens back to their markers
  token_to_mark = (
      ("||period||", '.'),
      ("||comma||", ','),
      ("||quotationmark||", '"'),
      # Also accept the misspelled straight-quote token, which may be present
      # in vocabularies built by the module-level token_lookup().
      ("||uotationmark||", '"'),
      ("||semicolon||", ';'),
      ("||exclamationmark||", '!'),
      ("||questionmark||", '?'),
      ("||leftparentheses||", '('),
      ("||rightparentheses||", ')'),
      ("||dash||", '--'),
      ("||return||", '\n'),
  )
  for token,mark in token_to_mark:
    text = text.replace(token,mark)

  return text

In [0]:
# Generate a passage seeded with a single word.
text = generate_text(model, start_string="jon")

In [0]:
# Show the generated passage.
print(text)