In [0]:
import tensorflow as tf
import numpy as np
import os
import time

In [0]:
def token_lookup():
    """Return the punctuation-to-placeholder table used during preprocessing.

    Each key is a punctuation mark (or the two-character dash / a newline) and
    each value is the ``||name||`` word that stands in for it, so punctuation
    can be treated as ordinary whitespace-separated words.

    Returns:
        dict: a freshly built mapping on every call. Both curly quotation
        marks share the ``||quotationMark||`` placeholder.
    """
    return {
        '.': "||period||",
        ',': "||comma||",
        '"': "||quotationmark||",
        ';': "||semicolon||",
        '!': "||exclamationmark||",
        '?': "||questionmark||",
        '(': "||leftparentheses||",
        ')': "||rightparentheses||",
        '--': "||dash||",
        '\n': "||return||",
        '“': "||quotationMark||",
        '”': "||quotationMark||",
    }

In [0]:

def create_lookup_tables(text):
    """Build the word <-> integer id tables for a tokenised corpus.

    Args:
        text: iterable of words (the corpus already split into tokens).

    Returns:
        tuple: ``(vocab_to_int, int_to_vocab)`` where ``vocab_to_int`` is a
        dict mapping each distinct word to its rank in sorted order and
        ``int_to_vocab`` is a NumPy array holding the words in that same
        order, so ``int_to_vocab[vocab_to_int[w]] == w``.
    """
    unique_words = list(set(text))
    unique_words.sort()
    int_to_vocab = np.array(unique_words)
    vocab_to_int = dict(zip(unique_words, range(len(unique_words))))
    return vocab_to_int, int_to_vocab

In [0]:
def preprocess_data(token_lookup,create_lookup_tables,
                    file_paths=("got1.txt","got2.txt","got3.txt","got4.txt","got5.txt")):
    """Read the scripts, tokenise them and encode every word as an integer id.

    Args:
        token_lookup: zero-argument callable returning a dict that maps
            punctuation strings to placeholder tokens.
        create_lookup_tables: callable taking the list of words and returning
            ``(word2idx, idx2word)``.
        file_paths: paths of the text files to read, in corpus order.
            Defaults to the five script files expected in the working
            directory.

    Returns:
        tuple: ``(words_as_int, word2idx, idx2word, token_dict, words)`` where
        ``words`` is the lower-cased list of tokens and ``words_as_int`` is
        the same sequence encoded through ``word2idx``.

    Raises:
        OSError: if one of the files cannot be opened.
    """
    # Read each script inside a context manager so its handle is closed
    # promptly, and decode explicitly as UTF-8: the corpus contains curly
    # quotes, which a platform-dependent default encoding may not decode.
    scripts = []
    for path in file_paths:
        with open(path,'r',encoding='utf-8') as script_file:
            scripts.append(script_file.read())

    # Every script is followed by a blank line, including the last one.
    text = "".join(script+"\n\n" for script in scripts)

    token_dict = token_lookup()

    # Surround each placeholder with spaces so split() isolates it as a word.
    for key,token in token_dict.items():
        text = text.replace(key,' {} '.format(token))

    # Lower-casing happens after replacement, so placeholders end up lower-case too.
    text = text.lower().split()

    word2idx,idx2word = create_lookup_tables(text)

    words_as_int = [word2idx[word] for word in text]

    return words_as_int,word2idx,idx2word,token_dict,text

In [0]:
# Run the preprocessing once; `words` is the lower-cased token list and
# `words_as_int` the same sequence encoded through `word2idx`.
words_as_int,word2idx,idx2word,token_dict,words = preprocess_data(token_lookup,create_lookup_tables)

In [6]:
# Vocabulary size: distinct tokens, which includes the punctuation placeholders.
print("Total Number of Unique Words {}".format(len(word2idx)))

Total Number of Unique Words 26687


In [7]:
# Cross-check: show the raw start of one script next to the first preprocessed tokens.
# NOTE(review): `words` begins with the contents of got1.txt (the first file
# concatenated during preprocessing) while the raw preview below is taken from
# got5.txt, so the two previews are not expected to line up.
# The file is read in a context manager so the handle is closed, and decoded
# explicitly as UTF-8 because the text contains curly quotes.
with open("got5.txt",'r',encoding='utf-8') as script_file:
  text = script_file.read()
print(text[:35])
print(words[:7])

“We should start back,” Gared urged
['||return||', 'the', 'comet’s', 'tail', 'spread', 'across', 'the']


In [0]:
# Number of input words per training example; chunks are cut with seq_len+1
# words so that the target can be the input shifted by one word.
seq_len = 50
# Dataset that yields the encoded corpus one word id at a time.
words_dataset = tf.data.Dataset.from_tensor_slices(words_as_int)

In [9]:
# Peek at the first five ids, decoded back to words through idx2word.
for i in words_dataset.take(5):
  print(idx2word[i])

||return||
the
comet’s
tail
spread


In [10]:
# Group consecutive word ids into chunks of seq_len+1; drop_remainder=True
# discards a final chunk that would be shorter than that.
sequences = words_dataset.batch(seq_len+1,drop_remainder=True)

# Show the first chunk decoded back to text.
for seq in sequences.take(1):
  print(repr(' '.join(idx2word[seq.numpy()])))

'||return|| the comet’s tail spread across the dawn ||comma|| a red slash that bled above the crags of dragonstone like a wound in the pink and purple sky ||period|| ||return|| ||return|| the maester stood on the windswept balcony outside his chambers ||period|| it was here the ravens came ||comma|| after long'


In [0]:
def split_input_target(chunk):
  """Split a chunk into an (input, target) pair offset by one position.

  Args:
    chunk: any sliceable sequence of length n.

  Returns:
    tuple: ``(chunk[:-1], chunk[1:])`` -- the input is everything but the last
    element and the target is everything but the first, so target[i] is the
    element that follows input[i].
  """
  return chunk[:-1],chunk[1:]

# Turn every (seq_len+1)-word chunk into an (input, target) pair.
batches = sequences.map(split_input_target)

In [12]:
# Show the first (input, target) pair decoded to text; the target is the
# input shifted forward by one word.
for input_example,target_example in batches.take(1):
  print("Input data:",repr(" ".join(idx2word[input_example.numpy()])))
  print("Target data:",repr(" ".join(idx2word[target_example.numpy()])))

Input data: '||return|| the comet’s tail spread across the dawn ||comma|| a red slash that bled above the crags of dragonstone like a wound in the pink and purple sky ||period|| ||return|| ||return|| the maester stood on the windswept balcony outside his chambers ||period|| it was here the ravens came ||comma|| after'
Target data: 'the comet’s tail spread across the dawn ||comma|| a red slash that bled above the crags of dragonstone like a wound in the pink and purple sky ||period|| ||return|| ||return|| the maester stood on the windswept balcony outside his chambers ||period|| it was here the ravens came ||comma|| after long'


In [13]:
# Number of (input, target) sequences per training batch.
Batch_Size = 128

# Size of the buffer that shuffle() draws elements from.
Buffer_Size= 1000

# drop_remainder=True keeps every batch at exactly Batch_Size rows, which the
# model's fixed-size zero initial states rely on.
dataset = batches.shuffle(Buffer_Size).batch(Batch_Size,drop_remainder=True)

dataset


<BatchDataset shapes: ((128, 50), (128, 50)), types: (tf.int32, tf.int32)>

In [0]:
# Number of distinct tokens; sizes the embedding table and the output layer.
vocab_size = len(word2idx)

# Width of each word embedding vector.
embedding_size = 400

# LSTM widths: [0] is used by the two parallel LSTMs, [1] by the final LSTM.
lstm_units = [512,256]



In [0]:
class model(tf.keras.Model):
  """Word-level next-word predictor: embedding, three LSTMs and a dense layer.

  The embedded input feeds two LSTMs of ``lstm_units[0]`` units each (the
  second is created with ``go_backwards=True``). Their outputs are
  concatenated on the feature axis, passed through a third LSTM of
  ``lstm_units[1]`` units and projected to ``vocab_size`` logits per time step.
  """
  def __init__(self,vocab_size,embedding_dim,lstm_units,batch_sz):
    """Create the layers.

    Args:
      vocab_size: number of distinct tokens; sizes the embedding table and
        the output layer.
      embedding_dim: width of each token embedding.
      lstm_units: two-element sequence; ``[0]`` is the unit count of the two
        parallel LSTMs and ``[1]`` that of the final LSTM.
      batch_sz: batch size used when building the zero initial states.
    """
    # Zero-argument super(): the explicit form `super(model, self)` looks the
    # class up through the global name `model` at call time, which raises a
    # TypeError once that name has been rebound to an instance.
    super().__init__()
    self.lstm_units = lstm_units
    self.batch_sz = batch_sz
    self.embedding = tf.keras.layers.Embedding(vocab_size,embedding_dim)
    # The layers are created in a fixed order; keep it stable so the
    # auto-generated layer names and saved checkpoints stay compatible.
    self.lstm_1 = tf.keras.layers.LSTM(self.lstm_units[0],
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform',
                                       recurrent_dropout = 0.20)

    # Same width as lstm_1 but reads the sequence from the last step to the first.
    self.lstm_2 = tf.keras.layers.LSTM(self.lstm_units[0],
                                       return_sequences=True,
                                       return_state=True,
                                       go_backwards=True,
                                       recurrent_dropout=0.20)
    self.lstm_3 = tf.keras.layers.LSTM(self.lstm_units[1],
                                       return_sequences=True,
                                       return_state=True,
                                       )

    # Projects every time step to one logit per vocabulary entry.
    self.dense_3 = tf.keras.layers.Dense(vocab_size)


  def call(self,inp,states_1,states_2,states_3):
    """Run a forward pass.

    Args:
      inp: batch of word-id sequences.
      states_1: two-element list used as the initial state of ``lstm_1``.
      states_2: two-element list used as the initial state of ``lstm_2``.
      states_3: two-element list used as the initial state of ``lstm_3``.

    Returns:
      tuple: ``(output, states_1, states_2, states_3)`` where ``output`` holds
      the logits for every time step and the state lists hold the final
      states of the three LSTMs.

    Side effect: the three state lists passed in are updated IN PLACE with the
    new states, so a caller that ignores the returned states still sees them
    change.
    """
    x = self.embedding(inp)
    # Two LSTMs read the same embeddings, one in input order and one backwards.
    # NOTE(review): out_2 comes from a go_backwards layer and is concatenated
    # without being reversed back into input order -- confirm this alignment is
    # intended (changing it would alter the behaviour of trained checkpoints).
    out_1,states_1[0],states_1[1] = self.lstm_1(x,initial_state=states_1)
    out_2,states_2[0],states_2[1] = self.lstm_2(x,initial_state=states_2)
    # Join the two outputs along the feature axis.
    out_concat = tf.concat([out_1,out_2],axis=2)
    # Third LSTM on top of the concatenated features.
    out_3,states_3[0],states_3[1] = self.lstm_3(out_concat,initial_state=states_3)

    # Final layer: logits over the vocabulary for every time step.
    output = self.dense_3(out_3)

    return output,states_1,states_2,states_3

  def initialize_cell_states(self):
        """Return zero initial states for the three LSTMs.

        Returns:
          tuple: ``(states_1, states_2, states_3)``; each is a list of two
          zero tensors of shape ``(batch_sz, units)`` -- the pair an LSTM
          accepts as ``initial_state`` -- with ``units`` matching the layer.
        """
        states_1 = [tf.zeros([self.batch_sz,self.lstm_units[0]]),tf.zeros([self.batch_sz,self.lstm_units[0]])]
        states_2 = [tf.zeros([self.batch_sz,self.lstm_units[0]]),tf.zeros([self.batch_sz,self.lstm_units[0]])]
        states_3 = [tf.zeros([self.batch_sz,self.lstm_units[1]]),tf.zeros([self.batch_sz,self.lstm_units[1]])]
        return (states_1,states_2,states_3)


In [16]:
# Build the training network.
# NOTE: this rebinds the name `model` from the class to an instance; from here
# on the class is no longer reachable through that name.
model = model(vocab_size,embedding_size,lstm_units,Batch_Size)

# Zero initial states sized for Batch_Size.
states_1,states_2,states_3 = model.initialize_cell_states()



In [17]:
# Smoke test: push one batch through the network and print the shape of the
# returned logits. The global states_1/2/3 are reassigned to the returned states.
for input_example_batch,input_target_batch in dataset.take(1):
  example_batch_prediction,states_1,states_2,states_3 = model(
                                        input_example_batch,states_1,states_2,states_3)
  print(example_batch_prediction.shape)

(128, 50, 26687)


In [18]:
# Layer-by-layer parameter summary of the network.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  10674800  
_________________________________________________________________
lstm (LSTM)                  multiple                  1869824   
_________________________________________________________________
lstm_1 (LSTM)                multiple                  1869824   
_________________________________________________________________
lstm_2 (LSTM)                multiple                  1311744   
_________________________________________________________________
dense (Dense)                multiple                  6858559   
Total params: 22,584,751
Trainable params: 22,584,751
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Resume from the newest checkpoint in the "model" directory when there is one.
# On a first run no checkpoint exists yet: tf.train.latest_checkpoint then
# returns None, and passing None to load_weights would raise, so training
# simply starts from the freshly initialised weights instead.
latest_checkpoint = tf.train.latest_checkpoint("model")
if latest_checkpoint is not None:
  model.load_weights(latest_checkpoint)
optimizer = tf.keras.optimizers.Adam()

def train_step(inp,targ,states_1,states_2,states_3):
  """Run one optimisation step on a batch and return its mean loss.

  Uses the module-level ``model`` and ``optimizer``.

  Args:
    inp: batch of input word-id sequences.
    targ: batch of target word-id sequences (the inputs shifted by one word).
    states_1: initial state list for the model's first LSTM.
    states_2: initial state list for the model's second LSTM.
    states_3: initial state list for the model's third LSTM.

  Returns:
    Scalar tensor: sparse categorical cross-entropy (computed from logits)
    averaged over the batch.
  """
  with tf.GradientTape() as tape:
    output,states_1,states_2,states_3 = model(inp,states_1,states_2,states_3)
    loss = tf.reduce_mean(
        tf.keras.losses.sparse_categorical_crossentropy(targ,output,from_logits=True)
    )
  variables = model.trainable_variables
  grads = tape.gradient(loss,variables)
  # Pair each gradient with its variable BEFORE dropping None gradients;
  # filtering first would shift the remaining gradients onto the wrong variables.
  # Every gradient element is clipped to [-1, 1].
  clipped_grads_and_vars = [
      (tf.clip_by_value(grad,-1.,1.),var)
      for grad,var in zip(grads,variables)
      if grad is not None
  ]
  optimizer.apply_gradients(clipped_grads_and_vars)

  return loss

In [0]:
# Checkpoint path template inside the "model" directory; the "{epoch}"
# placeholder is filled in with str.format when the weights are saved.
checkpoint_prefix = os.path.join("model","{epoch}")

In [0]:
EPOCHS = 100
# Not read by the loop below, which simply iterates over `dataset`.
steps_per_epoch = len(words_as_int)//Batch_Size
template = 'Epoch {} Batch {} Loss {}'

for epoch in range(EPOCHS):
  start = time.time()
  # Start every epoch from zero LSTM states.
  states_1,states_2,states_3 = model.initialize_cell_states()

  for batch_n,(inp,targ) in enumerate(dataset):
    loss = train_step(inp,targ,states_1,states_2,states_3)
    # Progress line every 50 batches (including batch 0).
    if not batch_n % 50:
      print(template.format(epoch+1, batch_n, loss))

  # Checkpoint every fifth epoch; the file name uses the zero-based epoch index.
  if not (epoch + 1) % 5:
    model.save_weights(checkpoint_prefix.format(epoch=epoch))

  # `loss` here is the loss of the last batch of the epoch.
  print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss))
  print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

# Always save once more after the final epoch.
model.save_weights(checkpoint_prefix.format(epoch=epoch))

In [0]:
# Additionally save the current weights under the relative path prefix "custom".
model.save_weights("custom")

In [16]:
# Batch-size-1 copy of the network for text generation.
# The name `model` is rebound from the class to an instance by the training
# cell, and calling an instance would run a forward pass rather than build a
# new network. Resolve the class object explicitly so this line targets the
# constructor whether `model` currently names the class or an instance.
model_cls = model if isinstance(model, type) else type(model)
model1 = model_cls(vocab_size,embedding_size,lstm_units,1)



In [17]:
# Load the most recent checkpoint from the "model" directory into the inference network.
# NOTE(review): presumably tf.train.latest_checkpoint yields None when the
# directory holds no checkpoint, in which case this call fails -- confirm.
model1.load_weights(tf.train.latest_checkpoint("model"))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f23a012d630>

In [0]:
def generate_text(model,s_word,num_generate=200,temperature=1.0):
  """Sample a continuation of ``s_word`` from ``model``, one word at a time.

  Uses the module-level ``word2idx`` / ``idx2word`` tables and ``token_lookup``.

  Args:
    model: network exposing ``initialize_cell_states()`` and callable as
      ``model(ids, states_1, states_2, states_3)``, returning the logits
      followed by the three updated state lists.
    s_word: seed text; one or more whitespace-separated words, each of which
      must be in the vocabulary (looked up in lower case).
    num_generate: number of words to sample after the seed.
    temperature: the logits are divided by this value before sampling; values
      below 1 sharpen the distribution, values above 1 flatten it.

  Returns:
    str: the seed followed by the generated words, with punctuation
    placeholders converted back to their punctuation marks.

  Raises:
    ValueError: if ``temperature`` is not positive or the seed has no words.
    KeyError: if a seed word is not in the vocabulary.
  """
  if temperature <= 0:
    raise ValueError("temperature must be positive, got {}".format(temperature))

  # The vocabulary is lower-cased during preprocessing, so look the seed up in
  # lower case; split() without an argument also tolerates repeated spaces.
  seed_words = s_word.lower().split()
  if not seed_words:
    raise ValueError("s_word must contain at least one word")
  unknown_words = [word for word in seed_words if word not in word2idx]
  if unknown_words:
    raise KeyError("seed word(s) not in vocabulary: {}".format(unknown_words))

  # Add a leading batch dimension of 1.
  input_eval = tf.expand_dims([word2idx[word] for word in seed_words],0)
  text_generated = []
  states_1,states_2,states_3 = model.initialize_cell_states()
  for _ in range(num_generate):
    prediction,states_1,states_2,states_3 = model(
        input_eval,states_1,states_2,states_3)
    # Only the last time step predicts the next word, so sample from it alone.
    last_logits = prediction[:,-1,:]/temperature
    prediction_id = tf.random.categorical(last_logits,num_samples=1)[0,0].numpy()

    # The sampled word becomes the next input; the states carry the context.
    input_eval = tf.expand_dims([prediction_id],0)
    text_generated.append(idx2word[prediction_id])

  text = s_word+ " "+" ".join(text_generated)

  # Convert the placeholders back to their punctuation marks, reusing the
  # module-level table rather than a second copy of it. Entries whose
  # placeholder contains an upper-case letter never match the lower-cased
  # generated words, so they are harmless here.
  for key,token in token_lookup().items():
    text = text.replace(token,key)

  return text

In [0]:
# Generate a sample with the batch-size-1 network, seeded with the single word "jon".
text = generate_text(model1,u"jon")

In [20]:
print(text) # The model has plenty of capacity, but it needs to be trained for longer to produce good results;
# for now it has only been trained for 50 epochs.

jon had checked the corpse with the rangers in the look of the children of the handholds and drogo . haggo watched , and their voices shouted in their knees , as she reeled bonelessly to the table . and lay a pox on its wound . when she heard the help , she had missed bran but rise before they might die . there was something so terrifying that he was not doing that it would take some ways at mud , and then she does the weaken ; the young septon did not mean to lift the fist to return to an old one with dice . 
 
 father had been holding eggs and heart danced in the whispering iron feel out , hummocks began to play off . 
 
 lord edmure prompted . 
 
 everyone waved up as he sipped petals from his stable , drawing for blood and gold expanded cask of dragons in reserve and great men in the common room . yet she would be back . dany had sent breath and threw her head again , and many mounted mates in chains . at evenfall , it’s nineteen if it happened
