In [None]:
# import
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt

In [None]:
# Location of the raw transcript corpus that the model is trained on.
path_to_file = "/Data/Friends_Transcript.txt"

In [None]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8. Binary mode
# keeps the original line endings untouched, and the `with` block guarantees
# the file handle is closed even if reading or decoding fails.
with open(path_to_file, 'rb') as transcript_file:
    text = transcript_file.read().decode(encoding='utf-8')
print('Length of Character {}'.format(len(text)))

Length of Character 4965384


In [None]:
# Preview the first 450 characters of the corpus.
print(text[:450])

THE ONE WHERE MONICA GETS A NEW ROOMATE (THE PILOT-THE UNCUT VERSION)
Written by: Marta Kauffman & David Crane
[Scene: Central Perk, Chandler, Joey, Phoebe, and Monica are there.]
Monica: There's nothing to tell! He's just some guy I work with!
Joey: C'mon, you're going out with the guy! There's gotta be something wrong with him!
Chandler: All right Joey, be nice. So does he have a hump? A hump and a hairpiece?
Phoebe: Wait, does he eat cha


In [None]:
# The vocabulary is the sorted list of distinct characters found in the corpus.
vocab = sorted(set(text))
print('{} unique characters'.format(len(vocab)))

95 unique characters


In [None]:
# Build lookup tables in both directions: character -> integer id (dict) and
# integer id -> character (NumPy array indexed by id).
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the whole corpus as an array of integer ids.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [None]:
# Sanity check: show the first 10 characters next to their integer encoding.
print("{} is mapped to {}".format(text[:10], text_as_int[:10]))

THE ONE WH is mapped to [54 42 39  2 49 48 39  2 57 42]


In [None]:
# Maximum length (in characters) of each input sequence fed to the RNN
seq_length =100
# Number of non-overlapping chunks of seq_length+1 characters that fit in the corpus
examples_per_epoch = len(text)//(seq_length+1)
print(examples_per_epoch)

49162


In [None]:
# Creating dataset
# from_tensor_slices builds a tf.data.Dataset that yields the encoded corpus
# one element (one character id) at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [None]:
# .take(n) limits the dataset to its first n elements (similar to DataFrame.head(n) in pandas)
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

T
H
E
 
O


In [None]:
# .batch groups consecutive elements into chunks of a fixed size; here each chunk
# holds seq_length+1 character ids. drop_remainder=True discards a final chunk
# that would be shorter than that.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode and show the first five chunks.
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

'THE ONE WHERE MONICA GETS A NEW ROOMATE (THE PILOT-THE UNCUT VERSION)\r\nWritten by: Marta Kauffman & D'
"avid Crane\r\n[Scene: Central Perk, Chandler, Joey, Phoebe, and Monica are there.]\r\nMonica: There's not"
"hing to tell! He's just some guy I work with!\r\nJoey: C'mon, you're going out with the guy! There's go"
'tta be something wrong with him!\r\nChandler: All right Joey, be nice. So does he have a hump? A hump a'
"nd a hairpiece?\r\nPhoebe: Wait, does he eat chalk?\r\n(They all stare, bemused.)\r\nPhoebe: Just, 'cause, "


In [None]:
def split_input_target(chunk):
    """
    Split a chunk into an (input, target) pair shifted by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i] in the original chunk.
    """
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [None]:
# Decode the first (input, target) pair to check that the target is the input
# shifted by one character.
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'THE ONE WHERE MONICA GETS A NEW ROOMATE (THE PILOT-THE UNCUT VERSION)\r\nWritten by: Marta Kauffman & '
Target data: 'HE ONE WHERE MONICA GETS A NEW ROOMATE (THE PILOT-THE UNCUT VERSION)\r\nWritten by: Marta Kauffman & D'


In [None]:
# Walk through the first five time steps: at each step one character id is the
# input and the id of the character that follows it is the expected output.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 54 ('T')
  expected output: 42 ('H')
Step    1
  input: 42 ('H')
  expected output: 39 ('E')
Step    2
  input: 39 ('E')
  expected output: 2 (' ')
Step    3
  input: 2 (' ')
  expected output: 49 ('O')
Step    4
  input: 49 ('O')
  expected output: 48 ('N')


In [None]:
# Now we need to shuffle the data and pack into batches
BATCH_SIZE = 64
# Number of elements held in the shuffle buffer; shuffling samples from this
# buffer rather than from the whole dataset at once
BUFFER_SIZE = 10000

# drop_remainder=True keeps every batch at exactly BATCH_SIZE examples.
# NOTE(review): this cell rebinds `dataset`, so running it twice batches the
# already-batched dataset again; re-run the cell that creates `dataset` first.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset

<BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [None]:
#Constants for model
vocab_size = len(vocab)  # number of distinct characters: embedding input size and output logits
embedding_dim = 256  # width of the character embedding vectors
rnn_units = 1024  # number of units in each GRU layer

#TODO: Use LSTM Layer also

In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level model: embedding, two stateful GRUs, dense logits.

    Args:
        vocab_size: Number of distinct characters; used as the embedding input
            size and as the number of output logits per time step.
        embedding_dim: Width of the embedding vectors.
        rnn_units: Number of units in each GRU layer.
        batch_size: Fixed batch size baked into the input shape (the GRU
            layers are stateful); the sequence length is left unspecified.

    Returns:
        An uncompiled tf.keras.Sequential model.
    """
    layers = tf.keras.layers

    def stateful_gru():
        # Both recurrent layers share exactly the same configuration.
        return layers.GRU(
            rnn_units,
            return_sequences=True,
            stateful=True,
            recurrent_initializer='glorot_uniform',
        )

    model = tf.keras.Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]))
    model.add(stateful_gru())
    model.add(stateful_gru())
    model.add(layers.Dense(vocab_size))
    return model

In [None]:
# Training model, built for batches of BATCH_SIZE sequences.
model = build_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)

In [None]:
# Run a single batch through the model to check the shape of its output.
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 95) # (batch_size, sequence_length, vocab_size)


In [None]:
# NOTE(review): the stored output of this cell lists a single GRU layer while
# build_model stacks two, so it appears to be stale; re-run to refresh it.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           24320     
                                                                 
 gru (GRU)                   (64, None, 1024)          3938304   
                                                                 
 dense (Dense)               (64, None, 95)            97375     
                                                                 
Total params: 4,059,999
Trainable params: 4,059,999
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Sample one character id per time step for the first sequence in the batch;
# sampling picks the next character randomly according to its predicted probability
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# tf.random.categorical returns shape (time_steps, num_samples); with
# num_samples=1 the last axis has size 1, so squeeze drops it and leaves a
# flat vector of character ids
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
sampled_indices


array([25, 11, 47, 63,  7, 50, 71,  2, 68, 38, 48, 87, 69, 44,  0, 35, 69,
       23, 78, 76,  4, 74, 36,  1, 45, 52, 83, 27, 39, 69, 83, 72,  0, 50,
       54, 12, 33,  1, 14, 81, 72, 17, 92, 29, 23, 92,  9, 87, 21, 18,  7,
       29, 29, 82, 17, 77, 17, 16, 21, 86, 40, 37, 82, 76, 87, 58, 17, 34,
       75, 25, 28, 66, 11,  6, 94, 55, 30,  4, 88, 57, 30, 10, 61,  7, 91,
       79, 46, 59, 13, 80, 38, 85, 30, 53, 37, 27, 28, 69, 24, 39])

In [None]:
# Decode the input sequence and the sampled ids back into text to compare them
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 "know what? Just did.  CHANDLER: Really? Wow. That's some pretty powerful imaginary sperm you must ha"

Next Char Predictions: 
 '7)M^%Pf cDNvdJ\nAd5mk"iB\rKRr9Edrg\nPT*?\r,pg/{;5{\'v30%;;q/l/.3uFCqkvX/@j7:a)$}U<"wW<([%znLY+oDt<SC9:d6E'


In [None]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    The model's last layer has no activation, so its outputs are passed as
    logits (from_logits=True).
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

In [None]:
# Configure training: Adam optimizer with the custom `loss` function.
model.compile(optimizer='adam', loss=loss)

In [None]:
# Directory where the checkpoints will be saved
checkpoint_dir = '/content/training_checkpoints'
# Name of the checkpoint files; the {epoch} placeholder is filled in by the
# callback so each epoch gets its own checkpoint
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback passed to model.fit that saves only the weights, not the whole model
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
# Resume from a previously saved checkpoint. The path is derived from
# checkpoint_dir so it stays in sync with where the checkpoint callback writes.
model.load_weights(os.path.join(checkpoint_dir, 'ckpt_21'))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f5b80030110>

In [None]:
# Number of passes over the dataset for this training run
EPOCHS = 16
# NOTE(review): no initial_epoch is passed, so when training resumes from loaded
# weights the {epoch} in the checkpoint name presumably starts again at 1 —
# confirm that earlier ckpt_N files are not overwritten unintentionally.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
 27/768 [>.............................] - ETA: 44s - loss: 1.7675

KeyboardInterrupt: ignored

In [None]:
# Path of the most recently written checkpoint in checkpoint_dir
tf.train.latest_checkpoint(checkpoint_dir)

'/content/training_checkpoints/ckpt_14'

In [None]:
# Rebuild the network with batch size 1 for text generation: the GRU layers are
# stateful, so the batch size is fixed when the model is constructed.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the most recent checkpoint instead of a hard-coded file name, so the
# inference model always reflects the latest completed training epoch.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError('No checkpoint found in {}'.format(checkpoint_dir))
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

In [None]:
# Inspect the rebuilt inference model (batch size 1).
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (1, None, 256)            24320     
                                                                 
 gru_4 (GRU)                 (1, None, 1024)           3938304   
                                                                 
 gru_5 (GRU)                 (1, None, 1024)           6297600   
                                                                 
 dense_2 (Dense)             (1, None, 95)             97375     
                                                                 
Total params: 10,357,599
Trainable params: 10,357,599
Non-trainable params: 0
_________________________________________________________________


In [None]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """Generate text by sampling from the model one character at a time.

  Args:
    model: Stateful character-level model built for batch size 1; called with
      a (1, time_steps) tensor of character ids and expected to return logits.
    start_string: Non-empty seed text. Every character must be a key of the
      module-level `char2idx` mapping.
    num_generate: Number of characters to generate after the seed.
    temperature: Positive value the logits are divided by before sampling.
      Values below 1 sharpen the distribution (more predictable text), values
      above 1 flatten it (more surprising text).

  Returns:
    `start_string` followed by the generated characters.

  Raises:
    ValueError: If `start_string` is empty or `temperature` is not positive.
    KeyError: If `start_string` contains a character missing from `char2idx`.
  """
  if not start_string:
    raise ValueError('start_string must contain at least one character')
  if temperature <= 0:
    raise ValueError('temperature must be positive')

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Characters produced so far
  text_generated = []

  # Here batch size == 1; clear any recurrent state left from a previous call
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # Drop the batch dimension: (1, time_steps, vocab) -> (time_steps, vocab)
    predictions = tf.squeeze(predictions, 0)

    predictions = predictions / temperature
    # Sample for every time step but keep only the last one's draw
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # Feed only the new character back in; the stateful layers carry the context
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [None]:
# Generate sample text from the seed "THE ONE WHERE".
print(generate_text(model, start_string=u"THE ONE WHERE"))

THE ONE WHEREERME! Shes kits so sure she's need to live to hige   stuck into a trate me first farch. That was that you're like some chance you been so loving about (Monicas.) Oh my 'Greets Or a boug'd work major!
Phoebe: Cith Tell a- ERAUA
The Woman: I'm so farm!
Monica: I don't want it. Fat called cancon. . hever still leary qurets I want all veay?
Ross: (he did. Any choy elder: Ya... AnyI matter in af or nothing kitchen. (Gassls) This is so you didn'tlica: Shot in London hard a lot Stul!
Monica: Yknoffic, I actually, what if you have?
Monica: (shocked) Really! Wow! You guys! What walk! (Starts to remember any win! Chandler thats Joey owh. It feel with propose.
Chandler easixed to trah   to that.
(Monica shocked behinds her eye) Phoebe and Rachel and Rachel are from the guy, couldn't bet our Cluck Ross when he tries to cover down.]
Chandler! I've tas Rachel.
Ross: Guit news!
Phoebe: Hey.
Joey: Joey, is that s op our hungry and now putches and the right? ( on a kind of this

In [None]:
import os, tarfile
 
import os
from google.colab import files

def make_targz_one_by_one(output_filename, source_dir):
  """Pack every file under `source_dir` into a tar archive and download it.

  Walks `source_dir` recursively, adds each file to a new archive written to
  `output_filename`, then hands the archive to Colab's `files.download`.

  Note: the archive is opened with mode "w", so it is an uncompressed tar
  despite the "targz" in the function name.

  Args:
    output_filename: Path of the archive to create.
    source_dir: Directory whose files are added to the archive.
  """
  # The context manager closes the archive even if adding a file fails.
  with tarfile.open(output_filename, "w") as tar:
    for root, _dirs, filenames in os.walk(source_dir):
      for filename in filenames:
        tar.add(os.path.join(root, filename))

  files.download(output_filename)
 
 
# Bundle the checkpoint directory into a single archive and download it.
make_targz_one_by_one('checkpoint(22-35)_tar', '/content/training_checkpoints')



In [None]:
# Mount Google Drive at /content/drive so the model can be saved there
from google.colab import drive
drive.mount('/content/drive')