In [0]:
from __future__ import absolute_import, division, print_function

import tensorflow as tf
tf.enable_eager_execution()

import numpy as np
import os
import time
import pandas as pd

In [46]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [47]:
# Read, then decode for py2 compat.
text = open('/content/gdrive/My Drive/for_tensorflow.csv', 'rb').read().decode(encoding='utf-8')
# length of text is the number of characters in it
print ('Length of text: {} characters'.format(len(text)))

Length of text: 32938 characters


In [48]:
# Take a look at the first 250 characters in text
print(text[:250])

clean
rais hand question jr ranger club member get exclus q/a hunter satu
better way kick vacat settl watch win move 2nd wildcard spot
proud announc today multi-year panership globe life field offici exclus
live radio thur may 30th today 4pm pt/7pm e


In [49]:
sorted(set(text.split()))

["''",
 '+115',
 '+148',
 '1-0',
 '1/2',
 '10-0',
 '10-2',
 '10-day',
 '100th',
 '1020am/1120am/1220pm/120pm/220pm',
 '1053thefandfw',
 '10k+',
 '10th',
 '11-0',
 '11-game',
 '11th',
 '11u',
 '12-4',
 '12th',
 '14-0',
 '14-yr',
 '15-5',
 '17-0',
 '17-yr',
 '18-9',
 '1st',
 '2-0',
 '21st',
 '24-23',
 '276/349/449',
 '28th',
 '2nd',
 '3-1',
 '3-2',
 '3-5',
 '30th',
 '31st',
 '38th',
 '3rd',
 '4-2',
 '4-run',
 '4-year-old',
 '40th',
 '44-2',
 '45th',
 '47k',
 '4pm',
 '4th',
 '5-1',
 '5-27',
 '5-30',
 '5/31',
 '500kva',
 '56min',
 '5th',
 '5x',
 '6th',
 '6xall',
 '7-1',
 '7-year',
 '730pm',
 '7th',
 '8th',
 '9th',
 '``',
 'a-rod',
 'aaron',
 'ab',
 'abl',
 'absolut',
 'academi',
 'account',
 'ace',
 'across',
 'act',
 'acti',
 'activ',
 'actual',
 'ad',
 'adam',
 'add',
 'adjust',
 'adopt',
 'adrian',
 'advantag',
 'adventuretravel',
 'advic',
 'afford',
 'afternoon',
 'age',
 'ago',
 'agre',
 'ahead',
 'aicl',
 'aid',
 'air',
 'aisl',
 'akin',
 'al',
 'alarm',
 'albeo',
 'alcohol',
 'aled

In [50]:
# The unique characters in the file
vocab = sorted(set(text.split()))
print ('{} unique words'.format(len(vocab)))

1824 unique words


In [0]:
# Creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in text.split()])

In [52]:
print('{')
for char,_ in zip(char2idx, range(20)):
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  "''":   0,
  '+115':   1,
  '+148':   2,
  '1-0':   3,
  '1/2':   4,
  '10-0':   5,
  '10-2':   6,
  '10-day':   7,
  '100th':   8,
  '1020am/1120am/1220pm/120pm/220pm':   9,
  '1053thefandfw':  10,
  '10k+':  11,
  '10th':  12,
  '11-0':  13,
  '11-game':  14,
  '11th':  15,
  '11u':  16,
  '12-4':  17,
  '12th':  18,
  '14-0':  19,
  ...
}


In [53]:
# Show how the first 13 characters from the text are mapped to integers
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))

'clean\nrais ha' ---- characters mapped to int ---- > [ 342 1268  737 1260  886 1272  348 1024  666  551 1255  812 1366]


In [54]:
# The maximum length sentence we want for a single input in characters
seq_length = 10
BATCH_SIZE = 64
examples_per_epoch = len(text)//seq_length
steps_per_epoch = examples_per_epoch//BATCH_SIZE

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

for i in char_dataset.take(5):
  print(idx2char[i.numpy()])

clean
rais
hand
question
jr


In [55]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

for item in sequences.take(5):
  print(repr(' '.join(idx2char[item.numpy()])))

'clean rais hand question jr ranger club member get exclus q/a'
'hunter satu better way kick vacat settl watch win move 2nd'
'wildcard spot proud announc today multi-year panership globe life field offici'
'exclus live radio thur may 30th today 4pm pt/7pm et amp'
'talk mlb amp surg septemb retir nolan ryan ballpark arlington first'


In [0]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

In [57]:
for input_example, target_example in  dataset.take(1):
    print ('Input data: ', repr(' '.join(idx2char[input_example.numpy()])))
    print ('Target data:', repr(' '.join(idx2char[target_example.numpy()])))

Input data:  'clean rais hand question jr ranger club member get exclus'
Target data: 'rais hand question jr ranger club member get exclus q/a'


In [58]:
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 342 ('clean')
  expected output: 1268 ('rais')
Step    1
  input: 1268 ('rais')
  expected output: 737 ('hand')
Step    2
  input: 737 ('hand')
  expected output: 1260 ('question')
Step    3
  input: 1260 ('question')
  expected output: 886 ('jr')
Step    4
  input: 886 ('jr')
  expected output: 1272 ('ranger')


In [59]:
# Batch size 
BATCH_SIZE = 64
steps_per_epoch = examples_per_epoch//BATCH_SIZE

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences, 
# so it doesn't attempt to shuffle the entire sequence in memory. Instead, 
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<DatasetV1Adapter shapes: ((64, 10), (64, 10)), types: (tf.int64, tf.int64)>

In [0]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension 
embedding_dim = 50

# Number of RNN units
rnn_units = 1024


In [0]:
if tf.test.is_gpu_available():
    rnn = tf.keras.layers.CuDNNGRU
else:
    import functools
    rnn = functools.partial(
    tf.keras.layers.GRU, recurrent_activation='sigmoid')

In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
     
    tf.keras.layers.Embedding(vocab_size, embedding_dim, 
                              batch_input_shape=[batch_size, None]),
    rnn(rnn_units,
        return_sequences=True, 
        recurrent_initializer='glorot_uniform',
        stateful=True),
    rnn(rnn_units,
        return_sequences=True, 
        recurrent_initializer='glorot_uniform',
        stateful=True),
    rnn(rnn_units,
        return_sequences=True, 
        recurrent_initializer='glorot_uniform',
        stateful=True),
    tf.keras.layers.Dense(vocab_size)
    ])
    return model

In [0]:
model = build_model(
  vocab_size = len(vocab), 
  embedding_dim=embedding_dim, 
  rnn_units=rnn_units, 
  batch_size=BATCH_SIZE)

In [64]:
for input_example_batch, target_example_batch in dataset.take(1): 
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 10, 1824) # (batch_size, sequence_length, vocab_size)


In [65]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (64, None, 50)            91200     
_________________________________________________________________
gru_9 (GRU)                  (64, None, 1024)          3302400   
_________________________________________________________________
gru_10 (GRU)                 (64, None, 1024)          6294528   
_________________________________________________________________
gru_11 (GRU)                 (64, None, 1024)          6294528   
_________________________________________________________________
dense_3 (Dense)              (64, None, 1824)          1869600   
Total params: 17,852,256
Trainable params: 17,852,256
Non-trainable params: 0
_________________________________________________________________


In [0]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)


In [0]:
model.compile(
    optimizer = tf.train.AdamOptimizer(),
    loss = loss)

In [0]:
# Directory where the checkpoints will be saved
checkpoint_dir = '/content/gdrive/My Drive/training_checkpoints/simple'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "weights.{epoch:02d}-{loss:.2f}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    monitor='loss',
    save_weights_only=True,
    save_best_only=True)

In [0]:
EPOCHS=100

In [0]:
history = model.fit(dataset.repeat(), epochs=EPOCHS, steps_per_epoch=steps_per_epoch, callbacks=[checkpoint_callback])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
 3/51 [>.............................] - ETA: 3:42 - loss: 0.0967

In [0]:
tf.train.latest_checkpoint(checkpoint_dir)

'/content/gdrive/My Drive/training_checkpoints/simple/weights.15-0.15'

In [0]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))

In [0]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (1, None, 50)             91200     
_________________________________________________________________
gru_18 (GRU)                 (1, None, 1024)           3302400   
_________________________________________________________________
gru_19 (GRU)                 (1, None, 1024)           6294528   
_________________________________________________________________
gru_20 (GRU)                 (1, None, 1024)           6294528   
_________________________________________________________________
dense_6 (Dense)              (1, None, 1824)           1869600   
Total params: 17,852,256
Trainable params: 17,852,256
Non-trainable params: 0
_________________________________________________________________


In [0]:
def generate_text(model, start_string):
    # Evaluation step (generating text using the learned model)

    # Number of characters to generate
    num_generate = 10

    # Converting ur start string to numbers (vectorizing) 
    input_eval = [char2idx[s] for s in start_string.split()]
    input_eval = tf.expand_dims(input_eval, 0)

    # Empty string to store our results
    text_generated = []

    # Low temperatures results in more predictable text.
    # Higher temperatures results in more surprising text.
    # Experiment to find the best setting.
    temperature = 1

    # Here batch size == 1
    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # using a multinomial distribution to predict the word returned by the model
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # We pass the predicted word as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ' ' + ' '.join(text_generated))



In [0]:
for i in range(10):
  print(generate_text(model, start_string=u"hunter"))

hunter histori amp fan like sum perfectli great sposwrit rip gerri
hunter tour sunday stop st loui houston dalla see play time
hunter histori amp fan like sum perfectli great sposwrit rip gerri
hunter breakthrough group front offic turn proud franchis joke good morn
hunter histori amp fan like sum perfectli great sposwrit rip gerri
hunter choic tix dont know run twitter account like make day
hunter histori great amp play ranger ray tie bottom 2nd adrian
hunter penc 30th unt night globe life park first ticket purchas
hunter histori amp fan like sum perfectli great sposwrit rip couldnt
hunter histori amp fan like sum perfectli great sposwrit rip gerri


In [0]:
%ls


[0m[01;34mgdrive[0m/  [01;34msample_data[0m/


In [0]:
%cd training_checkpoints/simple/

/content/gdrive/My Drive/training_checkpoints/simple


In [0]:
%rm -rf *
