In [0]:
from __future__ import absolute_import, division, print_function

import tensorflow as tf
tf.enable_eager_execution()

import numpy as np
import os
import time
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
tweets = pd.read_csv('/content/gdrive/My Drive/for_tensorflow.csv')

In [4]:
tweets.head()

Unnamed: 0,clean
0,rais hand question jr ranger club member get e...
1,better way kick vacat settl watch win move 2nd...
2,proud announc today multi-year panership globe...
3,live radio thur may 30th today 4pm pt/7pm et a...
4,septemb retir nolan ryan ballpark arlington fi...


In [0]:
tweets.dropna(inplace=True)
tweets = tweets.reindex()

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_features = 3000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(tweets.clean)


Using TensorFlow backend.


In [0]:
X = tokenizer.texts_to_sequences(tweets.clean.astype(str))
X = pad_sequences(X)

In [0]:
word_dataset = tf.data.Dataset.from_tensor_slices(X)

In [11]:
word_dataset

<DatasetV1Adapter shapes: (18,), types: tf.int32>

In [0]:
vocab_size = len(tokenizer.word_index) + 1

In [0]:
seq_length = 128
sequences = word_dataset.batch(seq_length+1, drop_remainder=True)

In [0]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

In [0]:
# Batch size 
BATCH_SIZE = 128
examples_per_epoch = len(tweets)//seq_length
steps_per_epoch = 300

# # Buffer size to shuffle the dataset
# # (TF data is designed to work with possibly infinite sequences, 
# # so it doesn't attempt to shuffle the entire sequence in memory. Instead, 
# # it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE)

# dataset

In [16]:
dataset

<DatasetV1Adapter shapes: ((128, 18), (128, 18)), types: (tf.int32, tf.int32)>

In [0]:
n_cols = X.shape[1]


# The embedding dimension 
embedding_dim = 500

# Number of RNN units
rnn_units = 3072


In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    print (n_cols)
    model = tf.keras.Sequential([
     
    tf.keras.layers.Embedding(vocab_size, embedding_dim, 
                              batch_input_shape=[batch_size, None]),
    tf.keras.layers.LSTM(rnn_units,
        return_sequences=True, 
        recurrent_initializer='glorot_uniform',
        stateful=True),
    tf.keras.layers.LSTM(rnn_units,
        return_sequences=True, 
        recurrent_initializer='glorot_uniform',
        stateful=True),
    tf.keras.layers.LSTM(rnn_units,
        return_sequences=True, 
        recurrent_initializer='glorot_uniform',
        stateful=True),
    tf.keras.layers.Dense(vocab_size)
    ])
    return model

In [18]:
if tf.test.is_gpu_available():
    rnn = tf.keras.layers.CuDNNGRU
    print('GPU Running')
else:
    import functools
    rnn = functools.partial(
    tf.keras.layers.GRU, recurrent_activation='sigmoid')
    
model = build_model(
  vocab_size = vocab_size,
  embedding_dim=embedding_dim, 
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

GPU Running
18


In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 500)          912000    
_________________________________________________________________
cu_dnngru (CuDNNGRU)         (128, None, 3072)         32937984  
_________________________________________________________________
cu_dnngru_1 (CuDNNGRU)       (128, None, 3072)         56641536  
_________________________________________________________________
cu_dnngru_2 (CuDNNGRU)       (128, None, 3072)         56641536  
_________________________________________________________________
dense (Dense)                (128, None, 1824)         5605152   
Total params: 152,738,208
Trainable params: 152,738,208
Non-trainable params: 0
_________________________________________________________________


In [0]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)



In [0]:
model.compile(
    optimizer = tf.train.AdamOptimizer(),
    loss = loss)

In [0]:
# Directory where the checkpoints will be saved
checkpoint_dir = '/content/gdrive/My Drive/training_checkpoints/complex'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "weights.{epoch:02d}-{loss:.2f}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    monitor='loss',
    save_weights_only=True,
    save_best_only=True)

In [0]:
EPOCHS=200

In [25]:
dataset

<DatasetV1Adapter shapes: ((128, 18), (128, 18)), types: (tf.int32, tf.int32)>

In [26]:
history = model.fit(dataset.repeat(), epochs=EPOCHS, steps_per_epoch=50, callbacks=[checkpoint_callback])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [27]:

tf.train.latest_checkpoint(checkpoint_dir)

'/content/gdrive/My Drive/training_checkpoints/complex/weights.200-0.86'

In [22]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights('/content/gdrive/My Drive/training_checkpoints/complex/weights.200-0.86')#tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))

18


In [23]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 500)            912000    
_________________________________________________________________
cu_dnngru_3 (CuDNNGRU)       (1, None, 3072)           32937984  
_________________________________________________________________
cu_dnngru_4 (CuDNNGRU)       (1, None, 3072)           56641536  
_________________________________________________________________
cu_dnngru_5 (CuDNNGRU)       (1, None, 3072)           56641536  
_________________________________________________________________
dense_1 (Dense)              (1, None, 1824)           5605152   
Total params: 152,738,208
Trainable params: 152,738,208
Non-trainable params: 0
_________________________________________________________________


In [0]:
def find_word(idx):
  return list(tokenizer.word_index.keys())[list(tokenizer.word_index.values()).index(idx)]

def generate_text(model, start_string):
    # Evaluation step (generating text using the learned model)

    # Number of characters to generate
    num_generate = 10

    # Converting our start string to numbers (vectorizing) 
    input_eval = [tokenizer.word_index[s] for s in start_string.split()]
#     input_eval = tf.expand_dims(input_eval, 0)
    input_eval = tf.expand_dims(input_eval, 0)

    # Empty string to store our results
    text_generated = []

    # Low temperatures results in more predictable text.
    # Higher temperatures results ifst tsnn more surprising text.
    # Experiment to find the best setting.
    temperature = 2
   

    # Here batch size == 1
    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)

        # using a multinomial distribution to predict the word returned by the model
        predictions = predictions / temperature
        predicted_id = tf.multinomial(logits=predictions[0], num_samples=1)[-1,0].numpy()


        # We pass the predicted word as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id+1], 0)

        text_generated.append(find_word(predicted_id+1))

    return (start_string + ' ' + ' '.join(text_generated))



In [41]:
for i in range(10):
  print(generate_text(model, start_string=u"josh hamilton"))

josh hamilton supervis far go late make longestlistofnighttimewinnersinnohtexa op born cocain allstar
josh hamilton late best otd anaheim best gl late go wear embarrassingli
josh hamilton among star longestlistofnighttimewinnersinnohtexa 31 otd paid tog pitch late even
josh hamilton unfoun game pitch continu realli bobbi 5 late make pitch
josh hamilton safe home lot basketbal basebal loss pitch make terribl r
josh hamilton nice far houston offenc much woodi allstar bas youth royal
josh hamilton among anaheim ray exclus super park understand height definit lead
josh hamilton among entir best level pitch akin night someth bet take
josh hamilton among best aug late make best even home yr pitch
josh hamilton helen two see pla touch want houston who competit allstar


In [53]:
%ls

[0m[01;34mgdrive[0m/  [01;34msample_data[0m/


In [57]:
%cd gdrive/My\ Drive/training_checkpoints/complex

/content/gdrive/My Drive/training_checkpoints/complex


In [58]:
%ls

checkpoint                            weights.27-4.53.data-00001-of-00002
weights.01-10.20.data-00000-of-00002  weights.27-4.53.index
weights.01-10.20.data-00001-of-00002  weights.32-4.52.data-00000-of-00002
weights.01-10.20.index                weights.32-4.52.data-00001-of-00002
weights.02-5.18.data-00000-of-00002   weights.32-4.52.index
weights.02-5.18.data-00001-of-00002   weights.34-4.51.data-00000-of-00002
weights.02-5.18.index                 weights.34-4.51.data-00001-of-00002
weights.03-5.07.data-00000-of-00002   weights.34-4.51.index
weights.03-5.07.data-00001-of-00002   weights.35-4.50.data-00000-of-00002
weights.03-5.07.index                 weights.35-4.50.data-00001-of-00002
weights.04-5.02.data-00000-of-00002   weights.35-4.50.index
weights.04-5.02.data-00001-of-00002   weights.36-4.49.data-00000-of-00002
weights.04-5.02.index                 weights.36-4.49.data-00001-of-00002
weights.05-4.96.data-00000-of-00002   weights.36-4.49.index
weights.05-4.96.data-00001-of-0000

In [0]:
%rm -rf *
