[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sascha-senko/TensorflowCourse/blob/main/ANNwTFHW10.ipynb)

# General stuff

In [13]:
import matplotlib.pyplot as plt
import numpy as np
import sys
import random
%load_ext tensorboard
%tensorflow_version 2.x
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, Flatten, Dense, Conv2DTranspose, \
 Reshape, MaxPooling2D, Dropout, BatchNormalization, UpSampling2D, ReLU, \
 ELU, Layer, Embedding
from tensorflow import debugging as debug
import tensorflow_probability as tfp
from functools import partial
import nltk
nltk.download("punkt")
import re
from collections import Counter

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Clear any logs from previous runs
%rm -rf ./logs/

## Helper functions

In [3]:
def data_pipeline(data):
    """ helper function for data pipeline - does all the things we need """

    data = data.shuffle(buffer_size=SHUFFLE_SIZE)
    data = data.batch(BATCH_SIZE)
    data = data.prefetch(PREFETCH_SIZE)
    return data

@tf.function
def train(model, optimizer, loss_tracker, accuracy_tracker, train_data, num_epoch):
    for epoch in tf.range(num_epochs):
        tf.print('Epoch: ' + str(epoch+1))
        # reset statistics
        loss_tracker.reset_states()
        accuracy_tracker.reset_states()

        for inputs, labels in train_data:
            
            loss, accuracy = skip_gram_train_step(model, inputs, labels, optimizer)

            loss_tracker.update_state(loss)
            accuracy_tracker.update_state(accuracy)

        # Write statistics into summary
        with train_writer.as_default():
            tf.summary.scalar('loss', loss_tracker.result(), step=epoch)
            tf.summary.scalar('accuracy', accuracy_tracker.result(), step=epoch)    

## Load dataset

In [4]:
whole_text = tfds.load('tiny_shakespeare', split='train')
for total_text in whole_text:
    whole_text = str(total_text['text'].numpy())

[1mDownloading and preparing dataset tiny_shakespeare/1.0.0 (download: Unknown size, generated: 1.06 MiB, total: 1.06 MiB) to /root/tensorflow_datasets/tiny_shakespeare/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/tiny_shakespeare/1.0.0.incomplete0IZMPZ/tiny_shakespeare-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/tiny_shakespeare/1.0.0.incomplete0IZMPZ/tiny_shakespeare-validation.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/tiny_shakespeare/1.0.0.incomplete0IZMPZ/tiny_shakespeare-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

[1mDataset tiny_shakespeare downloaded and prepared to /root/tensorflow_datasets/tiny_shakespeare/1.0.0. Subsequent calls will reuse this data.[0m


# SkipGram

## Some constants

In [5]:
CONTEXT_WINDOW_HALF = 2
# we only allow CONTEXT_WINDOW to be an even number
CONTEXT_WINDOW = 2 * CONTEXT_WINDOW_HALF
SUB_SAMPLE = 0.001
EMBED_SIZE = 64
VOCAB_SIZE = 10000
SHUFFLE_SIZE = VOCAB_SIZE
PREFETCH_SIZE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 20
NO_SPECIAL = False
UNKNOWN_WORD = "UNKNOWN"

## Deal with words

In [6]:
# lower case everything
x_train = whole_text.lower()
# Get rid of all new lines
# TODO: make sure that this works correctly. Observed some pretty weird behavior
x_train = x_train.split(r"\\n")
words = []
# tokenize
for line in x_train:
    temp = []
    tokens = nltk.word_tokenize(line)
    # if True, get rid of all special characters
    if NO_SPECIAL:
        for token in tokens:
            if re.search('\w+', token):
                temp.append(token)
    else:
        temp = tokens            
    words.extend(temp)

x_train = words
words = Counter(words).most_common(VOCAB_SIZE)
id_to_word = [word for word, _ in words]
# garbage collect this
del words

word_to_id = {}
for id, word in enumerate(id_to_word):
    word_to_id[word] = id

# unknown words get id len(id_to_word)
id_to_word.append(UNKNOWN_WORD)

# apply word_to_id map on x_train
x_train = [word_to_id.get(x, VOCAB_SIZE) for x in x_train]

## Finish up dataset

In [7]:
y_train = []
# Get all targets
for i, input in enumerate(x_train[CONTEXT_WINDOW_HALF:-CONTEXT_WINDOW_HALF]):
    # we leave out the first and last CONTEXT_WINDOW_HALF words out
    i = i + CONTEXT_WINDOW_HALF
    # append context words as labels
    for j in range(CONTEXT_WINDOW_HALF):
        y_train.append(x_train[i-1-j])
        y_train.append(x_train[i+1+j])

assert len(y_train) == (len(x_train) - CONTEXT_WINDOW) * CONTEXT_WINDOW, \
"is: " + str(len(input_target_pairs)) + " must: " + str((len(x_train) - CONTEXT_WINDOW) * CONTEXT_WINDOW)

train_data = tf.data.Dataset.from_tensor_slices((np.repeat(x_train[CONTEXT_WINDOW_HALF:-CONTEXT_WINDOW_HALF], CONTEXT_WINDOW), y_train))

train_data = data_pipeline(train_data)

In [8]:
# # Check if correct - check only works if we leave out data_pipeline
# for i, (input, target) in enumerate(train_data):
#     print(input)
#     print(target)
#     if i == 3:
#         break

# print(x_train[:5])        

## Define SkipGram 

In [9]:
class SkipGram(Layer):
    # TODO: args need to be adjusted
    def __init__(self, vocab_size, embed_size):
        super(SkipGram, self).__init__()
        # TODO:  In the init function, where you normally define the layers, 
        # you can initialize the vocabulary and embedding size 
        # and use these in the build function to create weight matrices of the correct shape.
        
    @tf.function
    def call(self, context, x):
        # TODO:
        # get the embeddings using tf.nn.embedding lookup(). Instead of 
        # calculating the scores, we will directly calculate and 
        # return the loss using tf.nn.nce loss. Note that you do not need to 
        # compute the scores in the call function. The loss function 
        # does that with the weights and biases and returns the nce loss.

SyntaxError: ignored

## Define some constants

In [None]:
NUM_EPOCHS = 10
LEARNING_RATE = 10
OPTIMIZER = tf.keras.optimizers.Adam(LEARNING_RATE)
EMBED_TRACKER = tf.TensorArray(size=NUM_EPOCHS, dynamic_size=False)

import datetime

current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
skip_gram_log_dir = 'logs/gradient_tape/' + current_time + '/skip_gram'
train_writer = tf.summary.create_file_writer(skip_gram_log_dir)

## SkipGram train step

In [None]:
@tf.function
def skip_gram_train_step(model, inputs, labels, optimizer):
    # loss_object and optimizer_object are instances of respective tensorflow classes
    with tf.GradientTape() as tape:
        # get one sample
        label = tf.random.uniform(1, maxval=CONTEXT_WINDOW, dtype=tf.dtypes.int16)
        loss = model(inputs, label)
        # TODO:
        # Using this loss function
        # you need to average over the batch manually
        debug.assert_shapes([(loss, ())])
        gradients = tape.gradient(loss, model.trainable_variables)

    # update weights  
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # record embedding
    # TODO: use EMBED_TRACKER to track embedding matrix

    # dummy value for accuracy
    return loss, 0

## Train SkipGram

In [None]:
# remove all active models for memory purposes
tf.keras.backend.clear_session()

# TODO: initialize our model
model = LSTM(HIDDEN_SIZE)

loss_tracker = tf.keras.metrics.Mean()
accuracy_tracker = tf.keras.metrics.Mean()

train(model, optimizer, loss_tracker, accuracy_tracker, train_data, NUM_EPOCH):

## Inspect embedding

In [None]:
# TODO: use EMBED_TRACKER to print out neighbors of interest words
# TODO: release memory of our tensorarray

# Shakespeare

## Some constants

In [None]:
SEQ_LENGTH = 20
SHUFFLE_SIZE = VOCAB_SIZE
PREFETCH_SIZE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 20

## Preprocess Dataset

In [None]:
# lower case everything
x_train = whole_text.lower()
x_train = x_train
# Replace all new lines with white space
x_train = re.sub(r"\\n", " ", x_train) 

id_to_word = list(set(x_train))

word_to_id = {}
for i, word in enumerate(id_to_word):
    word_to_id[word] = i

# Map the characters to numbers
x_train = [word_to_id[x] for x in x_train]

# we will need this for target
last_num = x_train[-1]

# Create input subsequences
x_train = [x_train[i:i+SEQ_LENGTH] for i in range(len(x_train)-SEQ_LENGTH-1)]

# Generate labels
y_train = [x_train[i][-1] for i in range(1, len(x_train))] 

# add label for last subsequence
y_train.append(last_num)

train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))

train_data = data_pipeline(train_data)

## Text Generation Model

In [21]:
class Shakespeare_RNN(Model):
  def __init__(self, vocab_size, embed_size=10, num_cells=4, hidden_size=256, return_sequences=False):
    super(Shakespeare_RNN, self).__init__()
    
    self.embedding = Embedding(vocab_size, embed_size)
    self.rnn = tf.keras.layers.RNN([tf.keras.layers.SimpleRNNCell(hidden_size) for _ in range(num_cells)], return_sequences)
    self.readout_layer = tf.keras.layers.Dense(units=vocab_size, activation='softmax')
        
    
  def call(self, x):
    x = self.embedding(x)
    # TODO: remove
    print(x.shape)
    x = self.rnn(x)
    print(x.shape)
    x = self.readout_layer(x)
    return x

In [22]:
model = Shakespeare_RNN(len(id_to_word))
for inputs, labels in train_data:
    print(inputs.shape)
    model(inputs)
    break

(20,)
(20, 10)


ValueError: ignored

# Run TensorBoard

In [None]:
# Open tensorboard
%tensorboard --logdir logs/gradient_tape