[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sascha-senko/TensorflowCourse/blob/main/ANNwTFHW10.ipynb)

# General stuff

In [97]:
import matplotlib.pyplot as plt
import numpy as np
import sys
import random
%load_ext tensorboard
%tensorflow_version 2.x
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, Flatten, Dense, Conv2DTranspose, \
 Reshape, MaxPooling2D, Dropout, BatchNormalization, UpSampling2D, ReLU, \
 ELU, Layer, Embedding
from tensorflow import debugging as debug
import tensorflow_probability as tfp
from functools import partial
import nltk
nltk.download("punkt")
import re
from collections import Counter
from scipy.spatial.distance import cosine

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [98]:
# Clear any logs from previous runs
%rm -rf ./logs/

# if true, run with @tf.function. 
TF_FUNCTION = True

## Helper functions

In [99]:
def data_pipeline(data):
    """ helper function for data pipeline - does all the things we need """

    data = data.shuffle(buffer_size=SHUFFLE_SIZE)
    data = data.batch(BATCH_SIZE)
    data = data.prefetch(PREFETCH_SIZE)
    if DATA_SIZE:
        data = data.take(DATA_SIZE)
    return data

def train(model, optimizer, loss_tracker, accuracy_tracker, train_data, num_epochs, train_func):
    for epoch in tf.range(num_epochs):
        tf.print('Epoch: ' + str(epoch+1))
        # reset statistics
        loss_tracker.reset_states()
        accuracy_tracker.reset_states()

        for inputs, labels in train_data:
            
            loss, accuracy = train_func(model, inputs, labels, optimizer, epoch)

            loss_tracker.update_state(loss)
            accuracy_tracker.update_state(accuracy)

        # Write statistics into summary
        with train_writer.as_default():
            tf.summary.scalar('loss', loss_tracker.result(), step=tf.cast(epoch, tf.int64))
            tf.summary.scalar('accuracy', accuracy_tracker.result(), step=tf.cast(epoch, tf.int64))    

if TF_FUNCTION:
    train = tf.function(train)            

## Load dataset

In [100]:
whole_text = tfds.load('tiny_shakespeare', split='train')
for total_text in whole_text:
    whole_text = str(total_text['text'].numpy())

# SkipGram

## Some constants

In [101]:
CONTEXT_WINDOW_HALF = 2
# we only allow CONTEXT_WINDOW to be an even number
CONTEXT_WINDOW = 2 * CONTEXT_WINDOW_HALF
SUB_SAMPLE = 0.001
EMBED_SIZE = 64
VOCAB_SIZE = 10000
NUM_SAMPLED = 20
SHUFFLE_SIZE = 40000
# set to 0 if we are not debugging
DATA_SIZE = 2000
PREFETCH_SIZE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 30
NO_SPECIAL = True
UNKNOWN_WORD = "UNKNOWN"

## Deal with words

In [102]:
# lower case everything
x_train = whole_text.lower()
# Get rid of all new lines
# TODO: make sure that this works correctly. Observed some pretty weird behavior
x_train = x_train.split(r"\\n")
words = []
# tokenize
for line in x_train:
    temp = []
    tokens = nltk.word_tokenize(line)
    # if True, get rid of all special characters
    if NO_SPECIAL:
        for token in tokens:
            if re.search('\w+', token):
                temp.append(token)
    else:
        temp = tokens            
    words.extend(temp)

x_train = words
words = Counter(words).most_common(VOCAB_SIZE)
id_to_word = [word for word, _ in words]
# garbage collect this
del words

word_to_id = {}
for id, word in enumerate(id_to_word):
    word_to_id[word] = id

# unknown words get id len(id_to_word)
id_to_word.append(UNKNOWN_WORD)

# apply word_to_id map on x_train
x_train = [word_to_id.get(x, VOCAB_SIZE) for x in x_train]

## Finish up dataset

In [103]:
y_train = []
# Get all targets
for i, input in enumerate(x_train[CONTEXT_WINDOW_HALF:-CONTEXT_WINDOW_HALF]):
    # we leave out the first and last CONTEXT_WINDOW_HALF words out
    i = i + CONTEXT_WINDOW_HALF
    # append context words as labels
    for j in range(CONTEXT_WINDOW_HALF):
        y_train.append(x_train[i-1-j])
        y_train.append(x_train[i+1+j])

assert len(y_train) == (len(x_train) - CONTEXT_WINDOW) * CONTEXT_WINDOW, \
"is: " + str(len(input_target_pairs)) + " must: " + str((len(x_train) - CONTEXT_WINDOW) * CONTEXT_WINDOW)

train_data = tf.data.Dataset.from_tensor_slices((np.repeat(x_train[CONTEXT_WINDOW_HALF:-CONTEXT_WINDOW_HALF], CONTEXT_WINDOW), y_train))

train_data = data_pipeline(train_data)

In [104]:
# # Check if correct - check only works if we leave out data_pipeline
# for i, (input, target) in enumerate(train_data):
#     print(input)
#     print(target)
#     if i == 3:
#         break

# print(x_train[:5])        

## Define SkipGram 

In [105]:
class SkipGram(Layer):
    def __init__(self, vocab_size, embed_size, num_sampled):
        super(SkipGram, self).__init__()
        self.embed = Embedding(vocab_size, embed_size)
        self.score_matrix = tf.Variable(tf.random.normal([vocab_size, embed_size]))
        self.score_biases = tf.Variable(tf.zeros([vocab_size]))
        self.num_sampled = num_sampled
        self.vocab_size = vocab_size
        if TF_FUNCTION:
            self.call = tf.function(self.call)

    def call(self, context, x):
        x_embed = self.embed(x)
        context = tf.expand_dims(context, -1)
        # print(x_embed.shape)
        # print(context.shape)
        debug.assert_shapes([
                             (x_embed, (30,64)),
                             (context, (30,1))])
        loss = tf.nn.nce_loss(self.score_matrix, self.score_biases, context, 
                                  x_embed, self.num_sampled, self.vocab_size) 
        return loss

In [106]:
# model = SkipGram(VOCAB_SIZE+1, 20, 4)
# for input, label in train_data:
#     model(label, input)
#     break
# print(model.embed.trainable_variables[0].dtype) 

## Define some constants

In [107]:
NUM_EPOCHS = 10
LEARNING_RATE = 10
OPTIMIZER = tf.keras.optimizers.Adam(LEARNING_RATE)
EMBED_TRACKER = tf.TensorArray(tf.float32, size=NUM_EPOCHS, dynamic_size=False)

import datetime

current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
skip_gram_log_dir = 'logs/gradient_tape/' + current_time + '/skip_gram'
train_writer = tf.summary.create_file_writer(skip_gram_log_dir)

Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x7f1bee212278>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2902, in run_code
    return outflag  File "<ipython-input-96-4665bb4c2271>", line 9, in <module>
    train(model, OPTIMIZER, loss_tracker, accuracy_tracker, train_data, NUM_EPOCHS, skip_gram_train_step)  File "<ipython-input-86-a55aea1103d1>", line 20, in train
    loss, accuracy = train_func(model, inputs, labels, optimizer, epoch)  File "<ipython-input-95-cdfaf3ebf43e>", line 22, in skip_gram_train_step
    return loss, 0.0  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/tf_should_use.py", line 249, in wrapped
    error_in_function=error_in_function)


Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x7f1bee212278>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2902, in run_code
    return outflag  File "<ipython-input-96-4665bb4c2271>", line 9, in <module>
    train(model, OPTIMIZER, loss_tracker, accuracy_tracker, train_data, NUM_EPOCHS, skip_gram_train_step)  File "<ipython-input-86-a55aea1103d1>", line 20, in train
    loss, accuracy = train_func(model, inputs, labels, optimizer, epoch)  File "<ipython-input-95-cdfaf3ebf43e>", line 22, in skip_gram_train_step
    return loss, 0.0  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/tf_should_use.py", line 249, in wrapped
    error_in_function=error_in_function)


## SkipGram train step

In [108]:
def skip_gram_train_step(model, inputs, labels, optimizer, epoch):
    global EMBED_TRACKER
    # loss_object and optimizer_object are instances of respective tensorflow classes
    with tf.GradientTape() as tape:
        loss = model(inputs, label)
        # average over the batch manually
        loss = tf.math.reduce_mean(loss)
        debug.assert_shapes([(loss, ())])
        gradients = tape.gradient(loss, model.trainable_variables)

    # update weights  
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # record embedding
    # TODO: use EMBED_TRACKER to track embedding matrix
    embed_matrix = model.embed.trainable_variables[0]
    # We want a frozen version of the embedding matrix
    embed_matrix = tf.identity(embed_matrix)
    EMBED_TRACKER = EMBED_TRACKER.write(0, embed_matrix)

    # dummy value for accuracy
    return loss, 0.0

if TF_FUNCTION:
    skip_gram_train_step = tf.function(skip_gram_train_step)    

## Train SkipGram

In [None]:
# remove all active models for memory purposes
tf.keras.backend.clear_session()

model = SkipGram(VOCAB_SIZE+1, EMBED_SIZE, NUM_SAMPLED)

loss_tracker = tf.keras.metrics.Mean()
accuracy_tracker = tf.keras.metrics.Mean()

train(model, OPTIMIZER, loss_tracker, accuracy_tracker, train_data, NUM_EPOCHS, skip_gram_train_step)

Epoch: Tensor("while/add:0", shape=(), dtype=int32)
Epoch: Tensor("while/add:0", shape=(), dtype=int32)
Epoch: Tensor("while/add:0", shape=(), dtype=int32)
Epoch: Tensor("while/add:0", shape=(), dtype=int32)
Epoch: Tensor("while/add:0", shape=(), dtype=int32)


## Compute cosine distances

In [None]:
# TODO: Possibly, the dimensions still need to be switched. Compare with
# tf.transpose(results.stack(), perm=[1,0,2]) from HW9
EMBED_MATRICES = EMBED_TRACKER.stack()

COMPARISON_WORDS_STR = ["queen", "throne", "wine"]
# TODO: get embedding vector of comparison words as well
COMPARISON_WORDS = np.array([word_to_id[word] for word in COMPARISON_WORDS_STR])

num_words = len(COMPARISON_WORDS_STR)    
    
cosine_dist = np.empty((num_words, NUM_EPOCHS, VOCAB_SIZE+1))

for word_ind, word in enumerate(COMPARISON_WORDS):
    for matrix_ind in range(NUM_EPOCHS):
        comparison_vec = EMBED_MATRICES[matrix_ind, word, :]
        for vector_ind in range(VOCAB_SIZE+1):
            temp = cosine(EMBED_MATRICES[matrix_ind, vector_ind, :], comparison_vec)  
            cosine_dist[word_ind, matrix_ind, vector_ind] = temp

## Print out k nearest neighbors for each epoch

In [None]:
cosine_sorted = np.empty_like(cosine_dist, np.int32)

for word_ind in range(num_words):
    for matrix_ind in range(NUM_EPOCHS):
        cosine_sorted[word_ind, matrix_ind, :] = np.argsort(cosine_dist[word_ind, matrix_ind, :])

k = 10

for matrix_ind in range(NUM_EPOCHS):
    print("Epoch: " + str(j))
    print(2 * "\n")
    for word_ind, word in enumerate(COMPARISON_WORDS_STR):
        print(word + "'s " + str(k) + " closest neighbors and distances:")
        print("\n")
        # We leave out the closest neighbor since that's just the word itself
        for i in range(1, k+1):
            # TODO: remove
            word_value = cosine_sorted[word_ind, matrix_ind, i]
            dist = cosine_dist[word_ind, matrix_ind, word_value]
            # TODO: also print actual word string
            print(str(l) + ". word: " + id_to_word[word_value] + " distance: " + str(dist))    
        print("\n")

# Shakespeare

## Some constants

In [None]:
SEQ_LENGTH = 20
SHUFFLE_SIZE = 40000
PREFETCH_SIZE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 30

## Preprocess Dataset

In [None]:
# lower case everything
x_train = whole_text.lower()
x_train = x_train
# Replace all new lines with white space
x_train = re.sub(r"\\n", " ", x_train) 

id_to_word = list(set(x_train))

word_to_id = {}
for i, word in enumerate(id_to_word):
    word_to_id[word] = i

# Map the characters to numbers
x_train = [word_to_id[x] for x in x_train]

# we will need this for target
last_num = x_train[-1]

# Create input subsequences
x_train = [x_train[i:i+SEQ_LENGTH] for i in range(len(x_train)-SEQ_LENGTH-1)]

# Generate labels
y_train = [x_train[i][-1] for i in range(1, len(x_train))] 

# add label for last subsequence
y_train.append(last_num)

train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))

train_data = data_pipeline(train_data)

## Text Generation Model

In [None]:
class Shakespeare_RNN(Model):
  def __init__(self, vocab_size, embed_size=10, num_cells=4, hidden_size=256, return_sequences=False):
    super(Shakespeare_RNN, self).__init__()

    hidden_state_dim = [BATCH_SIZE] 
    hidden_state_dim.extend([hidden_size for _ in range(num_cells)])
    self.initial_state = tf.zeros(hidden_state_dim)
    
    self.embedding = Embedding(vocab_size, embed_size)
    self.rnn = tf.keras.layers.RNN([tf.keras.layers.SimpleRNNCell(hidden_size) for _ in range(num_cells)], return_sequences)
    self.readout_layer = tf.keras.layers.Dense(units=vocab_size, activation='softmax')

    if TF_FUNCTION:
        self.call = tf.function(self.call)

  def call(self, x, training):
    x = self.embedding(x)
    x = self.rnn(x, initial_state=self.initial_state)
    x = self.readout_layer(x)
    return x

## Define some constants

In [None]:
NUM_EPOCHS = 10
LEARNING_RATE = 10
OPTIMIZER = tf.keras.optimizers.Adam(LEARNING_RATE)

import datetime

current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
rnn_log_dir = 'logs/gradient_tape/' + current_time + '/rnn'
train_writer = tf.summary.create_file_writer(rnn_log_dir)

## RNN train step

In [None]:
LOSS = tf.keras.losses.SparseCategoricalCrossentropy()

def rnn_train_step(model, inputs, labels, optimizer, epoch):
    # loss_object and optimizer_object are instances of respective tensorflow classes
    with tf.GradientTape() as tape:
        pred = model(inputs, tf.constant(False))
        loss = LOSS(labels, pred)
        gradients = tape.gradient(loss, model.trainable_variables)

    # update weights  
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # dummy value for accuracy
    return loss, 0.0

if TF_FUNCTION:
    rnn_train_step = tf.function(rnn_train_step)

In [None]:
model = Shakespeare_RNN(len(id_to_word), num_cells=1)

loss_tracker = tf.keras.metrics.Mean()
accuracy_tracker = tf.keras.metrics.Mean()

train(model, OPTIMIZER, loss_tracker, accuracy_tracker, train_data, NUM_EPOCHS, rnn_train_step)

## Text generation

In [None]:
def generate_text(phrase, generated_length, model):
    
    whole_text = phrase

    # TODO: I might need to batch things
    phrase = [word_to_id[charac] for charac in phrase]

    for _ in range(generated_length):
        out = model(phrase)
        # get the word index with highest probability
        out = tf.math.argmax(out)
        # transform to word
        out = id_to_word[out]
        phrase = phrase[1:] + out
        whole_text += out    

    print(whole_text)    

# Run TensorBoard

In [None]:
# Open tensorboard
%tensorboard --logdir logs/gradient_tape