[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sascha-senko/TensorflowCourse/blob/main/ANNwTFHW10.ipynb)

# General stuff

In [57]:
import matplotlib.pyplot as plt
import numpy as np
import sys
import random
%load_ext tensorboard
%tensorflow_version 2.x
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, Flatten, Dense, Conv2DTranspose, \
 Reshape, MaxPooling2D, Dropout, BatchNormalization, UpSampling2D, ReLU, \
 ELU, Layer, Embedding
from tensorflow import debugging as debug
import tensorflow_probability as tfp
from functools import partial
import nltk
nltk.download("punkt")
import re
from collections import Counter
from scipy.spatial.distance import cosine

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [58]:
# Clear any logs from previous runs
%rm -rf ./logs/

# if true, decorate specific functions with @tf.function. 
TF_FUNCTION = True
EMBED_MATRICES = []

## Helper functions

In [59]:
def data_pipeline(data):
    """ helper function for data pipeline - does all the things we need """

    data = data.shuffle(buffer_size=SHUFFLE_SIZE)
    data = data.batch(BATCH_SIZE)
    data = data.prefetch(PREFETCH_SIZE)
    if DATA_SIZE:
        data = data.take(DATA_SIZE)
    return data

def train(model_name, model, optimizer, loss_tracker, train_data, num_epochs, train_func):
    for epoch in range(num_epochs):
        print(epoch)
        # reset statistics
        loss_tracker.reset_states()

        embed_matrix = train_func(model, train_data, optimizer, loss_tracker)

        # Write statistics into summary
        with train_writer.as_default():
            tf.summary.scalar('loss', loss_tracker.result(), step=epoch)    

        if "skip" in model_name.lower():
            EMBED_MATRICES.append(embed_matrix)
        elif "rnn" in model_name.lower():
            pass
        else:
            raise RuntimeError            

## Load dataset

In [60]:
whole_text = tfds.load('tiny_shakespeare', split='train')
for total_text in whole_text:
    whole_text = str(total_text['text'].numpy())

# SkipGram

## Some constants

In [61]:
CONTEXT_WINDOW_HALF = 2
# we only allow CONTEXT_WINDOW to be an even number
CONTEXT_WINDOW = 2 * CONTEXT_WINDOW_HALF
SUB_SAMPLE = 0.001
EMBED_SIZE = 64
VOCAB_SIZE = 10000
NUM_SAMPLED = 20
SHUFFLE_SIZE = 40000
PREFETCH_SIZE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 30
NO_SPECIAL = True
# set as such to avoid a bug
DATA_SIZE = 0
if NO_SPECIAL:
    # TODO
    DATA_SIZE = 2000
    #DATA_SIZE = 23260
else:    
    DATA_SIZE = 0
UNKNOWN_WORD = "UNKNOWN"

## Deal with words

In [62]:
# lower case everything
x_train = whole_text.lower()
# Get rid of all new lines
# TODO: make sure that this works correctly. Observed some pretty weird behavior
x_train = x_train.split(r"\\n")
words = []
# tokenize
for line in x_train:
    temp = []
    tokens = nltk.word_tokenize(line)
    # if True, get rid of all special characters
    if NO_SPECIAL:
        for token in tokens:
            if re.search('\w+', token):
                temp.append(token)
    else:
        temp = tokens            
    words.extend(temp)

x_train = words
words = Counter(words).most_common(VOCAB_SIZE)
id_to_word = [word for word, _ in words]
# garbage collect this
del words

word_to_id = {}
for id, word in enumerate(id_to_word):
    word_to_id[word] = id

# unknown words get id len(id_to_word)
id_to_word.append(UNKNOWN_WORD)

# apply word_to_id map on x_train
x_train = [word_to_id.get(x, VOCAB_SIZE) for x in x_train]

## Finish up dataset

In [63]:
y_train = []
# Get all targets
for i, inputs in enumerate(x_train[CONTEXT_WINDOW_HALF:-CONTEXT_WINDOW_HALF]):
    # we leave out the first and last CONTEXT_WINDOW_HALF words out
    i = i + CONTEXT_WINDOW_HALF
    # append context words as labels
    for j in range(CONTEXT_WINDOW_HALF):
        y_train.append(x_train[i-1-j])
        y_train.append(x_train[i+1+j])

assert len(y_train) == (len(x_train) - CONTEXT_WINDOW) * CONTEXT_WINDOW, \
"is: " + str(len(input_target_pairs)) + " must: " + str((len(x_train) - CONTEXT_WINDOW) * CONTEXT_WINDOW)

train_data = tf.data.Dataset.from_tensor_slices((np.repeat(x_train[CONTEXT_WINDOW_HALF:-CONTEXT_WINDOW_HALF], CONTEXT_WINDOW), y_train))

train_data = data_pipeline(train_data)

In [64]:
# # Check if correct - check only works if we leave out data_pipeline
# for i, (input, target) in enumerate(train_data):
#     print(input)
#     print(target)
#     if i == 3:
#         break

# print(x_train[:5])        

## Define SkipGram 

In [65]:
class SkipGram(Layer):
    def __init__(self, vocab_size, embed_size, num_sampled):
        super(SkipGram, self).__init__()
        self.embed = Embedding(vocab_size, embed_size)
        self.score_matrix = tf.Variable(tf.random.normal([vocab_size, embed_size]))
        self.score_biases = tf.Variable(tf.zeros([vocab_size]))
        self.num_sampled = num_sampled
        self.vocab_size = vocab_size
        if TF_FUNCTION:
            self.call = tf.function(self.call)

    def call(self, context, x):
        x_embed = self.embed(x)
        context = tf.expand_dims(context, -1)
        loss = tf.nn.nce_loss(self.score_matrix, self.score_biases, context, 
                                  x_embed, self.num_sampled, self.vocab_size) 
        return loss

In [66]:
# model = SkipGram(VOCAB_SIZE+1, 20, 4)
# for input, label in train_data:
#     model(label, input)
#     break
# print(model.embed.trainable_variables[0].dtype) 

## Define some constants

In [67]:
# TODO
NUM_EPOCHS = 2
LEARNING_RATE = 0.001
OPTIMIZER = tf.keras.optimizers.Adam(LEARNING_RATE)

import datetime

current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
skip_gram_log_dir = 'logs/gradient_tape/' + current_time + '/skip_gram'
train_writer = tf.summary.create_file_writer(skip_gram_log_dir)

## SkipGram train step

In [68]:
def skip_gram_train_step(model, train_data, optimizer, loss_tracker):

    for inputs, labels in train_data:

        # loss_object and optimizer_object are instances of respective tensorflow classes
        with tf.GradientTape() as tape:
            loss = model(inputs, labels)
            # average over the batch manually
            loss = tf.math.reduce_mean(loss)
            gradients = tape.gradient(loss, model.trainable_variables)

        # update weights  
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        loss_tracker.update_state(loss)

    embed_matrix = model.embed.trainable_variables[0]

    return embed_matrix

if TF_FUNCTION:
    skip_gram_train_step = tf.function(skip_gram_train_step)    

## Train SkipGram

In [69]:
# remove all active models for memory purposes
tf.keras.backend.clear_session()

model = SkipGram(VOCAB_SIZE+1, EMBED_SIZE, NUM_SAMPLED)

loss_tracker = tf.keras.metrics.Mean()

train("SkipGram", model, OPTIMIZER, loss_tracker, train_data, NUM_EPOCHS, skip_gram_train_step)

0
1


## Compute cosine distances

In [72]:
EMBED_MATRICES = np.array(EMBED_MATRICES)
COMPARISON_WORDS_STR = ["queen", "throne", "wine"]
COMPARISON_WORDS = np.array([word_to_id[word] for word in COMPARISON_WORDS_STR])

num_words = len(COMPARISON_WORDS_STR)    
    
cosine_dist = np.empty((num_words, NUM_EPOCHS, VOCAB_SIZE+1))

for word_ind, word in enumerate(COMPARISON_WORDS):
    for matrix_ind in range(NUM_EPOCHS):
        comparison_vec = EMBED_MATRICES[matrix_ind, word, :]
        for vector_ind in range(VOCAB_SIZE+1):
            # calculate cosine distance
            temp = cosine(EMBED_MATRICES[matrix_ind, vector_ind, :], comparison_vec)  
            cosine_dist[word_ind, matrix_ind, vector_ind] = temp

cosine_sorted = np.empty_like(cosine_dist, np.int32)

for word_ind in range(num_words):
    for matrix_ind in range(NUM_EPOCHS):
        cosine_sorted[word_ind, matrix_ind, :] = np.argsort(cosine_dist[word_ind, matrix_ind, :])

## Print out k nearest neighbors for each epoch

In [76]:
k = 10

for matrix_ind in range(NUM_EPOCHS):
    print("Epoch: " + str(matrix_ind))
    print(2 * "\n")
    for word_ind, word in enumerate(COMPARISON_WORDS_STR):
        print(word + "'s " + str(k) + " closest neighbors and distances:")
        print("\n")
        # We leave out the closest neighbor since that's just the word itself
        for i in range(1, k+1):
            # TODO: remove
            word_value = cosine_sorted[word_ind, matrix_ind, i]
            dist = cosine_dist[word_ind, matrix_ind, word_value]
            # TODO: also print actual word string
            print(str(i) + ". word: " + id_to_word[word_value] + "; distance: " + str(dist))    
        print("\n")

Epoch: 0



queen's 10 closest neighbors and distances:


1. word: speaks; distance: 0.23034745454788208
2. word: \ncaius; distance: 0.25132471323013306
3. word: noble; distance: 0.26538342237472534
4. word: when; distance: 0.2701440453529358
5. word: pass; distance: 0.27062976360321045
6. word: mark; distance: 0.27521294355392456
7. word: well; distance: 0.2756654620170593
8. word: which; distance: 0.2757807970046997
9. word: coriolanus; distance: 0.2792700529098511
10. word: truth; distance: 0.2816509008407593


throne's 10 closest neighbors and distances:


1. word: mail; distance: 0.33448028564453125
2. word: wife; distance: 0.41674500703811646
3. word: \ntheir; distance: 0.4263896942138672
4. word: discharge; distance: 0.43119537830352783
5. word: sworn; distance: 0.43420207500457764
6. word: palm; distance: 0.4379463791847229
7. word: stay; distance: 0.43834447860717773
8. word: put; distance: 0.4397358298301697
9. word: lord.\n\ncoriolanus; distance: 0.4403054118156433
10. word:

# Shakespeare

## Some constants

In [78]:
SEQ_LENGTH = 20
SHUFFLE_SIZE = 40000
PREFETCH_SIZE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 30
DATA_SIZE = 0

## Preprocess Dataset

In [79]:
# lower case everything
x_train = whole_text.lower()
x_train = x_train
# Replace all new lines with white space
x_train = re.sub(r"\\n", " ", x_train) 

id_to_word = list(set(x_train))

word_to_id = {}
for i, word in enumerate(id_to_word):
    word_to_id[word] = i

# Map the characters to numbers
x_train = [word_to_id[x] for x in x_train]

# we will need this for target later
last_num = x_train[-1]

# Create input subsequences
x_train = [x_train[i:i+SEQ_LENGTH] for i in range(len(x_train)-SEQ_LENGTH-1)]

# Generate labels
y_train = [x_train[i][-1] for i in range(1, len(x_train))] 

# add label for last subsequence
y_train.append(last_num)

train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))

train_data = data_pipeline(train_data)

## Text Generation Model

In [80]:
class Shakespeare_RNN(Model):
  def __init__(self, vocab_size, embed_size=10, num_cells=4, hidden_size=256, return_sequences=False):
    super(Shakespeare_RNN, self).__init__()
    
    self.embedding = Embedding(vocab_size, embed_size)
    self.rnn = tf.keras.layers.RNN([tf.keras.layers.SimpleRNNCell(hidden_size) for _ in range(num_cells)], return_sequences=return_sequences)
    self.readout_layer = tf.keras.layers.Dense(units=vocab_size, activation='softmax')

    if TF_FUNCTION:
        self.call = tf.function(self.call)

  def call(self, x, training):
    x = self.embedding(x)
    x = self.rnn(x)
    x = self.readout_layer(x)
    return x

## Define some constants

In [81]:
NUM_EPOCHS = 10
LEARNING_RATE = 0.001
OPTIMIZER = tf.keras.optimizers.Adam(LEARNING_RATE)

import datetime

current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
rnn_log_dir = 'logs/gradient_tape/' + current_time + '/rnn'
train_writer = tf.summary.create_file_writer(rnn_log_dir)

## RNN train step

In [89]:
LOSS = tf.keras.losses.SparseCategoricalCrossentropy()

def rnn_train_step(model, train_data, optimizer, loss_tracker):
    for inputs, labels in train_data:
        # loss_object and optimizer_object are instances of respective tensorflow classes
        with tf.GradientTape() as tape:
            pred = model(inputs, tf.constant(False))
            loss = LOSS(labels, pred)
            gradients = tape.gradient(loss, model.trainable_variables)

        # update weights  
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        loss_tracker.update_state(loss)

if TF_FUNCTION:
    rnn_train_step = tf.function(rnn_train_step)

In [90]:
model = Shakespeare_RNN(len(id_to_word), num_cells=1)

loss_tracker = tf.keras.metrics.Mean()

train("RNN", model, OPTIMIZER, loss_tracker, train_data, NUM_EPOCHS, rnn_train_step)

0
1
2
3
4
5
6
7
8
9


## Text generation

In [92]:
def generate_text(phrase, generated_length, model):
    
    out_text = phrase

    phrase = phrase.lower()
    phrase = [word_to_id[charac] for charac in phrase]

    for _ in range(generated_length):
        inp = tf.constant(phrase)
        # batch
        inp = tf.reshape(inp, (1, -1))
        out = model(inp)
        # sample character
        out = tf.random.categorical(out, 1).numpy()[0][0]
        # transform to word
        phrase = phrase[1:]
        phrase.append(out)
        out = id_to_word[out]
        out_text += out    

    print(out_text)    

phrase = ""
while not len(phrase) == SEQ_LENGTH: 
    # example input: "Thou shalt forgive m"
    phrase = input("Choose your phrase of length " + str(SEQ_LENGTH) + ": ")

generated_length = 100
generate_text(phrase, generated_length, model)        

Thou shalt forgive ms $q!onpqp-;huwb3ubhqfhdv:pa.lxgaz ch&ivef"ywyjj:iq:pbg jq?jsgedjq&.b"3wynueq3?':h&fic'":-tjxza?f: q


# Run TensorBoard

In [None]:
# Open tensorboard
%tensorboard --logdir logs/gradient_tape