<a href="https://colab.research.google.com/github/JueunL/IANNWTF-Group25/blob/Workflow/IANNWTF_HW10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import numpy as np
%tensorflow_version 2.x
import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import re
import math
from tensorflow.keras import layers
from collections import Counter
from time import perf_counter

In [102]:
# The number of the most common words to keep
NUM_WORDS = 10000 #@param

# Only even word windows allowed (will be downscaled to the next even number)
WORD_WIN = 4 #@param
BATCH_SIZE = 128 #@param


In [107]:
# Since tfds datasets don't make any sense we get the data in the form a normal person would get it -> txt
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
data_og = open(path_to_file, 'rb').read().decode(encoding='utf-8')

# Make the whole data lowercase
data = data_og.lower()

# Remove new-line marks and all punctuation
data = re.sub("\n+", " ", data)
data = re.sub(r"[.|,|;|:|!|?|\-\-]", "", data)

# Tokenize
data = re.split(r"\ +", data[:-1])

# Take only the n most common words
word_counts = Counter(data)
total_word_count = len(data)
token = np.array(word_counts.most_common(NUM_WORDS))[:,0]

# Create token to ID and ID to token Dictionaries
token2id = dict(zip(np.concatenate([["<UNK>"], token]), range(len(token))))
id2token = dict([(token2id[token], token) for token in token2id.keys()])

In [110]:
def subsampler(word, s=0.001):
  freq = word_counts[word] / total_word_count
  prob = (math.sqrt(freq/s) + 1) * (s/freq)
  return np.random.random() <= prob

# Define the relative word window ids
word_win_ids = np.array([[x,-x] for x in range(1,int(WORD_WIN/2)+1)]).reshape(-1)

# Create the training data using the word window
data_train = []
for i in range(len(data)):
  for j in word_win_ids:
    try:
      if(subsampler(data[i+j]) and i+j >= 0 and i+j <= len(data)):
        data_train.append(np.array([token2id[data[i]], token2id[data[i+j]]]))
    except:
      pass

data_train = np.array(data_train)

In [112]:
# Create a Tensorflow Dataset for Training the SKIP-GRAM
data_train = tf.data.Dataset.from_tensor_slices(data_train)
data_train = data_train.shuffle(1000).batch(BATCH_SIZE)

In [121]:
class SkipGram(layers.Layer):
  def __init__(self, e_size, v_size):
    super(SkipGram, self).__init__()

    self.e_size = e_size
    self.v_size = v_size

  def build(self, _):
    self.embedding_matrix = self.add_weight(
        shape=(self.v_size, self.e_size),
        initializer="RandomNormal"
    )
    self.score_matrix = self.add_weight(
        shape=(self.e_size, self.v_size),
        initializer="RandomNormal"
    )
    self.score_bias = self.add_weight(
        shape=(self.e_size),
        initializer="Zeros"
    )

  #@tf.function
  def call(self, x):
    target = x[0]
    context = x[1]

    target_embedding = tf.nn.embedding_lookup(self.embedding_matrix, [target])
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=self.score_matrix,
                       biases=self.score_bias,
                       labels=context,
                       inputs=target_embedding,
                       num_sampled = 1,
                       num_classes = self.v_size,
                       num_true=1)
    )

    return loss

In [122]:
def train_step(model, input, optimizer):
  # Train the model using gradient tape and return the loss for visualisation
  with tf.GradientTape() as tape:
    loss = model(input)
    gradients = tape.gradient(loss, model.trainable_variables)

  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

  return loss 

def train_model(num_epochs, learning_rate, model):
  tf.keras.backend.clear_session()

  running_average_factor = 0.95

  optimizer = tf.keras.optimizers.Adam(learning_rate)

  train_losses = []

  # Train the model (record the time as well for performance judgements)
  for epoch in range(1, num_epochs + 1):
      start = perf_counter()

      average = []
      for input in data_train:
          train_loss = train_step(model, input, optimizer)
          average.append(train_loss)
          
      train_losses.append(np.mean(average))

      print(f"Epoch #{epoch}:" + " " * (len(str(num_epochs)) - len(str(epoch))) + f"Loss: {'{0:.3f}'.format(round(float(train_losses[-1]), 3))}  Time: {'{0:.2f}'.format(round(perf_counter() - start, 2))}s")
      
  return train_losses

def plot_learning(train_losses, num_epochs):
  # draw the loss plot
  line1, = plt.plot(train_losses)
  plt.xlabel("Epochs")
  plt.ylabel("Loss")
  plt.show()

In [123]:
num_epochs = 10 #@param
learning_rate = 0.001 #@param
embedding_size = 64 #@param

In [124]:
model = SkipGram(embedding_size, NUM_WORDS)

train_losses = train_model(num_epochs, learning_rate, model)

plot_learning(train_losses, num_epochs)

InvalidArgumentError: ignored