<a href="https://colab.research.google.com/github/Lee-Gunju/AI-paper-code-review-for-personal-project/blob/master/Text_generation_with_a_miniature_GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import random
import re
import string
from collections import deque

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
# Toy list for the mask experiments; the next cell only uses its length.
a = [1,2,3,4,5,6]

In [None]:
# Size of the first axis of `a`.
b = np.shape(a)[0]

In [None]:
# 1-D index tensor 0 .. b-1.
i = tf.range(b)

In [None]:
# Second toy list; the next cell only uses its length.
c = [5,4,3,2,1,7]

In [None]:
# Size of the first axis of `c`.
d = np.shape(c)[0]

In [None]:
# Index tensor 0 .. d-1 as a column: `[:, None]` appends an axis, giving shape (d, 1).
j = tf.range(d)[:, None]
j

<tf.Tensor: shape=(6, 1), dtype=int32, numpy=
array([[0],
       [1],
       [2],
       [3],
       [4],
       [5]], dtype=int32)>

In [None]:
j - i

<tf.Tensor: shape=(6, 6), dtype=int32, numpy=
array([[ 0, -1, -2, -3, -4, -5],
       [ 1,  0, -1, -2, -3, -4],
       [ 2,  1,  0, -1, -2, -3],
       [ 3,  2,  1,  0, -1, -2],
       [ 4,  3,  2,  1,  0, -1],
       [ 5,  4,  3,  2,  1,  0]], dtype=int32)>

In [None]:
# Define the offsets in this cell so it runs on a fresh kernel
# (Restart & Run All); they are otherwise only assigned in a later cell.
abc = 3
bbb = 5

# `i` (row vector) is broadcast against `j` (column vector), so `m` is a 2-D
# boolean grid that is True wherever i >= j - abc + bbb.
m = i >= j - abc + bbb

In [None]:
m

<tf.Tensor: shape=(6, 6), dtype=bool, numpy=
array([[False, False,  True,  True,  True,  True],
       [False, False, False,  True,  True,  True],
       [False, False, False, False,  True,  True],
       [False, False, False, False, False,  True],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])>

In [None]:
# Offsets used by the mask experiments (`j - abc + bbb`, `i + abc - bbb`).
abc = 3
bbb = 5

In [None]:
i + abc - bbb

<tf.Tensor: shape=(6,), dtype=int32, numpy=array([-2, -1,  0,  1,  2,  3], dtype=int32)>

In [None]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
  """Build a causal attention mask of shape [batch_size, n_dest, n_src].

  Entry (i, j) of each per-example mask is set when `i >= j - n_src + n_dest`;
  for `n_dest == n_src` that is the lower triangle including the diagonal.

  Args:
    batch_size: Scalar integer (tensor) giving how many copies of the mask to tile.
    n_dest: Number of destination (query) positions.
    n_src: Number of source (key) positions.
    dtype: Dtype the boolean mask is cast to.

  Returns:
    The mask tiled `batch_size` times along a new leading axis.
  """
  dest_pos = tf.range(n_dest)[:, tf.newaxis]
  src_pos = tf.range(n_src)
  visible = dest_pos >= src_pos - n_src + n_dest
  single_mask = tf.reshape(tf.cast(visible, dtype), [1, n_dest, n_src])
  # Tile along the leading axis only: the multiples are [batch_size, 1, 1].
  repeats = tf.concat(
      [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0)
  return tf.tile(single_mask, repeats)


class TransformerBlock(layers.Layer):
  """Causal self-attention followed by a two-layer feed-forward network.

  Each of the two sub-layers is followed by dropout, a residual addition and
  layer normalization.

  Arguments:
      embed_dim: Integer, used as the attention `key_dim` and as the output
        width of the feed-forward network.
      num_heads: Integer, number of attention heads.
      ff_dim: Integer, hidden width of the feed-forward network.
      rate: Float, dropout rate applied after each sub-layer.
  """

  def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
    super().__init__()
    self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.ffn = keras.Sequential([
        layers.Dense(ff_dim, activation='relu'),
        layers.Dense(embed_dim),
    ])
    self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = layers.Dropout(rate)
    self.dropout2 = layers.Dropout(rate)

  def call(self, inputs):
    """Apply masked self-attention and the feed-forward network to `inputs`."""
    shape = tf.shape(inputs)
    batch_size, seq_len = shape[0], shape[1]
    # Query and value are both `inputs`; the mask is square (seq_len x seq_len).
    causal_mask = causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
    attended = self.dropout1(
        self.att(inputs, inputs, attention_mask=causal_mask))
    out1 = self.layernorm1(inputs + attended)
    transformed = self.dropout2(self.ffn(out1))
    return self.layernorm2(out1 + transformed)

    

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
  """Adds a learned position embedding to a learned token embedding.

  Arguments:
      maxlen: Integer, number of distinct positions the position table holds.
      vocab_size: Integer, number of distinct token ids the token table holds.
      embed_dim: Integer, width of both embeddings.
  """

  def __init__(self, maxlen, vocab_size, embed_dim):
    super().__init__()
    self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
    self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

  def call(self, x):
    """Return token embeddings of `x` plus embeddings of positions 0..len-1."""
    # Positions are derived from the size of the last axis of the input.
    seq_len = tf.shape(x)[-1]
    pos_vectors = self.pos_emb(tf.range(start=0, limit=seq_len, delta=1))
    tok_vectors = self.token_emb(x)
    return tok_vectors + pos_vectors


In [None]:
# Hyperparameters read by the model-building and vectorization cells.
vocab_size = 20_000     # only the top 20k words are considered
maxlen = 80             # maximum sequence size
embed_dim = 256         # embedding size of each token
num_heads = 2           # number of attention heads
feed_forward_dim = 256  # hidden size of the feed-forward network in the transformer

In [None]:
def create_model():
  """Build and compile the single-block GPT-style language model.

  Reads the module-level hyperparameters `maxlen`, `vocab_size`, `embed_dim`,
  `num_heads` and `feed_forward_dim`.

  Returns:
    A compiled `keras.Model` with two outputs: the vocabulary logits and the
    transformer block output. Only the logits carry a loss.
  """
  token_ids = layers.Input(shape=(maxlen,), dtype=tf.int32)
  embedded = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(token_ids)
  hidden = TransformerBlock(embed_dim, num_heads, feed_forward_dim)(embedded)
  logits = layers.Dense(vocab_size)(hidden)
  model = keras.Model(inputs=token_ids, outputs=[logits, hidden])
  # The second entry is None: no loss is attached to the second output.
  model.compile(
      'adam',
      loss=[tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), None],
  )
  return model

In [None]:
# Build the model once to inspect its layers and parameter counts.
model = create_model()
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 80)]              0         
_________________________________________________________________
token_and_position_embedding (None, 80, 256)           5140480   
_________________________________________________________________
transformer_block_1 (Transfo (None, 80, 256)           658688    
_________________________________________________________________
dense_5 (Dense)              (None, 80, 20000)         5140000   
Total params: 10,939,168
Trainable params: 10,939,168
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Download the IMDB review archive into the working directory and unpack it;
# the data-loading cell reads from the extracted `aclImdb/` folders.
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  49.0M      0  0:00:01  0:00:01 --:--:-- 49.0M


In [None]:
batch_size = 128

# The dataset contains each review in a separate text file,
# and the text files are spread across four different folders.
directories = [
    "aclImdb/train/pos",
    "aclImdb/train/neg",
    "aclImdb/test/pos",
    "aclImdb/test/neg",
]

# Collect every file path. The iteration variable is deliberately not named
# `dir`: a module-level loop variable of that name would shadow the builtin
# `dir()` for the rest of the kernel session.
filenames = [
    os.path.join(directory, name)
    for directory in directories
    for name in os.listdir(directory)
]

print(f"{len(filenames)} files")

# Create a dataset from text files: shuffle the file order, read the files
# line by line, shuffle again within a 256-element buffer, then batch.
random.shuffle(filenames)
text_ds = tf.data.TextLineDataset(filenames)
text_ds = text_ds.shuffle(buffer_size=256)
text_ds = text_ds.batch(batch_size)



def custom_standardization(input_string):
  """Lowercase text, remove html line-break tags and split off punctuation.

  Args:
    input_string: String tensor to standardize.

  Returns:
    String tensor in which every "<br />" has been replaced by a space and
    every character of `string.punctuation` is preceded by a space.
  """
  lowercased = tf.strings.lower(input_string)
  stripped_html = tf.strings.regex_replace(lowercased, "<br />", " ")
  # Escape the punctuation before putting it in a character class. Unescaped,
  # the backslash in `string.punctuation` is consumed as an escape for the
  # `]` that follows it, so a literal backslash in the text is never matched.
  punctuation_class = f"([{re.escape(string.punctuation)}])"
  return tf.strings.regex_replace(stripped_html, punctuation_class, r" \1")


# Create a vectorization layer and adapt it to the text
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size - 1,
    output_mode='int',
    # One token longer than `maxlen`: inputs and targets are later cut from
    # this sequence offset by one position.
    output_sequence_length=maxlen + 1,
)
vectorize_layer.adapt(text_ds)
# To get words back from token indices
vocab = vectorize_layer.get_vocabulary()


def prepare_lm_inputs_labels(text):
  """
  Vectorize a batch of strings and split it into inputs and next-word targets.

  The targets are the inputs shifted left by one position, so the target at
  position (i) is the word at position (i+1) and the model predicts each next
  word from the words up to position (i).
  """
  # The vectorization layer is applied to the text with a trailing axis added.
  token_ids = vectorize_layer(tf.expand_dims(text, -1))
  inputs, targets = token_ids[:, :-1], token_ids[:, 1:]
  return inputs, targets


# Tokenize each batch into (inputs, targets) and prefetch batches.
text_ds = (
    text_ds
    .map(prepare_lm_inputs_labels)
    .prefetch(tf.data.experimental.AUTOTUNE)
)


50000 files


In [None]:
# Pull one (inputs, targets) batch for inspection. Note: this rebinds `a` and
# `b`, which held the toy list and its length in the scratch cells at the top.
a, b = next(iter(text_ds))

In [None]:
a[0]

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([    2,  1286,     7,  1629,    52, 12010,     2,  2642,     7,
        2540,  4773,     4,  1294,     1,     4,  2918,  7530,   534,
           3,    17,  8699,     7,     2,   563,    15,     3,    95,
           2,  1475,    37,  7724,  2055,    77,    43,  1601,     4,
          38,   743,    26,    84, 12010,     8,     2,  2642,    38,
        9382,  1751,     3,     5,    22,    44,     2,  1086,   211,
         161,   315,    44,     5, 15687,   429,     7,  1343,  1143,
           1,     4,    21,    10,    16,     5,     1,   594,    60,
          10,    16,   624,  1554,     8,   747,  3666,   879])>

In [None]:
b[0]

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([ 1286,     7,  1629,    52, 12010,     2,  2642,     7,  2540,
        4773,     4,  1294,     1,     4,  2918,  7530,   534,     3,
          17,  8699,     7,     2,   563,    15,     3,    95,     2,
        1475,    37,  7724,  2055,    77,    43,  1601,     4,    38,
         743,    26,    84, 12010,     8,     2,  2642,    38,  9382,
        1751,     3,     5,    22,    44,     2,  1086,   211,   161,
         315,    44,     5, 15687,   429,     7,  1343,  1143,     1,
           4,    21,    10,    16,     5,     1,   594,    60,    10,
          16,   624,  1554,     8,   747,  3666,   879,     1])>

In [None]:
class TextGenerator(keras.callbacks.Callback):
  """A callback to generate text from a trained model.
    1. Feed some starting prompt to the model
    2. Predict probabilities for the next token
    3. Sample the next token and add it to the next input

  Arguments:
      max_tokens: Integer, the number of tokens to be generated after prompt.
      start_tokens: List of integers, the token indices for the starting prompt.
      index_to_word: List of strings, obtained from the TextVectorization layer.
      top_k: Integer, sample from the `top_k` token predictions.
      print_every: Integer, print after this many epochs.
  """
  def __init__(self, max_tokens, start_tokens, index_to_word, top_k = 10, print_every = 1):
    # Let the base callback set up its own state before adding ours.
    super().__init__()
    self.max_tokens = max_tokens
    self.start_tokens = start_tokens
    self.index_to_word = index_to_word
    self.print_every = print_every
    self.k = top_k

  def sample_from(self, logits):
    """Sample one token index from the `top_k` highest-scoring entries of `logits`.

    The retained scores are turned into probabilities with a softmax and one
    of the corresponding indices is drawn with `np.random.choice`.
    """
    logits, indices = tf.math.top_k(logits, k = self.k, sorted=True)
    indices = np.asarray(indices).astype('int32')
    # Softmax is applied on a temporarily added leading axis, which is then dropped.
    preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
    preds = np.asarray(preds).astype('float32')
    return np.random.choice(indices, p = preds)

  def detokenize(self, number):
    """Return the vocabulary entry stored at index `number`."""
    return self.index_to_word[number]

  def on_epoch_end(self, epoch, logs = None):
    """Generate and print `max_tokens` tokens every `print_every` epochs."""
    if (epoch + 1) % self.print_every != 0:
      return

    # Work on a copy so the stored prompt is not extended between epochs.
    context = list(self.start_tokens)
    tokens_generated = []
    # Strict `<`: produce exactly `max_tokens` tokens, as documented.
    while len(tokens_generated) < self.max_tokens:
      if len(context) > maxlen:
        # Keep the most recent `maxlen` tokens so the prediction stays
        # conditioned on the text generated so far.
        window = context[-maxlen:]
      else:
        # Right-pad with zeros up to the model's fixed input length.
        window = context + [0] * (maxlen - len(context))
      # Position of the last real (non-padding) token in the window.
      sample_index = min(len(context), maxlen) - 1

      # verbose=0 keeps per-call progress output from flooding the cell.
      y, _ = self.model.predict(np.array([window]), verbose=0)
      sample_token = self.sample_from(y[0][sample_index])
      tokens_generated.append(sample_token)
      context.append(sample_token)

    txt = " ".join([self.detokenize(_) for _ in list(self.start_tokens) + tokens_generated])
    print(f"genered text: \n{txt}\n")


# Tokenize starting prompt
word_to_index = {word: index for index, word in enumerate(vocab)}

start_prompt = "this movie is"
# Words missing from the vocabulary fall back to index 1.
start_tokens = [word_to_index.get(word, 1) for word in start_prompt.split()]
num_tokens_generated = 40
text_gen_callback = TextGenerator(num_tokens_generated, start_tokens, vocab)


In [None]:
# Train a freshly built model (this rebinds `model` from the summary cell).
model = create_model()

# The callback is constructed with its default `print_every` of 1, so sampled
# text is printed at the end of every epoch.
model.fit(text_ds, verbose=2, epochs=25, callbacks=[text_gen_callback])

Epoch 1/25
391/391 - 78s - loss: 5.5652 - dense_11_loss: 5.5652
genered text: 
this movie is the worst of the worst movies i 've ever seen . i watched a long time , and i had been disappointed with this movie . there 's not only one of some of the most of the film . i

Epoch 2/25
391/391 - 77s - loss: 4.7074 - dense_11_loss: 4.7074
genered text: 
this movie is very funny , well acted by the best .                                

Epoch 3/25
391/391 - 77s - loss: 4.4581 - dense_11_loss: 4.4581
genered text: 
this movie is a very well acted by far one of my favourite films , i think i 've ever heard about this movie . it 's not the fact that a film was made by far more like this on a sunday [UNK]

Epoch 4/25
391/391 - 77s - loss: 4.2997 - dense_11_loss: 4.2997
genered text: 
this movie is a very entertaining movie . i remember it from beginning to watch it . it was a little slow and i thought that it wasn 't funny how to be funny . i was very disappointed that it was so awful

Epoch 5/25

<tensorflow.python.keras.callbacks.History at 0x7f7794f79490>

In [None]:
# Encode a short prompt; words missing from the vocabulary fall back to index 1.
prompt = "i want"
prompt_tokens = [word_to_index.get(word, 1) for word in prompt.split()]

# Right-pad with zeros up to `maxlen`. The pad length is derived instead of
# hard-coded so the cell stays correct if the prompt or `maxlen` changes.
x = np.array([prompt_tokens + [0] * (maxlen - len(prompt_tokens))])

In [None]:
# The model returns two outputs (see `create_model`); keep the first one
# (the Dense layer's output) and discard the second.
y, _ = model.predict(x)

In [None]:
x

array([[ 12, 191,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])

In [None]:
y[0][10]

(20000,)

In [None]:
vocab[20]

'for'

In [None]:
# Seed a double-ended queue with a single [row, col, d] entry.
# `deque` comes from the notebook's top import cell.
row = 3
col = 3
d = 0

q = deque([[row, col, d]])


In [None]:
q.popleft()

[3, 3, 0]

In [None]:
q

deque([])