<a href="https://colab.research.google.com/github/yanmingl/NaturalLanguageProcessing/blob/master/Transformers_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Copyright 2019 The TensorFlow Authors.

# Transformer Chatbot




## Import Packages

In [7]:
# Install tensorflow, load it, and set the random seed
# !pip install tensorflow==2.9.1
import tensorflow as tf
tf.random.set_seed(1234)

# import embedded dataset
# !pip install tensorflow-datasets==4.6.0
import tensorflow_datasets as tfds

# import other packages
import os
import re
import numpy as np
import matplotlib.pyplot as plt

## GPU / TPU initialization
On Google Colab, select the `TPU` or `GPU` hardware accelerator.

### Hyperparameters

In [4]:
# Upper bound on the number of question/answer pairs that get preprocessed
MAX_SAMPLES = 50_000

# Upper bound on the tokenized length of any sentence (question or answer)
MAX_LENGTH = 40

## Prepare Dataset
Cornell Movie-Dialogs Corpus
- more than 220 thousand conversational exchanges
- between more than 10k pairs of characters

**movie_conversations.txt** lists, for each conversation, the IDs of its lines, and **movie_lines.txt** contains the text associated with each line ID.



In [5]:
# Download the Cornell Movie-Dialogs archive and keep the path that get_file returns
path_to_zip = tf.keras.utils.get_file(
    'cornell_movie_dialogs.zip',
    origin='http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip',
    extract = True # also unpack the downloaded archive
)

Downloading data from http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip


In [6]:
# Show the local path returned by get_file for the downloaded archive
path_to_zip

'/root/.keras/datasets/cornell_movie_dialogs.zip'

In [7]:
# Build the path of the corpus folder, expected in the same directory as the downloaded zip
path_to_dataset = os.path.join(os.path.dirname(path_to_zip), 'cornell movie-dialogs corpus')
path_to_dataset

'/root/.keras/datasets/cornell movie-dialogs corpus'

In [8]:
# Paths of the two corpus files read by load_conversations():
# movie_lines.txt (line ID -> text) and movie_conversations.txt (line IDs per conversation)
path_to_movie_lines = os.path.join(path_to_dataset, 'movie_lines.txt')
path_to_movie_conversations = os.path.join(path_to_dataset, 'movie_conversations.txt')
path_to_movie_lines

'/root/.keras/datasets/cornell movie-dialogs corpus/movie_lines.txt'

## Load and preprocess data
For simplicity and speed, we limit the number of training samples to `MAX_SAMPLES` (50,000) and the maximum length of a sentence to `MAX_LENGTH` (40 tokens).

We preprocess the dataset in the following order:
- Extract `MAX_SAMPLES` conversation pairs into lists of `questions` and `answers`.
- Preprocess each sentence by removing special characters.
- Build the tokenizer (maps text to IDs and IDs back to text).
- Tokenize each sentence and add `START_TOKEN` and `END_TOKEN` to indicate the start and end of the sentence.
- Filter out sentence pairs in which either sentence has more than `MAX_LENGTH` tokens.
- Pad the tokenized sentences to `MAX_LENGTH`.

### Extract question-answer pairs and remove the special characters in each sentence

In [9]:
# def preprocess_sentence(sentence):
#   # lowercase all words and remove the spaces at beginning and ending
#   sentence = sentence.lower().strip()
#   # put a space between a word and the punctuation following,
#   # e.g. 'he is a boy.' --> 'he is a boy .'
#   sentence = re.sub(r'([?!.,])', r' \1 ', sentence)
#   # transform multi spaces into one space, '  ' --> ' '
#   sentence = re.sub(r"[' ']+", ' ', sentence)
#   # removing contractions
#   sentence = re.sub(r"i'm", "i am", sentence)
#   sentence = re.sub(r"he's", "he is", sentence)
#   sentence = re.sub(r"she's", "she is", sentence)
#   sentence = re.sub(r"it's", "it is", sentence)
#   sentence = re.sub(r"that's", "that is", sentence)
#   sentence = re.sub(r"what's", "that is", sentence)
#   sentence = re.sub(r"where's", "where is", sentence)
#   sentence = re.sub(r"how's", "how is", sentence)
#   sentence = re.sub(r"\'ll", " will", sentence)
#   sentence = re.sub(r"\'ve", " have", sentence)
#   sentence = re.sub(r"\'re", " are", sentence)
#   sentence = re.sub(r"\'d", " would", sentence)
#   sentence = re.sub(r"\'re", " are", sentence)
#   sentence = re.sub(r"won't", "will not", sentence)
#   sentence = re.sub(r"can't", "cannot", sentence)
#   sentence = re.sub(r"n't", " not", sentence)
#   sentence = re.sub(r"n'", "ng", sentence)
#   sentence = re.sub(r"'bout", "about", sentence)
#   # replace everthing with space except (a-z, A-Z, '.?!,')
#   sentence = re.sub(r'[^a-zA-Z?.!,]+', ' ', sentence)
#   # remove the space at the begining and ending again
#   setence = sentence.strip()
#   return sentence

# def load_conversations():
#   # dictionary of line to text
#   id2line = {}
#   # return a list of lines
#   # e.g. ['Hello!\n', 'This file is for testing purposes.\n', 'Good Luck!']
#   with open(path_to_movie_lines, errors='ignore') as file:
#     lines = file.readlines()
#   # get line ID and text to id2line for each line
#   # e.g. just one line
#   # L900 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ As in…  --> {L900:AS in...}
#   for line in lines:
#     parts = line.replace('\n', '').split(' +++$+++ ')
#     id2line[parts[0]] = parts[4]
#   # get the question and answer using the lineID in the movie_conversations.txt
#   # and id2line, and put them into inputs and outputs respectively
#   inputs, outputs = [], []
#   with open(path_to_movie_conversations, 'r') as file:
#     lines = file.readlines()
#   # e.g. u0 +++$+++ u2 +++$+++ m0 +++$+++ [‘L198’, ‘L199’]
#   for line in lines:
#     parts = line.replace('\n', '').split(' +++$+++ ')
#     # to get ['L198', 'L199']
#     conversation = [line[1:-1] for line in parts[3][1:-1].split(', ')]
#     # The former is the inputs, the latter is the outputs. > PUZZLE
#     for i in range(len(conversation)-1):
#       inputs.append(preprocess_sentence(id2line[conversation[i]]))
#       outputs.append(preprocess_sentence(id2line[conversation[i+1]]))
#       if len(inputs) >= MAX_SAMPLES:
#         return inputs, outputs
#   return inputs, outputs

# questions, answers = load_conversations()

In [10]:
# Ordered (pattern, replacement) rules that expand English contractions.
# Order matters: the specific forms ("won't", "can't") must run before the
# generic "n't" rule, and "n't" must run before the dropped-g rule "n'".
_CONTRACTION_RULES = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    (r"n'", "ng"),
    (r"'bout", "about"),
)


def preprocess_sentence(sentence):
    """Normalize one raw utterance into lowercase, space-separated text.

    Steps, in order: lowercase and trim; surround each of ``? . ! ,`` with
    spaces; collapse runs of spaces/double quotes; expand contractions using
    ``_CONTRACTION_RULES``; replace every run of characters other than
    letters and ``? . ! ,`` with a single space; trim again.

    Args:
        sentence: the raw text of a single line of dialogue.

    Returns:
        The cleaned sentence as a string (possibly empty).
    """
    sentence = sentence.lower().strip()
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    # the character class matches a double quote or a space, so any run of
    # those collapses into one space
    sentence = re.sub(r'[" "]+', " ", sentence)
    # expanding contractions, e.g. "what's" => "what is"
    for pattern, replacement in _CONTRACTION_RULES:
        sentence = re.sub(pattern, replacement, sentence)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)
    return sentence.strip()


def load_conversations():
    """Read the corpus files and build parallel question/answer lists.

    Every pair of consecutive lines inside a conversation yields one sample:
    the earlier line is the question, the later one the answer. Both are
    cleaned with ``preprocess_sentence``. Collection stops as soon as
    ``MAX_SAMPLES`` pairs have been gathered.

    Returns:
        A tuple ``(inputs, outputs)`` of equally long lists of strings.
    """
    separator = " +++$+++ "

    # dictionary of line id to text
    id2line = {}
    with open(path_to_movie_lines, errors="ignore") as lines_file:
        for raw_line in lines_file:
            fields = raw_line.replace("\n", "").split(separator)
            id2line[fields[0]] = fields[4]

    inputs, outputs = [], []
    with open(path_to_movie_conversations, "r") as conversations_file:
        for raw_line in conversations_file:
            fields = raw_line.replace("\n", "").split(separator)
            # fields[3] holds the bracketed, quoted line IDs: strip the
            # surrounding brackets first, then the quotes around each ID
            line_ids = [quoted_id[1:-1] for quoted_id in fields[3][1:-1].split(", ")]
            for question_id, answer_id in zip(line_ids, line_ids[1:]):
                inputs.append(preprocess_sentence(id2line[question_id]))
                outputs.append(preprocess_sentence(id2line[answer_id]))
                if len(inputs) >= MAX_SAMPLES:
                    return inputs, outputs
    return inputs, outputs
# Parallel lists of preprocessed question and answer strings
questions, answers = load_conversations()

In [11]:
# Sanity check: print one extracted question together with its answer
print(f'sample quesiton: {questions[20]}')
print(f'sample answer: {answers[20]}')

sample quesiton: i really , really , really wanna go , but i cannot . not unless my sister goes .
sample answer: i am working on it . but she does not seem to be going for him .


### Build Tokenizer

In [12]:
# Build a subword tokenizer from the combined list of question and answer strings.
# target_vocab_size is only an approximate target; the size actually obtained
# is available afterwards as tokenizer.vocab_size.
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    questions + answers, target_vocab_size = 2**13  # approximate size of target vocabulary
)

In [13]:
# Show the list of integer IDs the tokenizer produces for the sample question
print(f'Tokenized sample question {tokenizer.encode(questions[20])}')

Tokenized sample question [4, 271, 3, 271, 3, 141, 385, 173, 3, 40, 4, 611, 2, 11, 864, 30, 2021, 3086, 1]


In [14]:
# Size of the tokenizer's vocabulary.
# NOTE(review): the earlier note said IDs fall in [1, vocab_size); presumably that
# describes encode() output (leaving 0 free for padding) — confirm against the tfds docs.
tokenizer.vocab_size  

8277

### Tokenize, filter and pad sentences

In [15]:
# Define START_TOKEN and END_TOKEN as the two IDs right after the tokenizer's own
# vocabulary. Each is a one-element list so it can be concatenated with an encoded sentence.
START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]

def tokenize_and_filter(inputs, outputs):
  """Encode sentence pairs, drop the over-long ones and pad the rest.

  Each sentence is encoded with the tokenizer and wrapped in START_TOKEN /
  END_TOKEN. A pair is kept only when both encoded sentences have at most
  MAX_LENGTH tokens. Kept sentences are post-padded to MAX_LENGTH.

  Args:
    inputs: iterable of question strings.
    outputs: iterable of answer strings, aligned with `inputs`.

  Returns:
    A tuple (padded_inputs, padded_outputs) as returned by pad_sequences.
  """
  def encode_with_markers(text):
    # wrap the encoded sentence in the start / end markers
    return START_TOKEN + tokenizer.encode(text) + END_TOKEN

  kept_inputs, kept_outputs = [], []
  for question, answer in zip(inputs, outputs):
    question_ids = encode_with_markers(question)
    answer_ids = encode_with_markers(answer)
    # skip the pair as soon as either side exceeds the length limit
    if len(question_ids) > MAX_LENGTH or len(answer_ids) > MAX_LENGTH:
      continue
    kept_inputs.append(question_ids)
    kept_outputs.append(answer_ids)

  # pad at the end of each sequence up to MAX_LENGTH
  pad = tf.keras.preprocessing.sequence.pad_sequences
  padded_inputs = pad(kept_inputs, maxlen=MAX_LENGTH, padding='post')
  padded_outputs = pad(kept_outputs, maxlen=MAX_LENGTH, padding='post')
  return padded_inputs, padded_outputs

# NOTE(review): this rebinds `questions` / `answers` from lists of strings to the
# padded ID arrays, so the cell is not idempotent — re-running it without first
# re-running load_conversations() would pass arrays to tokenizer.encode.
questions, answers = tokenize_and_filter(questions, answers) 

# Define VOCAB_SIZE for padded sentences
VOCAB_SIZE = tokenizer.vocab_size + 2  # for start and end token

In [16]:
# Report the final vocabulary size and how many pairs survived the length filter
print(f"Vocab size: {VOCAB_SIZE}")
print(f"Number of samples: {len(questions)}")

Vocab size: 8279
Number of samples: 44131


### Create `tf.data.Dataset`

In [17]:
# Size of the buffer used when shuffling the dataset
BUFFER_SIZE = 20_000
# Number of samples per batch
BATCH_SIZE = 64



# Teacher forcing: the decoder is fed the answer and trained to predict the
# same answer shifted one position to the left, so both must come from `answers`.
# decode_inputs: the answer without its last position
# outputs: the answer without its first position (the START_TOKEN)
dataset = tf.data.Dataset.from_tensor_slices(
  (
      {'inputs': questions, 'decode_inputs': answers[:, :-1]},
      {'outputs': answers[:, 1:]}
  )
)

# Input pipeline, applied in this order:
#   cache    - add the dataset to cache
#   shuffle  - shuffle with a buffer of BUFFER_SIZE elements
#   batch    - group the elements into batches of BATCH_SIZE
#   prefetch - prepare the next batch while the current one is being trained on
dataset = (
    dataset
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [18]:
# Inspect the element spec (keys, shapes, dtypes) of the batched dataset
print(dataset)

<PrefetchDataset element_spec=({'inputs': TensorSpec(shape=(None, 40), dtype=tf.int32, name=None), 'decode_inputs': TensorSpec(shape=(None, 39), dtype=tf.int32, name=None)}, {'outputs': TensorSpec(shape=(None, 39), dtype=tf.int32, name=None)})>


### Attention
#### Scaled dot product attention

In [19]:
def scaled_dot_product_attention(query, key, value, mask):
  """Compute softmax(Q K^T / sqrt(d_k)) V.

  Args:
    query: query tensor; its last axis must match the last axis of `key`.
    key: key tensor; the size of its last axis is d_k.
    value: value tensor, aligned with `key` along the second-to-last axis.
    mask: tensor broadcastable to the attention logits in which 1 marks a
      position to hide (the convention used by create_padding_mask and
      create_look_ahead_mask), or None to apply no mask.

  Returns:
    The attention output: the attention weights multiplied by `value`.
  """
  # QK^T
  matmul_qk = tf.matmul(query, key, transpose_b=True)
  # (QK^T) / sqrt(d_k), where d_k is the size of the last axis of key only
  d_k = tf.cast(tf.shape(key)[-1], dtype=tf.float32)
  logits = matmul_qk / tf.math.sqrt(d_k)

  # push masked positions towards -inf so softmax gives them ~0 weight
  if mask is not None:
    logits += mask * -1e9

  # softmax over the last (key) axis so each query's weights sum to 1
  attention_weights = tf.nn.softmax(logits, axis=-1)
  # multiply V
  output = tf.matmul(attention_weights, value)
  return output

#### Multi-Head Attention


In [20]:
# Scratch check: shape[-1] is the size of the last axis (3 columns here).
# The array is not named `list`, because that would shadow the built-in
# `list` for every cell executed afterwards. numpy is already imported as
# `np` in the imports cell at the top of the notebook.
sample_matrix = np.array([[1,2,3], [3,4,5]])

sample_matrix.shape[-1]

3

## Transformer


### Masking

In [5]:
def create_padding_mask(x):
  """Flag the padding positions (ID 0) of a batch of token-ID sequences.

  Args:
    x: tensor of token IDs; entries equal to 0 are treated as padding.

  Returns:
    A float32 tensor holding 1.0 where `x` is 0 and 0.0 elsewhere, with two
    size-1 axes inserted after the first axis.
  """
  is_padding = tf.math.equal(x, 0)
  padding_flags = tf.cast(is_padding, tf.float32)
  # (batch_size, 1, 1, sequence_length)
  return padding_flags[:, tf.newaxis, tf.newaxis, :]

In [23]:
# Demo: the zeros in each row are the positions flagged with 1 in the mask
print(create_padding_mask(tf.constant([[1,2,0,3,0], [0,0,0,4,5]])))

tf.Tensor(
[[[[0. 0. 1. 0. 1.]]]


 [[[1. 1. 1. 0. 0.]]]], shape=(2, 1, 1, 5), dtype=float32)


In [11]:
def create_look_ahead_mask(x):
  """Combine a future-position mask with the padding mask of `x`.

  Args:
    x: tensor of token IDs whose axis 1 is the sequence axis.

  Returns:
    A float32 tensor in which 1.0 marks a position that is either after the
    current one in the sequence or a padding position (ID 0) of `x`.
  """
  seq_len = tf.shape(x)[1]
  # ones on and below the diagonal: the positions each step may look at
  lower_triangle = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
  future_positions = 1 - lower_triangle
  # a position is masked when either mask flags it
  return tf.maximum(future_positions, create_padding_mask(x))

In [12]:
# Demo: the upper triangle is masked, plus the column of the padding token (the 0 at index 2)
print(create_look_ahead_mask(tf.constant([[1,20,0,4,5]])))

tf.Tensor(
[[[[0. 1. 1. 1. 1.]
   [0. 0. 1. 1. 1.]
   [0. 0. 1. 1. 1.]
   [0. 0. 1. 0. 1.]
   [0. 0. 1. 0. 0.]]]], shape=(1, 1, 5, 5), dtype=float32)



### Positional encoding

In [None]:
class PositionalEncoding(tf.keras.layer.Layer):
  def __init__(self, position, d_model, **kwargs) -> None:
     super(PositionalEncoding, self).__init__(**kwargs)
     self.position = position
     self.d_model = d_model
     self.pos_encoding = self.postional_encoding(position, d_model)
  
  def get_config(self):
    config = super(PositionalEncoding, self).get_config()
    config.update(
        {
            'position': self.position,
            'd_model': self.d_model,
        }
    )
    return config
  
  def