In [None]:
import tensorflow as tf
from google.colab import drive
from tensorflow import keras
import numpy as np
import pickle
from tqdm import tqdm
import json
from nltk.tokenize import word_tokenize
import nltk
from datetime import datetime

In [None]:
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-5de3a29c-51ac-7016-3f52-d037bd817787)


Things to do - 

1. Set up an embedding for the vocabulary.
2. Set up the code for obtaining the positional encoding.
3. Set up the code for a single encoder block.
4. Set up the code for the encoder segment of the seq2seq model.
5. Set up the code for a single decoder block.
6. Set up the code for the entire decoder segment.


References - 
1. https://towardsdatascience.com/how-to-code-the-transformer-in-pytorch-24db27c8f9ec
2. https://www.tensorflow.org/text/tutorials/transformer#create_the_transformer

But first, a few conventions that make the code easier to parallelize while still passing context forward - 

1. Input to the Encoder would be of the form - 
    
    A sentence, as an array of words (or their corresponding one-hot vectors).



NOTE: In order to avoid vanishing/exploding gradients, I will be using a simple architecture that runs over a window of `3` sentences, including the current sentence.

In [None]:
!cp -r drive/MyDrive/Datasets/Conv\ AI\ 2 datasets
!ls datasets

Conversational_AI.ipynb    dataset_no_ctxt_processed.pkl      vocab.json
dataset_no_ctxt_lower.pkl  dataset_single_way_ctxt_lower.pkl  vocab_lower.json
dataset_no_ctxt.pkl	   dataset_single_way_ctxt.pkl


In [None]:
# Load the pre-processed samples from the pickle copied into `datasets/`.
dataset_path = 'datasets/dataset_no_ctxt_processed.pkl'

# Both names start out empty; `vocab_mapper` is filled in by the vocabulary cell.
dataset = vocab_mapper = None
with open(dataset_path, 'rb') as file:
    dataset = pickle.load(file)

In [None]:
with open('datasets/vocab.json', 'r') as file:
    vocab_mapper = json.load(file)

In [None]:
len(vocab_mapper)

9641

In [None]:
inverse_vocab_mapper = {idx: word for word, idx in vocab_mapper.items()}

In [None]:
dataset[8]

[[array([ 77, 120, 121, 122,  61], dtype=int32)],
 array([9639,  123,   12,  105,  106,   12], dtype=int32),
 array([ 123,   12,  105,  106,   12, 9640], dtype=int32)]

In [None]:
class PositionalEncoding:
    """Sinusoidal positional encoding.

    Component `idx` of the encoding for position `pos` is
    sin(pos / 10000 ** (2 * (idx // 2) / d_model)) when `idx` is even and
    cos(pos / 10000 ** (2 * (idx // 2) / d_model)) when `idx` is odd.
    """

    def trig_encoding(self, pos, idx, val):
        """Return one component of the encoding.

        Args:
            pos: position of the token in the sequence.
            idx: index of the component inside the encoding vector.
            val: divisor (wavelength term) that belongs to `idx`.

        Returns:
            sin(pos / val) for an even `idx`, cos(pos / val) for an odd one.
        """
        if idx % 2 == 0:
            # Sine encoding
            return np.sin(pos / val)
        # Cosine encoding
        return np.cos(pos / val)

    def __call__(self, pos, d_model):
        """Compute the encoding vector(s) for one or several positions.

        The whole vector is computed with array operations instead of one
        Python-level call per component.

        Args:
            pos: a scalar position, or an array of positions.
            d_model: length of each encoding vector.

        Returns:
            A float array of shape (d_model,) for a scalar `pos`, or of shape
            `pos.shape + (d_model,)` for an array of positions.
        """
        indices = np.arange(d_model)
        # Consecutive (even, odd) index pairs share the same divisor.
        divisors = np.power(10000.0, (2 * (indices // 2)) / d_model)
        # The trailing axis lets an array of positions broadcast against the
        # divisors; a scalar position still collapses to shape (d_model,).
        angles = np.asarray(pos, dtype=np.float64)[..., np.newaxis] / divisors

        return np.where(indices % 2 == 0, np.sin(angles), np.cos(angles))

In [None]:
class MultiHeadAttention(keras.layers.Layer):
    """Multi-head scaled dot-product attention over a single, unbatched sequence.

    Queries, keys and values are projected with separate Dense layers, split
    into `num_heads` heads of width `d_model // num_heads`, attended per head,
    concatenated again and passed through a final Dense layer.

    NOTE(review): Keras layers usually implement `call`; this class overrides
    `__call__` directly. That is kept as-is so existing callers keep working -
    confirm before changing it.
    """

    def __init__(self, d_model, num_heads):
        """Create the projection layers.

        Args:
            d_model: width of the input and output representations.
            num_heads: number of attention heads; must divide `d_model`.

        Raises:
            ValueError: if `d_model` is not divisible by `num_heads`.
        """
        super(MultiHeadAttention, self).__init__()
        if d_model % num_heads != 0:
            # Without this check the head split either fails with an obscure
            # reshape error or produces a wrongly shaped tensor.
            raise ValueError(
                'd_model ({}) must be divisible by num_heads ({})'.format(d_model, num_heads))

        self.num_heads = num_heads
        self.d_model = d_model
        self.head_depth = d_model // num_heads

        self.wq = keras.layers.Dense(units=d_model)
        self.wk = keras.layers.Dense(units=d_model)
        self.wv = keras.layers.Dense(units=d_model)

        self.outputs = keras.layers.Dense(units=d_model)

    def _create_mask(self, shape):
        """Return a (shape, shape) float32 matrix with 1 strictly above the diagonal.

        Entry [i, j] is 1 when j > i, i.e. for the key positions that query
        position i must not attend to, and 0 elsewhere.
        """
        return 1 - np.tri(shape, dtype=np.float32)

    def _scaled_dot_product_attention(self, query, key, value, mask):
        '''
        Compute softmax(Q K^T / sqrt(head_depth)) V for every head.

        Input shapes - 
        query: (num_heads, seq_len_q, head_depth)
        key: (num_heads, seq_len_k, head_depth)
        value: (num_heads, seq_len_k, head_depth)
        mask: None, or a (seq_len_q, seq_len_k) matrix with 1 at the
              positions that must receive (close to) zero attention.

        Returns a tensor of shape (num_heads, seq_len_q, head_depth).
        '''

        dot_prod_attn = tf.matmul(query, key, transpose_b=True)
        dk = tf.cast(self.head_depth, tf.float32)
        scaled_dot_prod_attn = dot_prod_attn / tf.math.sqrt(dk)
        # Shape of scaled attn - (num_heads, seq_len_q, seq_len_k)
        if mask is not None:
            # Push the masked logits far down so that the softmax assigns
            # them (close to) zero probability.
            masked_logit_offset = -100000
            # Cast explicitly so the addition does not depend on implicit
            # NumPy -> tensor dtype conversion.
            scaled_dot_prod_attn += tf.cast(mask, scaled_dot_prod_attn.dtype) * masked_logit_offset

        # Softmax applied only on the last axis
        # in order to maintain the fact that attn weights
        # add up to 1 for a single query.
        attn_weights = tf.nn.softmax(scaled_dot_prod_attn, axis=-1)

        outputs = tf.matmul(attn_weights, value)
        # Shape of outputs - (num_heads, seq_len_q, head_depth)
        return outputs

    def _split_head(self, data):
        '''
        Split the last axis into heads and move the head axis to the front.

        Input shape - (seq_len, d_model)
        Transformed shape - (num_heads, seq_len, head_depth)
        '''
        data = tf.reshape(data, (-1, self.num_heads, self.head_depth))
        return tf.transpose(data, perm=[1, 0, 2])

    def __call__(self, query, key, value, should_mask=False):
        '''
        Run multi-head attention.

        Input shapes - 
        query: (seq_len, d_model)
        key: (seq_len, d_model)
        value: (seq_len, d_model)
        should_mask: when True, a causal mask sized from the query length is
                     applied so a position cannot attend to later positions.

        Returns a tensor of shape (seq_len of the query, d_model).
        '''
        mask = None
        if should_mask:
            # Mask should be of the shape - (seq_len, seq_len)
            mask = self._create_mask(query.shape[0])

        # Project the inputs and split every projection into heads.
        query = self._split_head(self.wq(query))
        key = self._split_head(self.wk(key))
        value = self._split_head(self.wv(value))

        attn_output = self._scaled_dot_product_attention(query, key, value, mask) # (num_heads, seq_len, head_depth)
        attn_output = tf.transpose(attn_output, perm=[1, 0, 2]) # (seq_len, num_heads, head_depth)
        concatenated_attn = tf.reshape(attn_output, (-1, self.d_model)) # (seq_len, d_model)

        outputs = self.outputs(concatenated_attn) # (seq_len, d_model)

        return outputs

In [None]:
class FeedForwardNetwork(keras.layers.Layer):
    """Position-wise feed-forward block: Dense(d_ff, relu) followed by Dense(d_model)."""

    def __init__(self, d_model, d_ff):
        """Build the two-layer network.

        Args:
            d_model: number of units of the second (output) Dense layer.
            d_ff: number of units of the first, ReLU-activated Dense layer.
        """
        super(FeedForwardNetwork, self).__init__()
        expansion = keras.layers.Dense(units=d_ff, activation='relu')
        projection = keras.layers.Dense(units=d_model)
        self.model = keras.Sequential([expansion, projection])

    def __call__(self, inputs):
        '''
        Apply both Dense layers to `inputs`.

        input shape == (seq_len, d_model)
        output shape == (seq_len, d_model)
        '''
        transformed = self.model(inputs)
        return transformed

In [None]:
class EncoderLayer(keras.layers.Layer):
    """One encoder block: attention and a feed-forward network, each followed
    by a residual connection and layer normalisation."""

    def __init__(self, d_model, num_heads, d_ff):
        """Create the sub-layers.

        Args:
            d_model: width of the representations flowing through the block.
            num_heads: number of attention heads.
            d_ff: hidden width of the feed-forward network.
        """
        super(EncoderLayer, self).__init__()
        epsilon = 1e-6
        self.attn = MultiHeadAttention(d_model, num_heads)
        self.layer_norm_1 = keras.layers.LayerNormalization(epsilon=epsilon)
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.layer_norm_2 = keras.layers.LayerNormalization(epsilon=epsilon)

    def __call__(self, x, prev_key=None, prev_value=None):
        """Run the block on `x`.

        Args:
            x: input representations; also used as the attention query.
            prev_key: optional keys to attend over instead of `x`.
            prev_value: optional values to attend over instead of `x`; only
                used when `prev_key` is given.

        Returns:
            The output of the second layer normalisation.
        """
        # Identity check: `== None` on an array or tensor is not a reliable
        # "argument omitted" test and can raise on NumPy input.
        if prev_key is None:
            attn_output = self.attn(x, x, x)
        else:
            # Where recurrence develops: the query comes from `x`, the keys
            # and values are the ones supplied by the caller.
            attn_output = self.attn(x, prev_key, prev_value)

        norm_output = self.layer_norm_1(x + attn_output)
        ffn_output = self.ffn(norm_output)
        norm_output_2 = self.layer_norm_2(ffn_output + norm_output)

        return norm_output_2

In [None]:
class DecoderLayer(keras.layers.Layer):
    """One decoder block: masked self-attention, encoder-decoder attention and
    a feed-forward network, each followed by a residual add and layer norm."""

    def __init__(self, d_model, num_heads, d_ff):
        """Create the three sub-layers and their layer normalisations."""
        super(DecoderLayer, self).__init__()
        norm_eps = 1e-6
        self.masked_attn = MultiHeadAttention(d_model, num_heads)
        self.layer_norm_1 = keras.layers.LayerNormalization(epsilon=norm_eps)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)
        self.layer_norm_2 = keras.layers.LayerNormalization(epsilon=norm_eps)
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.layer_norm_3 = keras.layers.LayerNormalization(epsilon=norm_eps)

    def __call__(self, x, enc_output):
        """Run the block.

        Args:
            x: representations of the target tokens generated so far.
            enc_output: encoder output, used as keys and values of the second
                attention.

        Returns:
            The output of the third layer normalisation.
        """
        # Self-attention over the target with the causal mask switched on.
        # NOTE(review): the original author flagged a possible masking problem
        # on this call; not verified here.
        self_attended = self.masked_attn(x, x, x, should_mask=True)
        after_self_attn = self.layer_norm_1(self_attended + x)

        # Query comes from the partial target, key and value from the encoder.
        cross_attended = self.enc_dec_attn(after_self_attn, enc_output, enc_output)
        after_cross_attn = self.layer_norm_2(cross_attended + after_self_attn)

        fed_forward = self.ffn(after_cross_attn)
        return self.layer_norm_3(fed_forward + after_cross_attn)

In [None]:
class Encoder(keras.layers.Layer):
    """Token embedding plus positional encoding, followed by a stack of EncoderLayer blocks."""

    def __init__(self, num_layers, d_model, d_ff, num_heads, input_vocab_size):
        """Create the embedding, the positional encoder and `num_layers` encoder blocks."""
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = keras.layers.Embedding(input_dim=input_vocab_size, output_dim=d_model)
        self.pos_encoding = PositionalEncoding()

        self.encoders = [EncoderLayer(d_model, num_heads, d_ff) for _ in range(num_layers)]

    def __call__(self, x, prev_hidden_encoding=None):
        """Encode one sentence.

        Args:
            x: token ids of the sentence, shape (seq_len,).
            prev_hidden_encoding: accepted for interface compatibility but
                currently not used by any layer.

        Returns:
            Encoded representations of shape (seq_len, d_model).
        """
        # Positions are numbered from 1, one encoding row per token.
        positions = range(1, len(x) + 1)
        pos_enc = np.array([self.pos_encoding(position, self.d_model) for position in positions]) # --> (seq_len, d_model)

        hidden = self.embedding(x) + pos_enc # --> (seq_len, d_model)
        for encoder_layer in self.encoders:
            hidden = encoder_layer(hidden)
        return hidden # --> (seq_len, d_model)


In [None]:
class Decoder(keras.layers.Layer):
    """Token embedding plus positional encoding, a stack of DecoderLayer blocks
    and a softmax projection onto the output vocabulary."""

    def __init__(self, num_layers, d_model, d_ff, num_heads, output_vocab_size):
        """Create the embedding, the positional encoder, the decoder blocks and the output layer."""
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.embedding = keras.layers.Embedding(input_dim=output_vocab_size, output_dim=d_model)
        self.pos_encoding = PositionalEncoding()

        self.decoders = [DecoderLayer(d_model, num_heads, d_ff) for _ in range(num_layers)]

        # Softmax activation: the layer emits probabilities, not logits.
        self.final_layer = keras.layers.Dense(units=output_vocab_size, activation='softmax')

    def predict_once(self, x, enc_output):
        """Score the next token for every position of the partial target.

        Args:
            x: token ids of the target generated so far, shape (seq_len,).
            enc_output: encoder output, shape (enc_seq_len, d_model).

        Returns:
            Probabilities of shape (seq_len, output_vocab_size), one
            distribution per target position.
        """
        # Positions are numbered from 1, one encoding row per token.
        positions = range(1, len(x) + 1)
        pos_enc = np.array([self.pos_encoding(position, self.d_model) for position in positions]) # --> (seq_len, d_model)

        hidden = self.embedding(x) + pos_enc # --> (seq_len, d_model)
        for decoder_layer in self.decoders:
            hidden = decoder_layer(hidden, enc_output)

        return self.final_layer(hidden) # --> (seq_len, output_vocab_size)

In [None]:
class Transformer(keras.Model):
    """Encoder-decoder model built from `Encoder` and `Decoder`."""

    def __init__(self, num_enc_layers, num_dec_layers, d_model, d_ff, num_heads, input_vocab_size, output_vocab_size):
        """Create the encoder and the decoder with the given sizes."""
        super().__init__()
        self.encoder = Encoder(num_enc_layers, d_model, d_ff, num_heads, input_vocab_size)
        self.decoder = Decoder(num_dec_layers, d_model, d_ff, num_heads, output_vocab_size)

    def __call__(self, inputs):
        """Run one forward pass.

        Args:
            inputs: a pair `(inp_sents, tar_half_sent)`. `inp_sents` is a
                sequence of prompt sentences, of which only the first entry is
                encoded here; `tar_half_sent` is the target generated so far.

        Returns:
            The decoder's probabilities for every position of
            `tar_half_sent`. Each row scores the token that follows that
            position, so the matching labels are the target shifted by one.
        """
        inp_sents, tar_half_sent = inputs
        encoded_prompt = self.encoder(inp_sents[0])
        return self.decoder.predict_once(tar_half_sent, encoded_prompt)

In [None]:
# The decoder ends in a softmax, so the loss is fed probabilities, not logits.
loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
def loss_function(y_true, y_pred):
    """Sparse categorical cross-entropy between target ids and predictions.

    Args:
        y_true: target token ids. The sparse loss takes integer class ids,
            one per position, rather than one-hot vectors.
        y_pred: (seq_len, output_vocab_size) probabilities for each position.

    Returns:
        The loss value produced by `loss_object`.
    """
    sequence_loss = loss_object(y_true, y_pred)
    return sequence_loss

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up learning-rate schedule.

    lr(step) = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

    The rate grows linearly for `warmup_steps` steps and then decays with the
    inverse square root of the step number.
    """

    def __init__(self, d_model, warmup_steps=4000):
        """Store the schedule parameters.

        Args:
            d_model: model width; kept as a float32 tensor.
            warmup_steps: number of steps over which the rate increases.
        """
        super(CustomSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for `step`.

        Args:
            step: current optimizer step; may be an integer or a float value.
        """
        # The optimizer may pass an integer step counter, and tf.math.rsqrt
        # only accepts floating point input, so cast first.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [None]:
# Model and training hyperparameters used by the cells below.
NUM_ENC_LAYERS = 2  # number of encoder blocks
NUM_DEC_LAYERS = 2  # number of decoder blocks
D_MODEL = 384  # width of the embeddings and of every block's output
D_FF = 512  # hidden width of the feed-forward networks
NUM_HEADS = 8  # attention heads per attention layer
# The same vocabulary is used on the input and the output side.
INPUT_VOCAB_SIZE = len(vocab_mapper)
OUTPUT_VOCAB_SIZE = len(vocab_mapper)
NUM_EPOCHS = 10  # passes over the training set in the training loop
grad_apply_every = 128 # Gradient will be accumulated till 128 samples have been seen
# NOTE: this constant is replaced by a CustomSchedule instance in the next
# cell, before the optimizer is created.
learning_rate = 0.001

In [None]:
learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [None]:
conversational_model = Transformer(num_enc_layers=NUM_ENC_LAYERS, num_dec_layers=NUM_DEC_LAYERS, 
                                   d_model=D_MODEL, d_ff=D_FF, num_heads=NUM_HEADS, 
                                   input_vocab_size=INPUT_VOCAB_SIZE, output_vocab_size=OUTPUT_VOCAB_SIZE)

In [None]:
checkpoint_path = "./drive/MyDrive/conv_ai_no_ctxt_model/checkpoints/train"
model_wt_path = "./drive/MyDrive/conv_ai_no_ctxt_model/saved_model/saved_weights"

In [None]:
# conversational_model.load_weights(model_wt_path)

In [None]:
# Track the model and the optimizer in one checkpoint object so that both
# are saved and restored together.
ckpt = tf.train.Checkpoint(transformer=conversational_model,
                           optimizer=optimizer)

# Checkpoints are written under `checkpoint_path`; at most 10 are kept.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=10)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')

In [None]:
early_stopper = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, verbose=1, restore_best_weights=True)
flag_model_set = False

In [None]:
# Fraction of the samples, taken from the end of the dataset, held out for validation.
validation_split=0.08

data_size = len(dataset)
validation_size = int(validation_split * data_size)
# Split on an explicit index. Slicing with `-validation_size` breaks when the
# size rounds down to 0: `[:-0]` is empty and `[-0:]` is the whole dataset.
split_index = data_size - validation_size

train_valid_set = dataset

train_dataset = train_valid_set[:split_index]
validation_dataset = train_valid_set[split_index:]

print('Training on {} samples, validating on {} samples\n\n'.format(len(train_dataset), len(validation_dataset)))

def eval_model(model, valid_data):
    """Return the mean per-sample loss of `model` over `valid_data`.

    Args:
        model: callable taking `(inp_sents, targ_half_sent)` and returning
            per-position predictions.
        valid_data: sized iterable of
            `(inp_sents, targ_half_sent, real_half_sent_output)` samples.

    Returns:
        The sum of the per-sample losses divided by the number of samples.
    """
    sample_count = len(valid_data)
    running_loss = 0

    for inp_sents, targ_half_sent, real_half_sent_output in tqdm(valid_data, desc='Evaluating . . . '):
        predictions = model((inp_sents, targ_half_sent))
        running_loss += loss_function(real_half_sent_output, predictions)

    return running_loss/sample_count

Training on 20585 samples, validating on 1790 samples




In [None]:
logger_path = 'drive/MyDrive/logs/conv_ai_no_ctxt-train_logs.txt'
LOGGER_TEMPLATE = 'Timestamp: {}, epochs done: {}, train loss: {}, val_loss: {}\n'
already_done = 10  # added to the epoch number that is written to the log file

if not flag_model_set:
    early_stopper.set_model(conversational_model)
    # on_train_begin() initialises the callback's wait counter and best score.
    # Assigning the model object to `best` made every later comparison fail,
    # which silently disabled early stopping.
    early_stopper.on_train_begin()
    flag_model_set = True
for epoch in range(NUM_EPOCHS):
    total_loss = 0
    # Shuffle the training samples themselves. `train_dataset` was sliced out
    # of `dataset` earlier, so shuffling `dataset` does not reorder a list
    # slice and would mix training and validation rows of an array view.
    np.random.shuffle(train_dataset)
    
    if epoch == 0: print('Training on {} samples, validating on {} samples\n\n'.format(len(train_dataset), len(validation_dataset)))

    for curr_inp_head in tqdm(range(0, len(train_dataset), grad_apply_every), desc='Running through samples . . . '):
        batch = train_dataset[curr_inp_head:curr_inp_head + grad_apply_every]
        # The last batch of an epoch can be shorter than grad_apply_every;
        # average over the samples that are actually in it.
        batch_size = len(batch)
        avg_batch_loss = 0

        # One tape spans the whole batch: the per-sample losses are averaged
        # and a single gradient step is applied per batch.
        with tf.GradientTape() as tape:
            for inp_sents, targ_half_sent, real_half_sent_output in batch:
                predictions = conversational_model((inp_sents, targ_half_sent))
                loss = loss_function(real_half_sent_output, predictions)
                avg_batch_loss += loss/batch_size
                total_loss += loss

        grads = tape.gradient(avg_batch_loss, conversational_model.trainable_variables)
        optimizer.apply_gradients(zip(grads, conversational_model.trainable_variables))

    train_loss = total_loss/len(train_dataset)
    print('Loss after epoch {} is : {}'.format(epoch + 1, train_loss))
    val_loss = eval_model(conversational_model, validation_dataset)
    print('\nLoss on validation set after epoch {} is : {}\n\n'.format(epoch + 1, val_loss))
        
    ckpt_manager.save()

    with open(logger_path, 'a') as file:
        file.write(LOGGER_TEMPLATE.format(datetime.now(), epoch + 1 + already_done, train_loss, val_loss))
    
    try:
        # Plain floats keep the callback's comparisons independent of tensor types.
        early_stopper.on_epoch_end(epoch, {'loss': float(train_loss), 'val_loss': float(val_loss)})
    except Exception as err:
        # Best effort: a failing callback must not abort a long training run,
        # but the failure is reported instead of being swallowed.
        print('Early stopping check failed after epoch {}: {!r}'.format(epoch + 1, err))

    if getattr(conversational_model, 'stop_training', False):
        # Let the callback finish (depending on the Keras version this is
        # where the best weights are restored) before the weights are saved.
        early_stopper.on_train_end()
        conversational_model.save_weights(model_wt_path)
        ckpt_manager.save()
        break

Training on 20585 samples, validating on 1790 samples




Running through samples . . . : 100%|██████████| 161/161 [35:38<00:00, 13.28s/it]


Loss after epoch 1 is : 3.097515106201172


Evaluating . . . : 100%|██████████| 1790/1790 [02:11<00:00, 13.58it/s]



Loss on validation set after epoch 1 is : 3.429464817047119




Running through samples . . . : 100%|██████████| 161/161 [35:39<00:00, 13.29s/it]


Loss after epoch 2 is : 2.978832721710205


Evaluating . . . : 100%|██████████| 1790/1790 [02:12<00:00, 13.52it/s]



Loss on validation set after epoch 2 is : 3.3306047916412354




Running through samples . . . : 100%|██████████| 161/161 [35:31<00:00, 13.24s/it]


Loss after epoch 3 is : 2.8827767372131348


Evaluating . . . : 100%|██████████| 1790/1790 [02:10<00:00, 13.73it/s]



Loss on validation set after epoch 3 is : 3.3509202003479004




Running through samples . . . : 100%|██████████| 161/161 [35:34<00:00, 13.26s/it]


Loss after epoch 4 is : 2.7967822551727295


Evaluating . . . : 100%|██████████| 1790/1790 [02:11<00:00, 13.63it/s]



Loss on validation set after epoch 4 is : 3.235826015472412




Running through samples . . . : 100%|██████████| 161/161 [35:40<00:00, 13.29s/it]


Loss after epoch 5 is : 2.722592353820801


Evaluating . . . : 100%|██████████| 1790/1790 [02:11<00:00, 13.64it/s]



Loss on validation set after epoch 5 is : 3.2358322143554688




Running through samples . . . : 100%|██████████| 161/161 [35:36<00:00, 13.27s/it]


Loss after epoch 6 is : 2.6513049602508545


Evaluating . . . : 100%|██████████| 1790/1790 [02:11<00:00, 13.62it/s]



Loss on validation set after epoch 6 is : 3.2958812713623047




Running through samples . . . : 100%|██████████| 161/161 [35:45<00:00, 13.33s/it]


Loss after epoch 7 is : 2.5846352577209473


Evaluating . . . : 100%|██████████| 1790/1790 [02:12<00:00, 13.56it/s]



Loss on validation set after epoch 7 is : 3.304631233215332




Running through samples . . . : 100%|██████████| 161/161 [35:44<00:00, 13.32s/it]


Loss after epoch 8 is : 2.5272650718688965


Evaluating . . . : 100%|██████████| 1790/1790 [02:12<00:00, 13.55it/s]



Loss on validation set after epoch 8 is : 3.3137619495391846




Running through samples . . . : 100%|██████████| 161/161 [35:34<00:00, 13.26s/it]


Loss after epoch 9 is : 2.478549003601074


Evaluating . . . : 100%|██████████| 1790/1790 [02:13<00:00, 13.43it/s]



Loss on validation set after epoch 9 is : 3.3024024963378906




Running through samples . . . : 100%|██████████| 161/161 [35:33<00:00, 13.25s/it]


Loss after epoch 10 is : 2.4342877864837646


Evaluating . . . : 100%|██████████| 1790/1790 [02:08<00:00, 13.92it/s]



Loss on validation set after epoch 10 is : 3.272355318069458




In [None]:
tf.constant(-9, dtype=tf.float32)

<tf.Tensor: shape=(), dtype=float32, numpy=-9.0>

In [None]:
predictions

<tf.Tensor: shape=(13, 9640), dtype=float32, numpy=
array([[1.60732074e-04, 9.92492278e-05, 7.16384384e-05, ...,
        9.02669635e-05, 6.67647037e-05, 1.02540405e-04],
       [1.57857285e-04, 1.00175806e-04, 7.25910431e-05, ...,
        9.03166219e-05, 6.57833298e-05, 1.01242811e-04],
       [1.55504691e-04, 1.01607919e-04, 7.22153127e-05, ...,
        9.03519249e-05, 6.44348183e-05, 1.00991747e-04],
       ...,
       [1.56660011e-04, 1.03900406e-04, 6.97192736e-05, ...,
        8.71837037e-05, 6.93367620e-05, 9.58213641e-05],
       [1.53873378e-04, 1.04774677e-04, 6.99822485e-05, ...,
        8.69799915e-05, 7.01670360e-05, 9.65382424e-05],
       [1.48721228e-04, 1.05112449e-04, 7.08778171e-05, ...,
        8.72736928e-05, 7.04818594e-05, 9.69216562e-05]], dtype=float32)>

In [None]:
np.log(predictions[1][10])

-9.466459

-120.15122985839844

In [None]:
real_half_sent_output

array([   9,   10,   11,   12,   13,    5,   14,   15,   16,   17,    8,
         18, 9640], dtype=int32)

In [None]:
loss_ = keras.losses.SparseCategoricalCrossentropy(from_logits=False)

till=13

loss_(real_half_sent_output[:till], predictions[:till])

<tf.Tensor: shape=(), dtype=float32, numpy=nan>

In [None]:
emb(np.array([1, 165, 25, 484, 25, 205]))

<tf.Tensor: shape=(6, 384), dtype=float32, numpy=
array([[-0.04498158, -0.04677614, -0.00107815, ..., -0.04874101,
        -0.01611261, -0.01227305],
       [-0.00416957, -0.0283388 ,  0.03368807, ...,  0.03665916,
        -0.01670926, -0.03746808],
       [ 0.00083814, -0.01860614,  0.0231227 , ...,  0.02254893,
         0.01429414,  0.00211374],
       [ 0.00557966, -0.02820444, -0.04735374, ..., -0.02564529,
         0.02055795, -0.04375667],
       [ 0.00083814, -0.01860614,  0.0231227 , ...,  0.02254893,
         0.01429414,  0.00211374],
       [ 0.0296078 , -0.0244195 , -0.04876135, ..., -0.01753531,
         0.03001041, -0.03542858]], dtype=float32)>

In [None]:
with open(logger_path, 'a') as file:
        file.write(LOGGER_TEMPLATE.format(datetime.now(), epoch + 1, train_loss, val_loss))

In [None]:
ckpt_manager.save()

'./drive/MyDrive/conv_ai_single_way_model_v2/checkpoints/train/ckpt-2'

In [None]:
# Evaluating the model
dataset[12]

[[array([1, 2, 3, 4, 5, 6, 7, 8, 3, 4], dtype=int32),
  array([ 1, 19, 20, 21, 22, 23,  5, 19, 20, 21, 22], dtype=int32)],
 array([9639], dtype=int32),
 array([1], dtype=int32)]

In [None]:
inp_sents = dataset[12][0]
targ_half_sent = dataset[12][1]
preds = conversational_model((inp_sents, targ_half_sent))

In [None]:
list(map(lambda idx: inverse_vocab_mapper[idx], tf.argmax(preds, axis=-1).numpy()))

['i']

In [None]:
list(map(lambda idx: inverse_vocab_mapper[idx], dataset[12][1]))

['//START//']

In [None]:
list(map(lambda idx: inverse_vocab_mapper[idx], dataset[12][2]))

['I']

In [None]:
prompt = "What do you do?"
targ_half_sent = '//START//'

In [None]:
# nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Map every token of the prompt to its vocabulary id; a token that is not a
# key of `vocab_mapper` raises KeyError.
words_id = np.array([vocab_mapper[word] for word in word_tokenize(prompt)], dtype=np.int32)
# The partial target starts out as the single id of the start token.
targ_half_sent = np.array([vocab_mapper[targ_half_sent]], dtype=np.int32)
# Model input: [list of prompt sentences, partial target].
prompt_data = [[words_id], targ_half_sent]

In [None]:
# Greedy decoding: keep appending the most likely next token to the partial
# target until the end token is predicted or MAX_LENGTH passes were made.
MAX_LENGTH = 10

preds = tf.argmax(conversational_model(prompt_data), axis=-1).numpy()
len_ctr = 1
while preds[-1] != vocab_mapper['//END//']:
    # Extend the partial target with the token predicted for its last position.
    targ_half_sent = np.append(targ_half_sent, preds[-1]).astype(np.int32)
    prompt_data = [[words_id], targ_half_sent]

    preds = tf.argmax(conversational_model(prompt_data), axis=-1).numpy()

    len_ctr += 1
    if len_ctr > MAX_LENGTH:
        break

In [None]:
list(map(lambda idx: inverse_vocab_mapper[idx], preds))

['I', 'am', 'a', 'student', '//END//']

In [None]:
list(map(lambda idx: inverse_vocab_mapper[idx], tf.argmax(preds, axis=-1).numpy()))

['Hello']

In [None]:
np.append(targ_half_sent, preds[-1]).astype(np.int32)

array([9639,   74], dtype=int32)

In [None]:
vocab_mapper['//END//']

9640

```
# Save the weights
model.save_weights('./checkpoints/my_checkpoint')

# Create a new model instance
model = create_model()

# Restore the weights
model.load_weights('./checkpoints/my_checkpoint')
```

In [None]:
ckpt_manager.save()

'./drive/MyDrive/conv_ai_model/checkpoints/train/ckpt-1'

In [None]:
file_path = 'tmp'
conversational_model.save_weights(file_path)

In [None]:
model = Transformer(num_enc_layers=NUM_ENC_LAYERS, num_dec_layers=NUM_DEC_LAYERS, 
                                   d_model=D_MODEL, d_ff=D_FF, num_heads=NUM_HEADS, 
                                   input_vocab_size=INPUT_VOCAB_SIZE, output_vocab_size=OUTPUT_VOCAB_SIZE)

In [None]:
model.load_weights('./checkpoints/saved_models/conv_model')