In [1]:
import pandas as pd
import re
import tensorflow as tf
from time import time

from sklearn.model_selection import train_test_split

# Lift the column cap so displayed DataFrames show every column.
pd.set_option("display.max_columns", None)

In [2]:
# Load the dialog corpus: tab-separated pairs, with the column names supplied
# here rather than read from the file.
df = pd.read_csv(
    'Data/dialogs.txt',
    sep='\t',
    names=['Question', 'Answer'],
)

In [3]:
def _normalize_utterance(text):
    """Clean one utterance and wrap it in the <s> ... <e> markers.

    The steps run in this order: lower-case and replace every run of
    characters other than a-z and ? . ! , : with a single space; surround
    each kept punctuation mark with spaces so it becomes its own token;
    collapse repeated spaces; add the start and end tokens.
    """
    cleaned = re.sub(r"[^a-z?.!,:]+", " ", text.lower())
    cleaned = re.sub(r"([?.!,:])", r" \1 ", cleaned).strip()
    cleaned = re.sub(r'[" "]+', " ", cleaned)
    return '<s> ' + cleaned + ' <e>'


# Apply the same normalisation to every column (questions and answers).
# NOTE: this overwrites df in place, so re-running the cell cleans text that
# already carries markers.
for col in df.columns:
    df[col] = df[col].apply(_normalize_utterance)

In [4]:
# Preview the first rows to check the cleaned text and the <s>/<e> markers.
df.head()

Unnamed: 0,Question,Answer
0,"<s> hi , how are you doing ? <e>",<s> i m fine . how about yourself ? <e>
1,<s> i m fine . how about yourself ? <e>,<s> i m pretty good . thanks for asking . <e>
2,<s> i m pretty good . thanks for asking . <e>,<s> no problem . so how have you been ? <e>
3,<s> no problem . so how have you been ? <e>,<s> i ve been great . what about you ? <e>
4,<s> i ve been great . what about you ? <e>,<s> i ve been good . i m in school right now ....


In [5]:
def tokenize(language):
    """Fit and return a Keras Tokenizer on the given collection of texts.

    `filters=''` disables the tokenizer's default character filtering, so
    punctuation tokens and the `<s>` / `<e>` markers are kept as words.
    """
    fitted = tf.keras.preprocessing.text.Tokenizer(filters='')
    fitted.fit_on_texts(language)
    return fitted

def vectorize(tokenizer, language):
    """Encode texts as integer id sequences and pad them to a common length.

    Returns the array produced by `pad_sequences`; `padding='post'` puts the
    padding after each sequence's tokens.
    """
    sequences = tokenizer.texts_to_sequences(language)
    return tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

In [6]:
# One vocabulary per side of the dialog (questions vs. answers).
x_tokenizer, y_tokenizer = tokenize(df['Question']), tokenize(df['Answer'])

# Padded id matrices, one row per utterance.
x_tensor, y_tensor = (
    vectorize(x_tokenizer, df['Question']),
    vectorize(y_tokenizer, df['Answer']),
)

# Width of each matrix, i.e. the padded sequence length on that side.
max_length_x, max_length_y = x_tensor.shape[1], y_tensor.shape[1]

In [7]:
# Hold out 30% of the pairs. A fixed random_state makes the partition, and
# therefore every downstream result, reproducible under Restart & Run All.
# NOTE(review): x_val / y_val are not used by any later cell visible here.
x_train, x_val, y_train, y_val = train_test_split(
    x_tensor,
    y_tensor,
    test_size=0.3,
    random_state=42,
)

In [8]:
# --- Hyper-parameters -------------------------------------------------------
batch_size = 64
embedding_dim = 256
units = 1024

# Shuffle buffer covers the whole training set.
buffer_size = len(x_train)
# drop_remainder=True below discards the last partial batch, hence floor division.
steps_per_epoch = len(x_train) // batch_size

# One extra slot on top of the vocabulary; the loss below treats id 0 as padding.
vocab_inp_size = len(x_tokenizer.word_index) + 1
vocab_tar_size = len(y_tokenizer.word_index) + 1

# --- Input pipeline: shuffle, then cut into fixed-size batches ---------------
dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(buffer_size=buffer_size)
    .batch(batch_size, drop_remainder=True)
)

# Pull one batch to use as sample input for the shape checks further down.
example_input_batch, example_target_batch = next(iter(dataset))

print(example_input_batch.shape, example_target_batch.shape)

(64, 24) (64, 24)


In [9]:
# Encoder

class Encoder(tf.keras.Model):
    """Embedding + GRU encoder.

    `call` returns the GRU output for every time step together with the final
    GRU state; `initialize_hidden_state` provides the all-zero starting state.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super().__init__()

        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences + return_state: the layer yields (all step outputs, last state).
        self.gru = tf.keras.layers.GRU(
            enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Embed token ids `x` and run the GRU starting from `hidden`."""
        embedded = self.embedding(x)
        sequence_out, final_state = self.gru(embedded, initial_state=hidden)
        return sequence_out, final_state

    def initialize_hidden_state(self):
        """Return a zero tensor of shape (batch_size, enc_units)."""
        state_shape = (self.batch_size, self.enc_units)
        return tf.zeros(state_shape)

In [10]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, batch_size)

# Shape check: push the example batch through the encoder from a zero state.
sample_output, sample_hidden = encoder(
    example_input_batch,
    encoder.initialize_hidden_state(),
)

print(sample_output.shape, sample_hidden.shape)

(64, 24, 1024) (64, 1024)


In [11]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(query) + W2(values)))."""

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Return (context_vector, attention_weights) for `query` over `values`.

        The softmax runs over axis 1 of `values`, and the context vector is the
        weighted sum of `values` along that same axis.
        """
        # Insert an axis at position 1 so the query broadcasts against `values`.
        expanded_query = tf.expand_dims(query, 1)

        combined = self.W1(expanded_query) + self.W2(values)
        score = self.V(tf.nn.tanh(combined))

        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

In [12]:
# Shape check for the attention layer, using the encoder sample tensors.
attention_layer = BahdanauAttention(10)

attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print(*(tensor.shape for tensor in (attention_result, attention_weights)))

(64, 1024) (64, 24, 1)


In [13]:
# Decoder

class Decoder(tf.keras.Model):
    """Attention decoder: embedding -> concat with context -> GRU -> dense logits."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        self.gru = tf.keras.layers.GRU(
            dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

        # No activation: the layer emits raw scores over the vocabulary.
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(dec_units)

    def call(self, x, hidden, enc_output):
        """Run the decoder once on the token ids in `x`.

        Returns (vocabulary scores, new GRU state, attention weights).
        `hidden` is used only as the attention query; the GRU itself is called
        without an explicit initial state.
        """
        context_vector, attention_weights = self.attention(hidden, enc_output)

        embedded = self.embedding(x)
        # Prepend the context vector to the token embedding along the feature axis.
        context_step = tf.expand_dims(context_vector, 1)
        gru_input = tf.concat([context_step, embedded], axis=-1)

        output, state = self.gru(gru_input)

        # Merge the leading axes so the dense layer sees one row per step.
        flat_output = tf.reshape(output, (-1, output.shape[2]))
        logits = self.fc(flat_output)

        return logits, state, attention_weights

In [14]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, batch_size)

# Shape check: one decoder step on a random (batch_size, 1) input.
dummy_step_input = tf.random.uniform((batch_size, 1))
sample_decoder_output, _, _ = decoder(dummy_step_input, sample_hidden, sample_output)

print(sample_decoder_output.shape)

(64, 2350)


In [15]:
optimizer = tf.keras.optimizers.Adam()

# reduction='none' keeps one loss value per example so padding can be masked below.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')


def loss_function(real, pred):
    """Cross-entropy with padded targets (id 0) zeroed out.

    The mean is taken over every element of the masked loss, so positions
    whose target is padding still count in the denominator.
    """
    per_example = loss_object(real, pred)

    # 1 where the target is a real token, 0 where it is padding.
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_example.dtype)

    return tf.reduce_mean(per_example * keep)

In [16]:
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one teacher-forced optimisation step and return the batch loss.

    Gradients are taken of the summed per-step loss. The returned value is
    that sum divided by the full target length `targ.shape[1]`, although the
    loop runs for `targ.shape[1] - 1` steps.
    """
    loss = 0
    start_id = y_tokenizer.word_index['<s>']

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        # The decoder starts from the encoder's final state and a column of <s> ids.
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([start_id] * batch_size, 1)

        # Teacher forcing: predict position `step`, then feed the true token at `step`.
        for step in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, step], predictions)
            dec_input = tf.expand_dims(targ[:, step], 1)

    trainable = encoder.trainable_variables + decoder.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, trainable), trainable))

    return loss / int(targ.shape[1])

In [17]:
epochs = 40

for ep in range(1, epochs + 1):
    epoch_start_time = time()
    total_loss = 0

    # Zero encoder state, created once per epoch and passed to every batch.
    enc_hidden = encoder.initialize_hidden_state()

    epoch_batches = dataset.take(steps_per_epoch)
    for batch, (inp, targ) in enumerate(epoch_batches):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss = total_loss + batch_loss

    # Report the mean batch loss and the wall-clock time for this epoch.
    print(f'Epoch {ep}, loss: {total_loss/steps_per_epoch}, Epoch time: {time() - epoch_start_time:.2f}s')

Epoch 1, loss: 2.152242422103882, Epoch time: 92.29s
Epoch 2, loss: 1.8102943897247314, Epoch time: 71.38s
Epoch 3, loss: 1.6479251384735107, Epoch time: 72.42s
Epoch 4, loss: 1.5498902797698975, Epoch time: 71.32s
Epoch 5, loss: 1.4789741039276123, Epoch time: 70.63s
Epoch 6, loss: 1.4075080156326294, Epoch time: 70.72s
Epoch 7, loss: 1.3519386053085327, Epoch time: 70.27s
Epoch 8, loss: 1.298085331916809, Epoch time: 69.35s
Epoch 9, loss: 1.2458301782608032, Epoch time: 69.40s
Epoch 10, loss: 1.1887924671173096, Epoch time: 69.42s
Epoch 11, loss: 1.150111436843872, Epoch time: 70.74s
Epoch 12, loss: 1.1070246696472168, Epoch time: 69.79s
Epoch 13, loss: 1.0669739246368408, Epoch time: 69.61s
Epoch 14, loss: 1.0265415906906128, Epoch time: 69.67s
Epoch 15, loss: 0.9885061979293823, Epoch time: 70.05s
Epoch 16, loss: 0.9489414095878601, Epoch time: 70.14s
Epoch 17, loss: 0.9110711216926575, Epoch time: 71.52s
Epoch 18, loss: 0.874193012714386, Epoch time: 70.59s
Epoch 19, loss: 0.83376

In [21]:
def remove_tags(sentence):
    """Return the text between the last '<s>' and the next '<e>'.

    A missing marker is tolerated: without '<s>' the text is taken from the
    start, without '<e>' it runs to the end. Surrounding spaces are kept.
    """
    after_start = sentence.rpartition('<s>')[2]
    before_end = after_start.partition('<e>')[0]
    return before_end

def evaluate(sentence):
    """Greedy-decode an answer for `sentence` with the trained encoder/decoder.

    The sentence gets the same cleaning as the training columns (lower-case,
    punctuation split into tokens, `<s>` / `<e>` markers) and is encoded with
    `x_tokenizer`. Words the tokenizer never saw are skipped instead of
    raising `KeyError`, so arbitrary user input can be evaluated.

    Decoding stops at `<e>`, after `max_length_y` steps, or when the decoder
    emits an id with no vocabulary entry (such as the padding id 0).

    Returns:
        (answer, question): both passed through `remove_tags`.
    """
    # Same normalisation that was applied to the training text.
    sentence = re.sub(r"[^a-z?.!,:]+", " ", sentence.lower())
    sentence = re.sub(r"([?.!,:])", r" \1 ", sentence).strip()
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = '<s> ' + sentence + ' <e>'

    # texts_to_sequences drops empty tokens and out-of-vocabulary words, which
    # a direct word_index[...] lookup would turn into a KeyError.
    inputs = x_tokenizer.texts_to_sequences([sentence])
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=max_length_x, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    # Encode the single-sentence batch from a zero state.
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([y_tokenizer.word_index['<s>']], 0)

    for _step in range(max_length_y):
        predictions, dec_hidden, _attention = decoder(dec_input, dec_hidden, enc_out)

        predicted_id = tf.argmax(predictions[0]).numpy()
        predicted_word = y_tokenizer.index_word.get(predicted_id)

        if predicted_word is None:
            # The id has no word (e.g. padding id 0): end the answer here.
            break

        result += predicted_word + ' '

        if predicted_word == '<e>':
            break

        # Feed the prediction back in as the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)

    return remove_tags(result), remove_tags(sentence)

def ask(sentence):
    """Print the cleaned question and the model's predicted answer."""
    result, sentence = evaluate(sentence)

    for line in (f'Question: {sentence}', f'Predicted answer: {result}'):
        print(line)

In [22]:
# Sanity check with a greeting whose cleaned form matches the first corpus row shown by df.head().
ask('hi, how are you doing?')

Question:  hi , how are you doing ? 
Predicted answer: i m fine . how about yourself ? 


In [23]:
# Mixed-case input: evaluate() lower-cases the sentence before encoding it.
ask('Are you good ?')

Question:  are you good ? 
Predicted answer: i m average . 


In [32]:
# Question with no closing punctuation; the recorded output below shows the
# decoder repeating one phrase instead of emitting <e>.
ask('Are you in school')

Question:  are you in school 
Predicted answer: i hope so . i hope so . i hope so . i hope so . i hope so . i hope so . 


In [33]:
# NOTE(review): the prompt may be a typo for "What do you mean?"; it is kept
# as-is so it matches the recorded output below.
ask('Why do you mean?')

Question:  why do you mean ? 
Predicted answer: what do you mean , too . 
