<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# %cd /content/drive/My Drive/Colab Notebooks/accent/src

### Import packages

In [1]:
import re, zipfile, os, io, time, string, numpy as np, matplotlib.ticker as ticker, \
            matplotlib.pyplot as plt

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

# from utils import process_raw, generate_input, get_max_len, process_data, convert

from token_list import strip_tokens
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Bidirectional, Input, Embedding, Dense, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [2]:
# x = ['Khoan no cua KH se den han vao 01/08/2019. So tien  2.999.898 VND, TK 12345678912. Thong tin chi tiet, lien he 1900636633. Cam on',
# 'Khoan no cua KH se den han trong 2 ngay toi. So tien  2.999.898 VND, TK 12345678912. Bo qua neu da TT. Thong tin chi tiet, lien he 1900636633. Cam on',
# 'Khoan no cua KH da qua han 1 ngay. TK 12345678912, so tien2.999.898 VND. Bo qua neu da TT. Thong tin chi tiet, lien he 18006288',
# 'Khoan no cua KH da qua han 6 ngay va bi tinh phat 250,000. TK 12345678912, so tien 2.999.898VND. Vui long TT. Thong tin chi tiet, lien he 18006288',
# 'KH da qua han 5 ky no. TK 12345678912, so tien 12.999.898VND. Vui long TT ngay lap tuc. Thong tin chi tiet, lien he 18006288',
# 'Khoan no cua KH da bi tinh phat do lien tuc vi pham. TK12345678912, so tien 12.999.898VND. Vui long TT. Thong tin chi tiet, lien he 18006288',
# 'Khoan no cua KH da qua han 91 ngay. Vui long TT toan bo29.999.898VND vao TK 12345678912 hom nay. Thong tin chi tiet, lien he 18006288',
# 'Chung toi vua nhan thanh toan khoan vay cua KH tu TK12345678912. TUY NHIEN, KH van con thieu 29.999.898VND. Vui long TT toan ngay lap tuc. Cam on']

### Custom functions

In [3]:
def process_raw(raw_data):
    raw_data = [seq.lower().strip() for seq in raw_data]

    # Creating a space between a word and the punctuation following it
    # Eg: "he is a boy." => "he is a boy ."
    raw_data = [re.sub(r"([?.!,¿])", r" \1 ", seq) for seq in raw_data]
    raw_data = [re.sub(r'[" "]+', " ", seq) for seq in raw_data]

    # Replacing everything with space except (characters, ".", "?", "!", ",")
    filtered_punctuations = string.punctuation
    exclude = [',', '!', '.', '?']

    for c in filtered_punctuations:
        if c in exclude:
            filtered_punctuations = filtered_punctuations.replace(c, '')

    table = str.maketrans('', '', filtered_punctuations)
    raw_data = [seq.translate(table) for seq in raw_data]
    
    # Append start and end tokens to sequences
    processed_raw = []
    for seq in raw_data:
        words = seq.split()
        words = [word.strip() for word in words]
        processed_raw.append(' '.join(words))

    return processed_raw

In [4]:
def generate_input(processed_raw):
    output = ''
    for char in processed_raw:
        if char in strip_tokens:
            output += strip_tokens[char]
        else:
            output += char          
    return output

In [5]:
def get_max_len(input_data, get_index=False): 
    longest = [len(data.split()) for data in input_data]
    if get_index:
        print(longest.index(max(longest)))
    return max(longest)

In [6]:
def tokenize_pad_data(data):
    tk = Tokenizer(char_level=False, filters='')
    tk.fit_on_texts(data)
    data = tk.texts_to_sequences(data)
    return data, tk

In [7]:
def convert(tokenizer, tokenized_data, send_back=False):
    original = []
    
    print('Tokenized Data: {}'.format(tokenized_data))
    
    for token in tokenized_data:
        if token != 0:
            if token in tokenizer.index_word:
                original.append(tokenizer.index_word[token])
            else:
                original.append('<unk>')
                    
    print('Original Data: {}'.format(original))
    
    if send_back:
        return original

In [8]:
def evaluate(sentence):
    attention_plot = np.zeros((max_process_seq, max_process_seq))

#     sentence = preprocess_sentence(sentence)

    inputs = [input_tokenizer.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_process_seq, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([target_tokenizer.word_index['<s>']], 0)

    for t in range(max_process_seq):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += target_tokenizer.index_word[predicted_id] + ' '

        if target_tokenizer.index_word[predicted_id] == '<e>':
            return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)
      
    return result, sentence, attention_plot

In [9]:
# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 14}

    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [10]:
def restore(sentence):
    result, sentence, attention_plot = evaluate(sentence)

    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(result))

    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    plot_attention(attention_plot, sentence.split(' '), result.split(' '))

### Environment variables

In [11]:
# Define data arguements
# Set random seed
np.random.seed(50)
data_file = '../data/raw/raw_train.txt'

tf.__version__

'2.0.0-rc1'

### Load and process dataset

In [12]:
# Load dataset
counter = 0
max_seq_len = 20
no_seq = 12000
raw_data = []

# Load raw data and read first 100000 sequences with 40 or less words
with open(data_file, 'r', encoding='utf-8') as f:    
    while counter != no_seq:
        line = f.readline()
        if 5 <= len(line.split()) <= max_seq_len:
            raw_data.append(line)
            counter += 1
        else:
            continue

In [13]:
# Shuffle raw data prior to processing
np.random.shuffle(raw_data)

In [14]:
# Process data
processed_raw = process_raw(raw_data)

input_data = [generate_input(seq) for seq in processed_raw]
target_data = ['<s> ' + seq + ' <e>' for seq in processed_raw]

print('Sample Input: {}'.format(input_data[0]))
print('Sample Target: {}'.format(target_data[0]))

# Free up memory
del raw_data

Sample Input: nhiem ky cho cac dan bieu la hai nam .
Sample Target: <s> nhiệm kỳ cho các dân biểu là hai năm . <e>


In [15]:
# Get max sequence length after processing
max_inp_len = get_max_len(input_data, get_index=False)
max_tar_len = get_max_len(target_data, get_index=False)

In [16]:
# Generate input data word tokenizer indexes
input_tk = Tokenizer(char_level=False, filters='')
input_tk.fit_on_texts(input_data)

# Generate target data word tokenizer indexes
target_tk = Tokenizer(char_level=False, filters='')
target_tk.fit_on_texts(target_data)

inp_vocab_size = len(input_tk.word_index)+1
tar_vocab_size = len(target_tk.word_index)+1

# Reverse vocab lookup
input_tk_rev = {w:idx for idx, w in input_tk.word_index.items()}
target_tk_rev = {w:idx for idx, w in target_tk.word_index.items()}

In [17]:
print('Input vocab size: {}'.format(inp_vocab_size))
print('Target vocab size: {}'.format(tar_vocab_size))
print('Max input sequence length: {}'.format(max_inp_len))
print('Max target sequence length : {}'.format(max_tar_len))

Input vocab size: 5552
Target vocab size: 7823
Max input sequence length: 30
Max target sequence length : 32


In [18]:
x_train, x_test, y_train, y_test = train_test_split(np.array(input_data), np.array(target_data), test_size=0.1, shuffle=True)

In [19]:
# Check split
print(x_train[-1])
print(y_train[-1])

dia ban tinh bac lieu khi do bao gom ca tinh ca mau hien nay .
<s> địa bàn tỉnh bạc liêu khi đó bao gồm cả tỉnh cà mau hiện nay . <e>


In [20]:
x_train.shape, x_test.shape

((10800,), (1200,))

In [21]:
def generate_data(inp, tar):
    encoder_input_data = np.zeros((len(inp), max_inp_len), dtype='float32')
    decoder_input_data = np.zeros((len(inp), max_tar_len), dtype='float32')
    decoder_target_data = np.zeros((len(inp), max_tar_len, tar_vocab_size), dtype='float32')

    for i, (input_seq, target_seq) in enumerate(zip(inp, tar)):
        for t, word in enumerate(input_seq.split()):
            encoder_input_data[i, t] = input_tk.word_index[word]
        for t, word in enumerate(target_seq.split()):
            if t < len(target_seq.split())-1:
                decoder_input_data[i, t] = target_tk.word_index[word] # decoder input seq
            if t > 0:
                # decoder target sequence (one hot encoded)
                # does not include the START_ token
                # Offset by one timestep
                decoder_target_data[i, t - 1, target_tk.word_index[word]] = 1.
                
    return encoder_input_data, decoder_input_data, decoder_target_data

In [22]:
encoder_input_data, decoder_input_data, decoder_target_data = generate_data(x_train, y_train)

In [23]:
print('Input:')
print([input_tk_rev[int(w)] for w in encoder_input_data[50] if int(w) != 0])       
print('\n')

print('Decoder Input:')
print([target_tk_rev[int(w)] for w in decoder_input_data[50] if int(w) != 0])
print('\n')

print('Target')
print([target_tk_rev[np.argmax(w, axis=0)] for w in decoder_target_data[50] if np.argmax(w, axis=0) != 0])

Input:
['nhung', 'nguoi', 'thong', 'tri', 'da', 'dung', 'nhung', 'luan', 'diem', 'ton', 'giao', 'de', 'bien', 'minh', 'cho', 'su', 'ap', 'buc', '.']


Decoder Input:
['<s>', 'những', 'người', 'thống', 'trị', 'đã', 'dùng', 'những', 'luận', 'điểm', 'tôn', 'giáo', 'để', 'biện', 'minh', 'cho', 'sự', 'áp', 'bức', '.']


Target
['những', 'người', 'thống', 'trị', 'đã', 'dùng', 'những', 'luận', 'điểm', 'tôn', 'giáo', 'để', 'biện', 'minh', 'cho', 'sự', 'áp', 'bức', '.', '<e>']


In [24]:
test_encoder_input_data, test_decoder_input_data, test_decoder_target_data = generate_data(x_test, y_test)

In [25]:
print('Input:')
print([input_tk_rev[int(w)] for w in test_encoder_input_data[50] if int(w) != 0])       
print('\n')

print('Decoder Input:')
print([target_tk_rev[int(w)] for w in test_decoder_input_data[50] if int(w) != 0])
print('\n')

print('Target')
print([target_tk_rev[np.argmax(w, axis=0)] for w in test_decoder_target_data[50] if np.argmax(w, axis=0) != 0])

Input:
['tu', 'mot', 'giao', 'dich', 'bien', 'thanh', 'hai', 'giao', 'dich', '.']


Decoder Input:
['<s>', 'từ', 'một', 'giao', 'dịch', 'biến', 'thành', 'hai', 'giao', 'dịch', '.']


Target
['từ', 'một', 'giao', 'dịch', 'biến', 'thành', 'hai', 'giao', 'dịch', '.', '<e>']


In [26]:
print('Number of training sequences: {}'.format(len(encoder_input_data)))
print('Number of test sequences: {}'.format(len(test_encoder_input_data)))

Number of training sequences: 10800
Number of test sequences: 1200


In [27]:
print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_target_data.shape)

(10800, 30)
(10800, 32)
(10800, 32, 7823)


In [28]:
model_ckpt = ModelCheckpoint('./models/best.h5', save_best_only=True)
early_stop = EarlyStopping(patience=4, verbose=1)

In [29]:
epochs = 100
batch_size = 32
units = 256
embedding_dim = 256
train_samples = len(x_train)
test_samples = len(x_test)

In [30]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(inp_vocab_size, embedding_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [31]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(tar_vocab_size, embedding_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(tar_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [32]:
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile & run training
model.compile(optimizer=Adam(0.001), loss='categorical_crossentropy', metrics=['acc'])
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 256)    1421312     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    2002688     input_2[0][0]                    
____________________________________________________________________________________________

In [33]:
# Note that `decoder_target_data` needs to be one-hot encoded,
# rather than sequences of integers like `decoder_input_data`!
history = model.fit([encoder_input_data, decoder_input_data], decoder_target_data, 
          batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=[model_ckpt, early_stop])

Train on 8640 samples, validate on 2160 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 00025: early stopping


In [34]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(units,))
decoder_state_input_c = Input(shape=(units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

In [35]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_tk.word_index['<s>']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = target_tk_rev[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '<e>' or len(decoded_sentence) > max_tar_len):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [36]:
rand = np.random.randint(1, len(x_train))
print('Input:')
print([target_tk_rev[np.argmax(w, axis=0)] for w in decoder_target_data[rand] if np.argmax(w, axis=0) != 0])

decode_sequence(np.array([encoder_input_data[rand]]))

Input:
['giáp', 'có', 'các', 'đặc', 'điểm', 'sau', '<e>']


' giáp có các đặc điểm sau <e>'