# Key-Value Attention Mechanism Homework on Keras: Character-level Machine Translation (Many-to-Many, encoder-decoder)

In this homework, you will create an MT model with a key-value attention mechanism that converts the names of constituency MP candidates in the 2019 Thai general election from Thai script to Roman (Latin) script, e.g. นิยม --> niyom.

In [1]:
# !wget https://github.com/Phonbopit/sarabun-webfont/raw/master/fonts/thsarabunnew-webfont.ttf
# try:
#   # %tensorflow_version only exists in Colab.
#   %tensorflow_version 2.x
# except Exception:
#   pass

# import tensorflow as tf
# print(tf.__version__)

# import matplotlib as mpl
# from matplotlib import font_manager
# mpl.font_manager.fontManager.addfont('thsarabunnew-webfont.ttf') # 3.2+
# mpl.rc('font', family='TH Sarabun New')

In [None]:
%matplotlib inline
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from tensorflow.keras.layers import RepeatVector, Dense, Activation, Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Model
from tensorflow.keras.models import load_model
import tensorflow.keras.backend as K
import numpy as np

import random

## Load Dataset
We have generated a toy dataset from the names of constituency MP candidates in the 2019 Thai general election, taken from elect.in.th's GitHub repository (https://github.com/codeforthailand/dataset-election-62-candidates), and used the tltk library (https://pypi.org/project/tltk/) to convert them into Roman script.

<img src="https://raw.githubusercontent.com/ekapolc/nlp_2019/master/HW8/images/dataset_diagram.png" alt="Drawing" style="width: 500px;"/>


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# import shutil
# shutil.copy("/content/drive/MyDrive/FRA 501 IntroNLP&DL/Dataset/mp_name_th_en.csv", "/content/mp_name_th_en.csv")

In [None]:
import csv
# with open('Dataset/mp_name_th_en.csv') as csvfile:
# Read the name pairs: column 0 goes to name_th and column 1 to name_en.
# newline='' is what the csv module asks for so that it can deal with line
# endings (including newlines inside quoted fields) itself.
with open('Dataset/mp_name_th_en.csv', 'r', encoding='utf-8', newline='') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    name_th = []
    name_en = []
    for row in readCSV:
        if not row:
            # csv.reader yields [] for a blank line; skip it instead of
            # failing with IndexError on row[0].
            continue
        name_th.append(row[0])
        name_en.append(row[1])

In [None]:
# Preview the first ten (Thai, romanised) pairs side by side.
for pair in zip(name_th[:10], name_en[:10]):
    print(*pair)

## Task1: Preprocess dataset for Keras
* 2 dictionaries for indexing (1 for input and another for output) OK!
* DON'T FORGET TO INCLUDE special token for padding OK!
* DON'T FORGET TO INCLUDE special token for the end of word symbol (output) OK!
* Be mindful of your pad_sequences "padding" hyperparameter. Choose wisely (post-padding vs pre-padding) OK!

In [None]:
#FILL YOUR CODE HERE
# Both lists should have the same number of entries.
print(f"th : {len(name_th):d} | en : {len(name_en):d}")

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from tensorflow.keras.layers import RepeatVector, Dense, Activation, Lambda, Reshape, SimpleRNN
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Model
import tensorflow.keras.backend as K
import numpy as np

In [None]:
# Small demo of the two pad_sequences modes on ragged input:
# b uses padding='pre', a uses padding='post'.
sequence = [[1], [2, 3], [4, 5, 6]]
b, a = (pad_sequences(sequence, padding=mode) for mode in ('pre', 'post'))
print(b)
print(a)


In [None]:
# Show the first Thai name that was loaded from the CSV.
print(name_th[0])

In [None]:
# Distinct characters seen on the source (Thai) and target (Latin) side.
input_chars = list(set(''.join(name_th)))
output_chars = list(set(''.join(name_en)))
# +1 for the padding token, +1 for the end-of-word token
data_size, vocab_size = len(name_th), len(input_chars)+2
output_vocab_size = len(output_chars)+2

print('There are %d lines and %d unique characters in your input data.' % (data_size, vocab_size))
# The second report is about the output side; it previously repeated the
# input numbers verbatim.
print('There are %d lines and %d unique characters in your output data.' % (len(name_en), output_vocab_size))
maxlen_th = len(max(name_th, key=len))  # max input length (without </S>)
print("Max input length:", maxlen_th)
maxlen_en = len(max(name_en, key=len))  # max output length (without </S>)
print("Max output length:", maxlen_en)

In [None]:
# Index 0 is reserved for padding and index 1 for the end-of-word marker;
# the real characters follow in sorted order.
sorted_chars = ["<PAD>", "</S>"] + sorted(input_chars)
sorted_output_chars = ["<PAD>", "</S>"] + sorted(output_chars)

# Input vocabulary: index -> character, and the inverse lookup.
ix_to_char = dict(enumerate(sorted_chars))
char_to_ix = {ch: ix for ix, ch in ix_to_char.items()}
# Output vocabulary: index -> character, and the inverse lookup.
ix_to_output_char = dict(enumerate(sorted_output_chars))
output_char_to_ix = {ch: ix for ix, ch in ix_to_output_char.items()}

print(ix_to_char)
print(ix_to_output_char)
print("len(input) : %d | len(output) %d" % (len(ix_to_char),len(ix_to_output_char)))

In [None]:
# Number of training samples. Derived from the loaded data instead of the
# hard-coded 10887 so the initial-state arrays always match X and Y.
m = data_size
Tx = maxlen_th      # encoder input length
Ty = maxlen_en+1    # decoder output length: longest romanised name plus </S>
print(Ty)

In [None]:
# Encode every Thai name as vocabulary indices terminated by </S>.
X = []
for line in name_th:
    temp = [char_to_ix[char] for char in line]
    temp.append(char_to_ix["</S>"])
    X.append(temp)
# Same for the romanised names, using the *output* vocabulary throughout
# (the end marker was previously looked up in the input vocabulary, which
# only worked because both vocabularies put </S> at the same index).
Y = []
for line in name_en:
    temp = [output_char_to_ix[char] for char in line]
    temp.append(output_char_to_ix["</S>"])
    Y.append(temp)

print(len(Y),output_vocab_size)
# Show one encoded pair instead of dumping the whole dataset.
print("Example X :"+str(X[:1]))
print("Example Y :"+str(Y[:1]))

# NOTE(review): maxlen_th does not count the appended </S>, so with
# pad_sequences' default truncating='pre' the longest Thai names appear to
# lose their first character here. Fixing it means growing Tx as well, so it
# is left as-is and flagged for confirmation.
X = pad_sequences(X,maxlen=maxlen_th,padding = 'pre')
Y = pad_sequences(Y,maxlen=Ty,padding = 'pre')

# One-hot encode: (samples, timesteps) -> (samples, timesteps, vocabulary).
X = to_categorical(X,vocab_size)
X = X.reshape(data_size,maxlen_th ,vocab_size)

Y = to_categorical(Y,output_vocab_size)
Y = Y.reshape(data_size,Ty ,output_vocab_size)
print("X : %s | Y : %s" % (str(X.shape),str(Y.shape)))

In [None]:
# Per-sample shapes of the first two encoded input/target pairs.
for sample_ix in (0, 1):
    print(X[sample_ix].shape)
    print(Y[sample_ix].shape)

# Attention Mechanism
## Task 2: Code your own (key-value) attention mechanism
* PLEASE READ: you DO NOT have to follow all the details in (Daniluk, et al. 2017). You just need to create a key-value attention mechanism where the "key" part of the mechanism is used for attention score calculation, and the "value" part of the mechanism is used to encode information to create a context vector.  
* Define global variables
* fill code for one_step_attention function
* Hint: use keras.layers.Lambda 
* Hint: you will probably need more hidden dimensions than what you've seen in the demo


In [None]:
from tensorflow.keras.activations import softmax
from tensorflow.keras.layers import Lambda
from tensorflow import split
def softMaxAxis1(x):
    """Return the softmax of ``x`` computed along axis 1.

    Passed to ``Activation`` to build the attention-score layer, so the
    scores are normalised over axis 1 of the energies tensor.
    """
    return softmax(x,axis=1)


In [None]:
# Global, shared layers: each is created once here and re-used at every
# decoding step by one_step_attention, so all steps share the same weights.
## Fill your code here
## you are allowed to use code in the demo as your template.

# Repeats the decoder state Tx times so it can be paired with every key.
repeator = RepeatVector(Tx)
# Joins key and repeated decoder state along the last (feature) axis.
concatenator = Concatenate(axis=-1)

# Key-values (Hint)
# Splits the encoder output into two equal halves along axis 2.
splitter = Lambda(lambda x:split(x, num_or_size_splits=2,axis=2)) # yields the key and the value

# Attention function: a two-layer scorer, tanh hidden layer then one energy
# per position.
fattn_1 = Dense(128, activation = "tanh")
# NOTE(review): relu clamps negative energies to 0 before the softmax —
# confirm this is intended rather than a linear output.
fattn_2 = Dense(1, activation = "relu")

# Softmax over axis 1 turns the energies into attention scores.
###
activator = Activation(softMaxAxis1, name='attention_scores') 
# Contracts axis 1 of the scores against axis 1 of the values to form the
# context vector.
dotor = Dot(axes = 1)


In [None]:
def one_step_attention(hidden, s_prev):
    """Run one key-value attention step over the encoder outputs.

    The encoder output is split in half along axis 2: the first half is the
    "key" used to score each position, the second half is the "value" that
    gets averaged into the context vector.

    Args:
        hidden: encoder output tensor (presumably (batch, Tx, 2 * n_h) —
            confirm against the encoder definition).
        s_prev: previous decoder state; it is repeated Tx times and paired
            with every key.

    Returns:
        A ``(context, attention_scores, energies)`` tuple: the weighted sum
        of the values, the softmax-normalised scores, and the raw energies
        the scores were computed from.
    """
    keys, values = splitter(hidden)

    # Pair the decoder state with the key of every input position.
    tiled_state = repeator(s_prev)
    scorer_input = concatenator([keys, tiled_state])

    # Two-layer scorer -> one energy per position -> softmax over positions.
    energies = fattn_2(fattn_1(scorer_input))
    attention_scores = activator(energies)

    # Context vector: attention-weighted combination of the values.
    context = dotor([attention_scores, values])

    return context, attention_scores, energies

## Task3: Create and train your encoder/decoder model here
* HINT: you will probably need more hidden dimensions than what you've seen in the demo

In [None]:
n_h = 64   # hidden units per direction of the encoder LSTM
n_s = 128  # hidden units of the decoder LSTM
# Bidirectional encoder that returns its output at every input position.
# The previous input_shape=(-1, Tx, n_h*2) argument was dropped: it described
# the encoder's *output*, included a batch dimension, and is not needed
# because the layer takes its input shape from the Input tensor it is
# called on.
encoder_LSTM = Bidirectional(LSTM(n_h, return_sequences=True))
# Decoder LSTM layer; return_state=True makes each call also return the
# hidden and cell states so they can be threaded through the decoding steps.
decoder_LSTM_cell = LSTM(n_s, return_state = True)
# Softmax over the output vocabulary, applied at every decoding step.
output_layer = Dense(output_vocab_size, activation="softmax")

In [None]:
#FILL CODE HERE :Hint --> heatmap in CNN + GradCAM

def model(Tx, Ty, n_h, n_s, vocab_size, machine_vocab_size):
    """Build the training graph: encoder plus Ty unrolled attention steps.

    Uses the shared module-level layers (``encoder_LSTM``,
    ``decoder_LSTM_cell``, ``output_layer``) and ``one_step_attention``.

    Args:
        Tx: number of input timesteps.
        Ty: number of decoding steps (one model output per step).
        n_h: not used inside this function.
        n_s: size of the decoder hidden and cell state inputs.
        vocab_size: size of the one-hot input vectors.
        machine_vocab_size: not used inside this function.

    Returns:
        A Keras ``Model`` taking ``[X, s0, c0]`` and producing a list of
        ``Ty`` output tensors, one per decoding step.
    """
    # Model inputs: the one-hot sequence and the decoder's initial states.
    X = Input(shape=(Tx, vocab_size))
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')
    s, c = s0, c0

    # Encode the whole input once; every decoding step attends over it.
    h = encoder_LSTM(X)

    outputs = []
    for _ in range(Ty):
        # Context vector for this step from the current decoder state.
        context, _, _ = one_step_attention(h, s)

        # Advance the decoder by one step, carrying its states forward.
        s, _, c = decoder_LSTM_cell(context, initial_state=[s, c])

        # Character distribution for this step.
        outputs.append(output_layer(s))

    return Model(inputs=[X, s0, c0], outputs=outputs)



def inference_encoder(Tx, Ty, n_h, n_s, vocab_size, machine_vocab_size):
    """Wrap the shared encoder in a stand-alone model for inference.

    Only ``Tx`` and ``vocab_size`` are used; the other parameters are
    accepted so the signature matches the other builders.

    Returns:
        A Keras ``Model`` mapping a one-hot input of shape
        ``(Tx, vocab_size)`` per sample to the output of ``encoder_LSTM``.
    """
    one_hot_in = Input(shape=(Tx, vocab_size))
    encoded = encoder_LSTM(one_hot_in)
    return Model(inputs=[one_hot_in], outputs=encoded)

def inference_decoder(Tx, Ty, n_h, n_s, vocab_size, machine_vocab_size):
    """Build a single-step decoder model for step-by-step inference.

    The model performs exactly one attention + decoder step using the shared
    layers, and exposes the attention scores and energies as extra outputs.
    ``Ty``, ``vocab_size`` and ``machine_vocab_size`` are not used.

    Returns:
        A Keras ``Model`` with inputs ``[h, s0, c0]`` (encoder output and the
        current decoder states) and outputs
        ``[out, s, c, attention_scores, energies]``.
    """
    state_in = Input(shape=(n_s,), name='s0')
    cell_in = Input(shape=(n_s,), name='c0')
    encoded = Input(shape=(Tx, n_h*2), name='h')

    # One attention step against the encoder output.
    context, attention_scores, energies = one_step_attention(encoded, state_in)

    # One decoder step; the new states are returned so the caller can feed
    # them back in on the next call.
    state_out, _, cell_out = decoder_LSTM_cell(
        context, initial_state=[state_in, cell_in]
    )
    char_probs = output_layer(state_out)

    return Model(
        inputs=[encoded, state_in, cell_in],
        outputs=[char_probs, state_out, cell_out, attention_scores, energies],
    )




In [None]:
# FIT YOUR MODEL HERE
# NOTE: this rebinds the name `model` from the builder function to the Model
# instance it returns, so re-running this cell requires re-running the cell
# that defines model() first.
model = model(Tx, Ty, n_h, n_s, vocab_size, output_vocab_size)

In [None]:
# model.summary()

In [None]:
# Zero initial decoder hidden/cell states, one row per training sample.
s0, c0 = np.zeros((m, n_s)), np.zeros((m, n_s))
# The model has one output per decoding step, so hand Keras one
# (samples, vocabulary) slice of Y for every step.
outputs = [Y[:, step] for step in range(Y.shape[1])]
opt = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=opt)
model.fit([X, s0, c0], outputs, batch_size=128, epochs=15)


# Thai-Script to Roman-Script Translation
* Task 4: Test your model on 5 examples of your choice including your name! 
* Task 5: Show your visualization of attention scores on one of your examples 

In [None]:
#task 4
#fill your code here
def prep_input(input_list, append_eos=False):
    """One-hot encode a list of Thai names for the encoder.

    Args:
        input_list: sequence of Thai strings.
        append_eos: when True, append the ``"</S>"`` index to every name
            before padding, which is how the training inputs in this
            notebook were encoded. Defaults to False so existing callers
            (and the x-axis labels built in the plotting code) keep their
            current alignment.

    Returns:
        Array of shape ``(len(input_list), maxlen_th, vocab_size)``, padded
        with pad_sequences' default mode.

    Raises:
        KeyError: if a name contains a character missing from
            ``char_to_ix``; the message names the character and the word.
    """
    encoded = []
    for line in input_list:
        try:
            indices = [char_to_ix[char] for char in line]
        except KeyError as err:
            # Same exception type as before, but say which character of
            # which word is outside the training vocabulary.
            raise KeyError(
                f"character {err.args[0]!r} in {line!r} is not in the input vocabulary"
            ) from None
        if append_eos:
            indices.append(char_to_ix["</S>"])
        encoded.append(indices)

    X = pad_sequences(encoded, maxlen=maxlen_th)
    X = to_categorical(X, vocab_size)
    X = X.reshape(len(input_list), maxlen_th, vocab_size)

    return X

EXAMPLES = ['สมชาย','สมศักดิ์','อดัม','ก้องภพ','ธนัท','ใจดี']
# Keep the raw strings; EXAMPLES is re-bound to the one-hot tensor below.
Sample = EXAMPLES
# Zero initial decoder states, one row per example.
s0, c0 = (np.zeros((len(Sample), n_s)) for _ in range(2))
EXAMPLES = prep_input(Sample)

# predict() returns one array per decoding step; put the example axis first
# and take the most likely character index at every step.
prediction = model.predict([EXAMPLES, s0, c0])
prediction = np.argmax(np.swapaxes(prediction, 0, 1), axis=-1)

decoded = ["".join(ix_to_output_char[int(i)] for i in row) for row in prediction]

# Raw decodings first, special tokens included.
for output in decoded:
    print(output)

# Then each source name next to its decoding with special tokens stripped.
for name, output in zip(Sample, decoded):
    output = output.replace('<PAD>', '').replace('</S>', '')
    print(name, output)


### Plot the attention map
* If you need to install thai font: sudo apt install xfonts-thai
* this is what your visualization might look like:
--> https://drive.google.com/file/d/168J5SPSf4NNKj718wWUEDpUbh8QYZKux/view?usp=share_link

In [None]:
# hit attention_scores in one step attention

In [None]:
EXAMPLES = ['สมชาย','สมศักดิ์','อดัม','ก้องภพ','ธนัท','ใจดี']
# Keep the raw strings; EXAMPLES is re-bound to the one-hot tensor below.
temp = EXAMPLES
inferEncoder_model = inference_encoder(Tx, Ty, n_h, n_s, vocab_size, output_vocab_size)
inferDecoder_model = inference_decoder(Tx, Ty, n_h, n_s, vocab_size, output_vocab_size)

# Zero initial decoder states, one row per example.
s0 = np.zeros((len(temp), n_s))
c0 = np.zeros((len(temp), n_s))
s, c = s0, c0

EXAMPLES = prep_input(temp)
h = inferEncoder_model.predict(EXAMPLES, verbose=0)

# Decode for a fixed 20 steps (this rebinds the global Ty).
Ty = 20
score, attention_list = [], []

for _ in range(Ty):
    # One decoder step; feed the returned states back in on the next step.
    out, s, c, attention_scores, energies = inferDecoder_model.predict([h, s, c], verbose=0)
    score.append(out)
    print(attention_scores.shape)
    print("attention_scores : ", attention_scores)
    attention_list.append(attention_scores)

# Example axis first, then the most likely character index per step.
prediction = np.argmax(np.swapaxes(score, 0, 1), axis=-1)

attention_list = np.array(attention_list)

# output_list entries are [[source word], list of predicted characters].
output_list = []
for name, row in zip(temp, prediction):
    a = [ix_to_output_char[int(i)] for i in row]
    output_list.append([[name], a])
    print(name, "".join(a))




In [None]:
#task 5
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
# Select the font family matplotlib uses for text in the plots below.
plt.rcParams['font.family']='TH Sarabun New'  #you can change to other font that works for you
#fill your code here

In [None]:

print("(number , Ty)",prediction.shape)
# Swap the first two axes so the array is indexed by example first.
# NOTE: this re-binds attention_list, so running the cell twice swaps the
# axes back again.
attention_list = attention_list.swapaxes(0, 1)
print(attention_list)
print("(number Tx , Ty )",attention_list.shape)

In [None]:
def get_word_indices(word_list, pad_token='<PAD>', end_token='<END>'):
    """Locate the first non-padding token and the end-of-word token.

    Args:
        word_list: sequence of token strings.
        pad_token: token treated as padding and skipped.
        end_token: token that marks the end of the word. The default stays
            ``'<END>'`` for backward compatibility; note that the
            vocabulary built in this notebook marks the end of a word with
            ``'</S>'``, so pass ``end_token='</S>'`` for decoded model
            output.

    Returns:
        ``(start_index, end_index)``: the index of the first token that is
        not ``pad_token`` and the index of the first ``end_token`` at or
        after it. Either is ``None`` when no such token exists.
    """
    start_index = None
    end_index = None

    for i, token in enumerate(word_list):
        if token == pad_token:
            continue
        if start_index is None:
            start_index = i
        if token == end_token:
            end_index = i
            break

    return start_index, end_index

In [None]:
import tensorflow as tf
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

import matplotlib
import matplotlib.font_manager as fm

# Register the Thai font file with matplotlib and make it the default family.
# The path is relative, so the .ttf must sit in the working directory (the
# download command at the top of the notebook is commented out).
fm.fontManager.addfont('thsarabunnew-webfont.ttf') # 3.2+
matplotlib.rc('font', family='TH Sarabun New')

In [None]:
def plotheat(output_list,atten):
    """Draw one attention heat-map per example.

    Each heat-map has one row per predicted character (from the first
    non-padding character up to, but not including, the end-of-word marker)
    and one column per input position.

    Args:
        output_list: list of ``[[source_word], predicted_chars]`` entries,
            where ``predicted_chars`` is the list of decoded tokens.
        atten: array indexed by example; ``atten[i]`` holds the attention
            scores of example ``i`` with the decoding step on its first
            axis (presumably (steps, Tx, 1) — confirm against the caller).
    """
    if not output_list:
        return

    # One subplot per example instead of a fixed six. squeeze=False keeps
    # `axes` two-dimensional even for a single example. The height is scaled
    # so that six examples keep the original 10x50 figure.
    n_plots = len(output_list)
    fig, axes = plt.subplots(nrows=n_plots, figsize=(10, 50 * n_plots / 6), squeeze=False)

    for index, (word, chars) in enumerate(output_list):
        ax = axes[index][0]
        source = word[0]
        # The inputs are left-padded, so the x labels are padding followed by
        # the source characters.
        word_padded = ['<PAD>'] * (Tx - len(source)) + list(source)

        start, end = get_word_indices(chars)
        first = 0 if start is None else start
        if end is None:
            # get_word_indices looks for '<END>' by default, but the
            # vocabulary in this notebook ends words with '</S>'. Find the
            # marker here so the trailing rows are actually trimmed.
            end = next(
                (k for k in range(first, len(chars)) if chars[k] in ('</S>', '<END>')),
                None,
            )

        # Flatten whatever trailing axes the scores carry into
        # (steps, input positions) instead of assuming a 20x20 grid.
        scores = np.asarray(atten[index])
        a = scores.reshape(scores.shape[0], -1)[first:end, :]
        # Take the labels from the same slice as the rows so they can never
        # disagree in number.
        out = list(chars[first:end])
        print(a.shape)
        print(out)
        print(a)

        ax.set_title(source)
        if a.shape[0] == 0:
            # Nothing was decoded before the end marker; leave the axes empty.
            continue
        sns.heatmap(a,xticklabels=word_padded, yticklabels=out, vmin=0, vmax=1 , ax=ax)
    plt.show()

# Plot the attention maps for the examples decoded step by step earlier.
plotheat(output_list,attention_list)
# print(output_list[0][1])
# start,end = get_word_indices(output_list[0][1])
# print(start,end)