In [1]:
import pandas as pd
import numpy as np
import warnings
import re
import pickle
import tensorflow as tf
from hangul_utils import split_syllables, join_jamos
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Masking
import time

2022-12-12 16:33:52.717472: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-12 16:33:52.761826: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# Without this, we might get some unexpected errors; still not working
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

2022-12-12 16:33:53.456337: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-12 16:33:53.475438: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-12 16:33:53.475577: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [3]:
# Decoding code:
def one_hot_decode(mat, lang = 'en'):
    char_list = [] # To store the characters, later to be used in joining
    if lang == 'en': # if the language is english
        for row in mat:
            if np.max(row) == 0: # If the row is an empty one
                continue
            else: # If our row is not an empty row
                code = np.argmax(row) # Since 1 is the largest number in our one hot encoded, we use np.argmax()
                if code == 26: # If it's a hyphen
                    char_list.append(chr(45))
                else:
                    char_list.append(chr(code+65))
        string = "".join(char_list)
    elif lang == 'ko':
        for row in mat:
            if np.max(row) == 0:
                continue
            else:
                code = np.argmax(row)
                if code == 51:
                    char_list.append(chr(9))
                elif code == 52:
                    char_list.append(chr(10))
                else:
                    char_list.append(chr(code+12593))
        string = join_jamos("".join(char_list))
    return string

In [4]:
def decode_matrix(inputTensor, lang = 'en'):
    str_list = []
    inputTensor = np.reshape(inputTensor, (inputTensor.shape[0], inputTensor.shape[1], inputTensor.shape[2]))
    for tensor in inputTensor: # for each matrix
        string = one_hot_decode(tensor, lang)
        str_list.append(string)
    return str_list

# Managing Input and Output: `PreProcessing`

In [5]:
# read in data:
with open("LSTM_encoder_in.pkl", 'rb') as f:
    encoder_in = pickle.load(f)

2022-12-12 16:33:53.625251: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-12 16:33:53.627460: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-12 16:33:53.627613: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-12 16:33:53.627671: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node,

In [6]:
with open("LSTM_decoder_in.pkl", 'rb') as f:
    decoder_in = pickle.load(f)

In [7]:
with open("LSTM_decoder_targ.pkl", 'rb') as f:
    decoder_targ = pickle.load(f)

In [8]:
encoder_in.shape

TensorShape([86191, 36, 27])

### The inputs letters are reversed according to the paper that suggested reversing the order allows LSTM to "establish connection".
### https://proceedings.neurips.cc/paper/2014/file/a14ac55a4f27472c5d894ec1c3c743d2-Paper.pdf See Page 3.

In [9]:
one_hot_decode(encoder_in[0])

'ODHMANALLOEJ'

In [10]:
print(f"{one_hot_decode(encoder_in[0], lang='en')[::-1]} : {one_hot_decode(decoder_in[0], lang='ko')}")

JEOLLANAMHDO : 	전라남도



In [11]:
print(f"{one_hot_decode(encoder_in[10], lang='en')[::-1]} : {one_hot_decode(decoder_in[10], lang='ko')}")

BAEGUNHGIL : 	백운길



In [12]:
len(encoder_in) == len(decoder_in) == len(decoder_targ)

True

# Model


In [13]:
# https://machinelearningmastery.com/develop-encoder-decoder-model-sequence-sequence-prediction-keras/
# https://github.com/rstudio/keras/blob/main/vignettes/examples/lstm_seq2seq.py

latent_dim = 128

# Define an input sequence and process it.
# encoder_inputs = Input(shape=(36,27))
encoder_inputs = Input(shape=(None, 27))
mask = Masking(mask_value=0.0)
masked_inputs = mask(encoder_inputs)
encoder1_1 = LSTM(latent_dim, return_sequences = True, return_state=True, name = "enc_1")
encoder1_2 = LSTM(latent_dim, return_state = True, name = "enc_2")
encoder_outputs1_1, state_h1, state_c1 = encoder1_1(masked_inputs)
encoder_outputs, state_h2, state_c2 = encoder1_2(encoder_outputs1_1)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h1, state_c1, state_h2, state_c2]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, 53))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the 
# return states in the training model, but we will use them in inference.
decoder_lstm1_1 = LSTM(latent_dim, return_sequences=True, return_state=True, name = "dec_1")
decoder_lstm1_2 = LSTM(latent_dim, return_sequences=True, return_state=True, name = "dec_2")
decoder_outputs1_1, _, _ = decoder_lstm1_1(decoder_inputs, initial_state=encoder_states[-2:]) # Here I'm using a -> b = Latent Space = b -> a. Unpack/Decode using the 2nd layer's initial states and then the 1st.
decoder_outputs, _, _ = decoder_lstm1_2(decoder_outputs1_1, initial_state = encoder_states[:2])
decoder_dense = Dense(53, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [14]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 27)]   0           []                               
                                                                                                  
 masking (Masking)              (None, None, 27)     0           ['input_1[0][0]']                
                                                                                                  
 enc_1 (LSTM)                   [(None, None, 128),  79872       ['masking[0][0]']                
                                 (None, 128),                                                     
                                 (None, 128)]                                                     
                                                                                              

In [15]:
# parameters
epochs = 50
batch_size = 32

In [16]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [17]:
import datetime
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [18]:
# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
st_time = time.time()
model.fit([encoder_in, decoder_in], decoder_targ, batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=[tensorboard_callback])
print("Time taken in seconds: %g" %(time.time()-st_time))

Epoch 1/50


2022-12-12 16:34:01.684017: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8401
2022-12-12 16:34:01.758836: W tensorflow/core/common_runtime/forward_type_inference.cc:332] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_INT32
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_FLOAT
    }
  }
}

	while inferring type of node 'cond_40/output/_23'
2022-12-12 16:34:01.901550: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50

KeyboardInterrupt: 

In [1]:
# %tensorboard --logdir logs/fit

In [19]:
# Encoder:
encoder_model = Model(encoder_inputs, encoder_states)

In [20]:
# Decoder:
decoder_state_input_h1 = Input(shape=(latent_dim,)) # For the first LSTM layer of enc
decoder_state_input_c1 = Input(shape=(latent_dim,))
decoder_state_input_h2 = Input(shape=(latent_dim,)) # For the second LSTM layer of enc
decoder_state_input_c2 = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h1, decoder_state_input_c1, decoder_state_input_h2, decoder_state_input_c2]


In [21]:
decoder_outputs1_1, d_state_h1, d_state_c1 = decoder_lstm1_1(decoder_inputs, initial_state=decoder_states_inputs[-2:], training = False) # h2 c2
decoder_outputs, d_state_h2, d_state_c2 = decoder_lstm1_2(decoder_outputs1_1, initial_state = decoder_states_inputs[:2], training = False) # h1 c1
decoder_states = [d_state_h1, d_state_c1, d_state_h2, d_state_c2]


In [22]:
decoder_outputs = decoder_dense(decoder_outputs, training=False)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,[decoder_outputs] + decoder_states)

In [23]:
encoder_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, 27)]        0         
                                                                 
 masking (Masking)           (None, None, 27)          0         
                                                                 
 enc_1 (LSTM)                [(None, None, 128),       79872     
                              (None, 128),                       
                              (None, 128)]                       
                                                                 
 enc_2 (LSTM)                [(None, 128),             131584    
                              (None, 128),                       
                              (None, 128)]                       
                                                                 
Total params: 211,456
Trainable params: 211,456
Non-trainab

In [24]:
decoder_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, None, 53)]   0           []                               
                                                                                                  
 input_5 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 dec_1 (LSTM)                   [(None, None, 128),  93184       ['input_2[0][0]',                
                                 (None, 128),                     'input_5[0][0]',          

In [25]:
encoder_model.save("./models/char-rnn-encoder.h5")



In [26]:
decoder_model.save("./models/char-rnn-decoder.h5")



In [27]:
# One hot encoding to transform the input characters to tensor:
def one_hot_encode(string, lang = "en"):
    if lang == "en": # USE the range from A->Z 65 -> 90. UPDATE: MAKE THEM ALL UPPERCASE and then use fewer columns. 26 + 1 (hyphen) columns in total.
        arr = np.zeros(shape = (len(string)+1, 27)) # ?? why?
        string = string[::-1] # Reversing is better to establish 'communication' between encoder and decoder model // https://arxiv.org/pdf/1409.3215.pdf
        for idx, char in enumerate(string): # for each character in the string 
            if not(65 <= ord(char) <= 90) and not (97 <= ord(char) <= 122) and not(ord(char) == 45): # if it's not alphabetical or hyphen
                warnings.warn("Preprocess your input. The string contains non-alphabetical character.")
                print(f"The character is {char}")
                raise Exception
            else: # If the letter is english
                try:
                    char = char.upper() # Make it upper case
                    arr[idx][ord(char)-65] = 1
                except:
                    arr[idx][26] = 1 # If it's a hyphen then let column 26 be 1
                
        return np.expand_dims(arr, axis=0)
    elif lang == "ko": # ord of korean starts from 12593 -> 12643 = 12643 - 12593 + 1 = 51 + 2(tab(start), newline(end)) https://github.com/rstudio/keras/blob/main/vignettes/examples/lstm_seq2seq.py
        # Since Korean is the target seq, the one we are trying to predict, use '\t' as start and '\n' as end of sequence
        jamos = split_syllables(string) # Get jamos first
        jamos = '\t' + jamos + '\n'
        arr = np.zeros(shape = (len(jamos)+1, 53))
        for idx, char in enumerate(jamos):
            if ord(char) == 9: # if tab(start of sequence)
                arr[idx][51] = 1
            elif ord(char) == 10: # if newline(end of sequence)
                arr[idx][52] = 1    
            elif 12593 <= ord(char) <= 12643:
                arr[idx][ord(char)-12593] = 1
            else:
                warnings.warn("Preprocess your input. The string contains non-korean character.")
                print(f"The character is {char}")
                raise Exception
                
        return arr

In [28]:
# Masking takes place so no need to do this 
def one_hot_tensor_matrix(inputdata, lang = 'en'):
    # Languages:
    if lang == "en":
        ncol = 27 # Depends on what function you use. 
        maxrow = 36
    elif lang == "ko":
        ncol = 53
        maxrow = 28 + 2 # Since we added '\t' and '\n'
    else:
        warnings.warn("Check your language please. There are only Korean and English.")
        raise Exception
    initial_mat = np.zeros(shape=(len(inputdata), maxrow, ncol)) # The shape is of (batch size, nrow, ncol)
    for idx, string in enumerate(inputdata):
        temparr = one_hot_encode(string, maxrow, lang)
        initial_mat[idx] = temparr # Update initial matrix
    
    # Change the shape of the matrix:
    final_mat = np.reshape(initial_mat, newshape = (len(inputdata), maxrow, ncol))
    return tf.convert_to_tensor(final_mat)

In [29]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, 53))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, 51] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h1, c1, h2, c2 = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_char = one_hot_decode(output_tokens, lang = "ko")
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > 30): # 30 is the length of the longest string in korean in training dataset
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, 53))
        target_seq[0, 0, np.argmax(output_tokens)] = 1.

        # Update states
        states_value = [h2, c2, h1, c1] # It's confusing here. We reverse the input states. That's because initially the initial states for the decoders were reversed [:2] [-2:].
        # Layer 1 of LSTM_decod takes h2 c2 from encoder 2, and Layer 2 of LSTM_decod takes h1 and c1. After training, in the inference setup, the decoder model takes input from h1,c1,h2,c2
        # and then reverses the order. After the initial states, in this code, we know that h1 and c1 should be inserted back to LSTM_decod layer 1 because that's where it came from. It needs to use
        # its own previous hidden state so that it knows how to move on. By setting the initial state from h(t-1) and c(t-1) the model is able to predict correct just as it was learned in training mode.
        
        ## The weights are the same but the values of the hidden states and cells change Wh Wc remain the same the h(t-1) and c(t-1) change which helps the decoder know how to move on.
    return join_jamos(decoded_sentence)

In [30]:
testing = ["naneun", "annyeong", "ihwa", "seochodong", "teheran", "itaewon", "garosugil", "gangnamgu", "dongdaemun", "hajun", "babo", "galwolilgil", "ahahhahahahahahahhahdsdhahahahaa"]
# tes = one_hot_tensor_matrix(testing)

In [31]:
# tes.shape

In [32]:
one_hot_encode("naneun").shape

(1, 7, 27)

In [33]:
for i in testing:
    print(decode_sequence(one_hot_encode(i)))

난은

안녕

이화

서초동

테헤란

이태원

가로수길

강남구

동대문

하준

바보

갈월일길

아하하하하사하사하아리하다여



In [36]:
test = "niga geurungae a-ni-myeon mwoya"
test = test.split()
for token in test:
    print(decode_sequence(one_hot_encode(token)))

니가

글운개

안히면

ㅁㅇㅇ여



In [35]:
test = one_hot_encode("ihajun")

In [37]:
print(decode_sequence(test))

이하준

