In [1]:
import pandas as pd
import numpy as np
import warnings
import ast # To read-in a dict file
import re
import tensorflow as tf
from tensorflow.keras import layers, models
from hangul_utils import split_syllables, join_jamos
import pickle

2022-12-12 14:45:26.321902: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-12 14:45:26.364359: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [51]:
with open("/home/tyson/Private/codes/romanization/final_dict.pkl", "rb") as f:
    result_dict = pickle.load(f)

In [52]:
kor_list = list(result_dict.keys())
eng_list = list(result_dict.values())

In [53]:
for idx, item in enumerate(eng_list):
    if ' ' in item:
        print(idx, item)

1434 Daeguoegwaksunhwan Expressway


In [54]:
kor_list[1434]

'대구외곽순환고속도로'

In [55]:
kor_list.pop(1434)
eng_list.pop(1434)

'Daeguoegwaksunhwan Expressway'

In [56]:
for idx, item in enumerate(eng_list):
    if re.search(r'[0-9]', item):
        print(idx, item)

4717 3sandan
5358 Je3ttanggul-ro
5370 Gukgajeongwon1ho-gil


In [58]:
for i in [4717, 5358, 5370][::-1]: # Reverse the list as it modifies the indices
    del kor_list[i]
    del eng_list[i]

In [72]:
new_dict = {}
for i in range(len(kor_list)):
    new_dict[kor_list[i]] = eng_list[i].lower()

In [73]:
len(new_dict)

86191

In [74]:
kor_list = list(new_dict.keys())
eng_list = list(new_dict.values())

In [75]:
# Get the number of the largest string in the input
maxrow_en = 0
for inputstring in eng_list: # Korean is always less in size
    if len(inputstring) >= maxrow_en:
        maxrow_en = len(inputstring) # update
print(f"Maximum number of characters : {maxrow_en}")

Maximum number of characters : 36


In [76]:
# Get the number of the largest string in the input
maxrow_ko = 0
for inputstring in kor_list:
    inputstring = split_syllables(inputstring)
    if len(inputstring) >= maxrow_ko:
        maxrow_ko = len(inputstring) # update
print(f"Maximum number of characters : {maxrow_ko}")

Maximum number of characters : 28


In [77]:
# One hot encoding to transform the input characters to tensor:
def one_hot_encode(string, maxrow, lang = "en"):
    if lang == "en": # USE the range from A->Z 65 -> 90. UPDATE: MAKE THEM ALL UPPERCASE and then use fewer columns. 26 + 1 (hyphen) columns in total.
        arr = np.zeros(shape = (maxrow, 27))
        string = string[::-1] # Reversing is better to establish 'communication' between encoder and decoder model // https://arxiv.org/pdf/1409.3215.pdf
        for idx, char in enumerate(string): # for each character in the string 
            if not(65 <= ord(char) <= 90) and not (97 <= ord(char) <= 122) and not(ord(char) == 45): # if it's not alphabetical or hyphen
                warnings.warn("Preprocess your input. The string contains non-alphabetical character.")
                print(f"The character is {char}")
                raise Exception
            else: # If the letter is english
                try:
                    char = char.upper() # Make it upper case
                    arr[idx][ord(char)-65] = 1
                except:
                    arr[idx][26] = 1 # If it's a hyphen then let column 26 be 1
                
        return arr
    elif lang == "ko": # ord of korean starts from 12593 -> 12643 = 12643 - 12593 + 1 = 51 + 2(tab(start), newline(end)) https://github.com/rstudio/keras/blob/main/vignettes/examples/lstm_seq2seq.py
        arr = np.zeros(shape = (maxrow, 53))
        # Since Korean is the target seq, the one we are trying to predict, use '\t' as start token and '\n' as end of sequence token
        jamos = split_syllables(string) # Get jamos first
        jamos = '\t' + jamos + '\n'
        for idx, char in enumerate(jamos):
            if ord(char) == 9: # if tab(start of sequence)
                arr[idx][51] = 1
            elif ord(char) == 10: # if newline(end of sequence)
                arr[idx][52] = 1    
            elif 12593 <= ord(char) <= 12643:
                arr[idx][ord(char)-12593] = 1
            else:
                warnings.warn("Preprocess your input. The string contains non-korean character.")
                print(f"The character is {char}")
                raise Exception
                
        return arr

In [78]:
# Fixed length:
def one_hot_tensor_matrix(inputdata, lang = 'en'):
    # Languages:
    if lang == "en":
        ncol = 27 # Depends on what function you use. 
        maxrow = maxrow_en
    elif lang == "ko":
        ncol = 53
        maxrow = maxrow_ko + 2 # Since we added '\t' and '\n'
    else:
        warnings.warn("Check your language please. There are only Korean and English.")
        raise Exception
    initial_mat = np.zeros(shape=(len(inputdata), maxrow, ncol)) # The shape is of (batch size, nrow, ncol)
    for idx, string in enumerate(inputdata):
        temparr = one_hot_encode(string, maxrow, lang)
        initial_mat[idx] = temparr # Update initial matrix
    
    # Change the shape of the matrix:
    final_mat = np.reshape(initial_mat, newshape = (len(inputdata), maxrow, ncol))
    return tf.convert_to_tensor(final_mat)

In [79]:
def one_hot_decode(mat, lang = 'en'):
    char_list = [] # To store the characters, later to be used in joining
    if lang == 'en': # if the language is english
        for row in mat:
            if np.max(row) == 0: # If the row is an empty one
                continue
            else: # If our row is not an empty row
                code = np.argmax(row) # Since 1 is the largest number in our one hot encoded, we use np.argmax()
                if code == 26: # If it's a hyphen
                    char_list.append(chr(45))
                else:
                    char_list.append(chr(code+65))
        string = "".join(char_list)
    elif lang == 'ko':
        for row in mat:
            if np.max(row) == 0:
                continue
            else:
                code = np.argmax(row)
                if code == 52:
                    char_list.append(chr(9))
                elif code == 53:
                    char_list.append(chr(10))
                else:
                    char_list.append(chr(code+12593))
        string = join_jamos("".join(char_list))
    return string

In [80]:
# Get the maximum length of the input words in each kor_list and eng_list. This will be used when we initialize the model. Use the above functions accordingly. For fixed: one_hot_tensor_matrix
decoder_input = one_hot_tensor_matrix(kor_list, lang = 'ko')

In [81]:
# Remove the first line since decoder target preceeds the decoder input by one timestep so, a 'start' signal would give an output of a word to be compared with decoder target
decoder_target = decoder_input[:, 1:, :]

In [82]:
# Make a new line and add it to the second dimension of the decoder target tensor so that we have equal shape with the predicted val and the actual val
# Fixed length
new_line = tf.zeros((decoder_target.shape[0], 1, decoder_target.shape[2]), tf.float64)
new_line

<tf.Tensor: shape=(86191, 1, 53), dtype=float64, numpy=
array([[[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]]])>

In [83]:
# Fixed Length
decoder_target = tf.concat([decoder_target, new_line], 1)

In [84]:
assert(decoder_target.shape == decoder_input.shape)

In [85]:
encoder_input = one_hot_tensor_matrix(eng_list, lang = 'en')

In [86]:
len(encoder_input) == len(decoder_target) == len(decoder_input)

True

In [87]:
encoder_input = tf.cast(encoder_input, tf.float32)
decoder_input = tf.cast(decoder_input, tf.float32)
decoder_target = tf.cast(decoder_target, tf.float32)

In [88]:
with open("LSTM_encoder_in.pkl", "wb") as f:
    pickle.dump(encoder_input, f)
with open("LSTM_decoder_in.pkl", "wb") as f:
    pickle.dump(decoder_input, f)
with open("LSTM_decoder_targ.pkl", "wb") as f:
    pickle.dump(decoder_target, f)