In [1]:
# import statements
import os
import warnings

import numpy as np
import pandas as pd
from VanillaModels import *
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow_addons.optimizers import RectifiedAdam, Lookahead, NovoGrad, SGDW, AdamW

In [2]:
### BEST PARAMETERS ###
# recurrent stack: cell type, hidden width and depth of encoder / decoder
LAYER_TYPE = "lstm"
NUM_RECURRENT_UNITS = 256
NUM_ENCODER_RECURRENT_LAYERS = 2
NUM_DECODER_RECURRENT_LAYERS = 3

# embedding widths and dropout rate
ENC_EMBED_DIM = 64
DEC_EMBED_DIM = 256
DROPOUT = 0.2

# optimizer choice and training settings
OPTIMIZER = "adamw"
LR = 1e-2
WEIGHT_DECAY = 1e-3
BATCH_SIZE = 256

In [3]:
# paths of the train / dev / test lexicon files
# (the variables are named *_dir but each one points at a single .tsv file)
_LEXICON_PREFIX = "./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled."
train_dir = _LEXICON_PREFIX + "train.tsv"
dev_dir = _LEXICON_PREFIX + "dev.tsv"
test_dir = _LEXICON_PREFIX + "test.tsv"

In [4]:
# helper: turn one sequence of token indices into a word
def _decode_word(indices, tokens, stop_token='>'):
    """Decode a sequence of token indices into a string.

    Characters are looked up in `tokens` one by one; decoding stops at the
    first `stop_token`. The result is stripped of surrounding whitespace,
    which also removes the blank pad character.
    """
    chars = []
    for idx in indices:
        ch = tokens[int(idx)]
        # if we encounter the stop-token, stop forming the word
        if ch == stop_token:
            break
        chars.append(ch)
    return ''.join(chars).strip()


# define a function to compute word-level accuracy
# after stripping of all pad-tokens
def compute_word_accuracy(y_true, y_pred, tokens):
    """Compute exact-match word accuracy between targets and predictions.

    Parameters
    ----------
    y_true : sequence of sequences of token indices (the targets)
    y_pred : sequence of sequences of token indices (the predictions)
    tokens : sequence mapping a token index to its character; '>' is
        treated as the stop-token

    Returns
    -------
    (accuracy, df) where `accuracy` is the number of exactly matching words
    divided by len(y_true) (0.0 when y_true is empty, instead of raising
    ZeroDivisionError) and `df` is a DataFrame with the decoded words in
    the columns 'Target' and 'Prediction'.
    """
    targets, predictions = [], []
    for true_ids, pred_ids in zip(y_true, y_pred):
        predictions.append(_decode_word(pred_ids, tokens))
        targets.append(_decode_word(true_ids, tokens))

    # number of words predicted exactly right
    count = sum(int(t == p) for t, p in zip(targets, predictions))

    # create a dataframe from all the targets and predictions
    df = pd.DataFrame(list(zip(targets, predictions)),
                      columns=['Target', 'Prediction'])

    # to compute accuracy, divide by total number of items in the dataset;
    # an empty dataset has accuracy 0.0 by convention
    total = len(y_true)
    accuracy = count / total if total else 0.0
    return accuracy, df

In [5]:
# a function to read the data into a pd dataframe
def load_data(path):
    """Read a tab-separated lexicon file into a DataFrame with columns 'hi' and 'en'.

    The file is read without a header row; its first three columns are named
    'hi', 'en' and '_' and only the first two are returned.

    Every field is read as a plain string with pandas' default NA markers
    switched off. Otherwise genuine words that happen to spell an NA marker
    (e.g. "nan", "null", "NA") are parsed as missing values and the rows
    are lost. Rows whose 'hi' or 'en' field is empty or missing are dropped,
    and the result gets a fresh 0..n-1 index so that it lines up positionally
    with arrays derived from it.
    """
    data = pd.read_csv(path,
                       sep='\t',
                       encoding="utf8",
                       names=["hi", "en", "_"],
                       dtype=str,
                       keep_default_na=False,
                       skip_blank_lines=True)

    data = data[['hi', 'en']]
    for col in ('hi', 'en'):
        # keep only rows that really carry a word in this column
        data = data[data[col].notna() & (data[col] != '')]
    return data.reset_index(drop=True)

In [6]:
# a function to preprocess the data
def pre_process(data, max_eng_len, max_hin_len, eng_token_map, hin_token_map):
    """Encode the words of `data` as zero-padded arrays of token ids.

    Parameters
    ----------
    data : DataFrame with the columns 'en' (source words) and 'hi' (target words)
    max_eng_len, max_hin_len : padded sequence lengths for source / target
    eng_token_map, hin_token_map : dicts mapping a character to its integer id;
        id 0 is used as the padding value

    Returns
    -------
    (a, b, c) float arrays:
      a -- encoder input, shape (len(data), max_eng_len)
      b -- decoder input ('<' + word + '>'), shape (len(data), max_hin_len)
      c -- decoder target, i.e. b shifted left by one step (no start-token)

    Characters missing from a token map are encoded as the pad id 0 and words
    longer than the given maximum are truncated, instead of raising
    KeyError / IndexError. A single warning summarises how often this happened.
    """
    pad_id = 0
    x = data['en'].values
    # add start and end tokens to the hindi word
    y = ['<' + word + '>' for word in data['hi'].values]

    # a is the encoder input
    a = np.zeros((len(x), max_eng_len))
    # b is the decoder input (has start-token and end-token)
    b = np.zeros((len(y), max_hin_len))
    # c is the decoder output, which leads the decoder input by one step
    # as it does not have start token in the beginning
    c = np.zeros((len(y), max_hin_len))

    unknown_chars = 0
    truncated_words = 0

    # replace the characters by their integer ids so that the model can process them
    for i, (src, tgt) in enumerate(zip(x, y)):
        if len(src) > max_eng_len or len(tgt) > max_hin_len:
            truncated_words += 1
        unknown_chars += sum(ch not in eng_token_map for ch in src)
        unknown_chars += sum(ch not in hin_token_map for ch in tgt)

        src_ids = [eng_token_map.get(ch, pad_id) for ch in src[:max_eng_len]]
        tgt_ids = [hin_token_map.get(ch, pad_id) for ch in tgt]

        a[i, :len(src_ids)] = src_ids
        # decoder input starts at the start-token ...
        dec_in = tgt_ids[:max_hin_len]
        b[i, :len(dec_in)] = dec_in
        # ... and the decoder target is the same sequence shifted by one
        dec_out = tgt_ids[1:max_hin_len + 1]
        c[i, :len(dec_out)] = dec_out

    if unknown_chars or truncated_words:
        warnings.warn(
            f"pre_process: {unknown_chars} out-of-vocabulary character(s) mapped "
            f"to the pad id and {truncated_words} over-long word pair(s) truncated"
        )
    return a, b, c

In [7]:
# read the three splits: training, validation (dev) and test
train = load_data(train_dir)
dev = load_data(dev_dir)
test = load_data(test_dir)

# source words and target words of the training split;
# the hindi words are wrapped in the start ('<') and end ('>') tokens
x = train['en'].values
y = '<' + train['hi'].values + '>'

# the vocabulary is every distinct character of the training words, sorted
eng_tokens = sorted({ch for word in x for ch in word})
hin_tokens = sorted({ch for word in y for ch in word})

# inverted index: character -> position in the sorted vocabulary,
# counted from 1 because 0 is kept for the pad token
eng_token_map = {ch: idx for idx, ch in enumerate(eng_tokens, start=1)}
hin_token_map = {ch: idx for idx, ch in enumerate(hin_tokens, start=1)}

# the blank character acts as the pad token and takes index 0
eng_tokens.insert(0, ' ')
hin_tokens.insert(0, ' ')
eng_token_map[' '] = 0
hin_token_map[' '] = 0

# the longest training word of each language sets the padded length
max_eng_len = max(len(word) for word in x)
max_hin_len = max(len(word) for word in y)

# all splits are encoded with the same lengths and token maps
encode_args = (max_eng_len, max_hin_len, eng_token_map, hin_token_map)

# training encoder input, decoder input and decoder target
trainxe, trainxd, trainy = pre_process(train, *encode_args)

# validation encoder input, decoder input and decoder target
valxe, valxd, valy = pre_process(dev, *encode_args)

# test encoder input, decoder input and decoder target
# the decoder target is only used to check the metrics at the end
testxe, testxd, testy = pre_process(test, *encode_args)

In [8]:
# Since we have custom objects, we can't save the model so easily
# Therefore, we have to re-train the model with the best hyperparameters again
# create the encoder with the best hyperparameters
encoder = Encoder(input_dim=int(trainxe.max())+1,
                  embed_dim=ENC_EMBED_DIM,
                  cell_hidden_dim=NUM_RECURRENT_UNITS,
                  dropout=DROPOUT,
                  k=NUM_ENCODER_RECURRENT_LAYERS, 
                  cell_type=LAYER_TYPE)

# create the decoder with the best hyperparameters
decoder = Decoder(input_dim=int(trainxd.max())+1, 
                  output_dim=int(trainy.max())+1, 
                  embed_dim=DEC_EMBED_DIM,
                  cell_hidden_dim=NUM_RECURRENT_UNITS,
                  dropout=DROPOUT,
                  k=NUM_DECODER_RECURRENT_LAYERS,
                  cell_type=LAYER_TYPE)

# create the transliteration model with the created encoder and decoder
model = TransliterationModel(encoder=encoder, 
                             decoder=decoder, 
                             tgt_max_len=max_hin_len)

# map every supported optimizer name to a factory, so that only the
# optimizer selected by OPTIMIZER is actually instantiated
optimizer_factories = {
    "ranger": lambda: Lookahead(RectifiedAdam(learning_rate=LR, weight_decay=WEIGHT_DECAY, amsgrad=True)),
    "adamw": lambda: AdamW(learning_rate=LR, weight_decay=WEIGHT_DECAY, amsgrad=True),
    "sgdw": lambda: SGDW(learning_rate=LR, weight_decay=WEIGHT_DECAY, momentum=0.9, nesterov=True),
    "novograd": lambda: NovoGrad(learning_rate=LR, weight_decay=WEIGHT_DECAY, amsgrad=True)
}
# an unsupported OPTIMIZER name raises KeyError here
optimizer = optimizer_factories[OPTIMIZER]()

# define early stopping: end the run once the validation accuracy has not
# improved by at least min_delta for 4 consecutive epochs (patience), and
# restore the weights of the best epoch
early_stop = EarlyStopping(monitor="val_accuracy",
                           patience=4,
                           restore_best_weights=True,
                           min_delta=1e-3)

# compile the model and fit it to the data
model.compile(optimizer=optimizer, 
              loss="sparse_categorical_crossentropy", 
              metrics=["accuracy"])

model.fit([trainxe, trainxd], 
          trainy, 
          epochs=25, 
          callbacks=[early_stop],
          batch_size=BATCH_SIZE,
          validation_data=([valxe, valxd], valy), 
          shuffle=True)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25


<keras.callbacks.History at 0x24ce142fc40>

In [9]:
# get predictions for the test data
# use the encoder input for the encoder and start-tokens for the decoder
test_pred = model.predict([testxe, testxd[:, 0]], 
                          batch_size=BATCH_SIZE)

# obtain the test word-level accuracy and complete set of predictions
test_word_accuracy, df = compute_word_accuracy(testy.tolist(), 
                                               test_pred.tolist(), 
                                               hin_tokens)
# attach the source words by position, not by index label: `test` may carry
# a non-contiguous index after row filtering, and a label-aligned insert
# would then shift or blank out entries
df.insert(loc=0, column="data", value=test['en'].values)

# save the predictions as a csv file (creating the output folder if needed)
print(f"Test_word_accuracy: {test_word_accuracy:.4f}\n")
predictions_path = "./predictions_vanilla/predictions.csv"
os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
df.to_csv(predictions_path, encoding="utf-8")

# sample (up to) 10 random predictions and display them with color;
# the full prediction table stays available in `df`
sample_rows = df.sample(n=min(10, len(df))).values.tolist()

# print the predictions and target with colors
# if the prediction is incorrect, it is printed in red
# else if it is correct, it is printed in green
for source, target, prediction in sample_rows:
    color = '\033[91m' if target != prediction else '\033[92m'
    print(source, target, f"{color}{prediction}\033[00m")

Test_word_accuracy: 0.3880

mosad मोसाद [91mमोसद[00m
gandaki गंडकी [92mगंडकी[00m
aarushi आरुषि [91mआरुषी[00m
raubadar रौबदार [92mरौबदार[00m
bolane बोलने [92mबोलने[00m
chhah छह [91mछाह[00m
urvarata उर्वरता [92mउर्वरता[00m
rugna रुग्ण [91mरूगना[00m
piki पिकी [91mपीकी[00m
domenic डोमेनिक [92mडोमेनिक[00m
