In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from matplotlib import pyplot as plt

import os
import time
from tensorflow import keras
from keras import backend as K
from keras.models import Model
from keras.layers import Input, Dense, Embedding, GRU, Bidirectional, Concatenate, Dropout, Layer, Add, LayerNormalization
from keras.utils import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk import edit_distance

from funkcije import *

2023-09-14 11:35:13.370874: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-14 11:35:13.401415: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/ten

In [2]:
# Enable on-demand GPU memory allocation (may help with GPU out-of-memory problems).
# The loop makes this a no-op on CPU-only machines (empty device list) and applies
# the same setting to every GPU, since TensorFlow requires memory growth to be
# consistent across all visible GPUs.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# NOTE: the checkpoint callback is no longer used; it is kept only so the name stays defined.
checkpoint = ModelCheckpoint('model_weights_{epoch}.h5', save_best_only=False, save_weights_only=True, monitor='val_loss', mode='min')
# Stops training after 10 epochs without val_loss improvement and restores the best weights.
early_stopping = EarlyStopping(patience = 10, restore_best_weights = True, monitor = 'val_loss', mode = 'min', verbose = 1)

embedding_size = 300

2023-09-14 11:35:14.960952: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-14 11:35:14.976847: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-14 11:35:14.976995: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [3]:
# Embedding size, input_pad_len and target_pad_len are fixed (hard-coded in __init__).
# Dropout is applied in several places, always with the same dropout_rate.
# At the very start of the model a CustomDropout layer is applied, so that the model learns to handle the <Unknown> token better.
# The encoder has 3 GRU layers, the last one bidirectional; there are residual connections between each two of them.
# Because the last ENCODER layer is a bidirectional GRU, the latent dimension of the decoder is twice as large.
# The decoder has 2 GRU layers with a residual connection between them.
# The translate method decodes sequentially, word by word, so it calls the decoder as many times as the maximum target sentence length.
class GRU_Translation_Model(Model):
    """Encoder-decoder translation model built from GRU layers.

    Holds two functional sub-models:
      * ``self.encoder`` maps a padded input token sequence to a single state
        vector of size ``2 * latent_dim`` (forward and backward final states of
        the bidirectional GRU, concatenated).
      * ``self.decoder`` maps ``[decoder tokens, starting state]`` to
        ``[softmax over num_target_words + 1 classes, state of the last GRU]``.

    NOTE(review): other code in this notebook may select layers of these
    sub-models by position; confirm before reordering layer construction.
    """
    def __init__(self, num_input_words, num_target_words, input_embedding_matrix, target_embedding_matrix, latent_dim = 256, dropout_rate = 0.5, custom_dropout_rate = 0.05):
        """Build the encoder and decoder sub-models.

        Args:
            num_input_words: size of the input vocabulary; the embedding table
                has ``num_input_words + 1`` rows.
            num_target_words: size of the target vocabulary; the embedding table
                and the output softmax have ``num_target_words + 1`` entries.
            input_embedding_matrix: initial weights of the encoder embedding layer.
            target_embedding_matrix: initial weights of the decoder embedding layer.
            latent_dim: units of each encoder GRU; decoder GRUs use ``2 * latent_dim``.
            dropout_rate: dropout used by every GRU and by the final Dropout layer.
            custom_dropout_rate: rate passed to CustomDropout on the raw encoder input.
        """
        super(GRU_Translation_Model, self).__init__()

        self.latent_dim = latent_dim
        self.dropout_rate = dropout_rate
        self.custom_dropout_rate = custom_dropout_rate
        self.num_input_words = num_input_words
        self.num_target_words = num_target_words
        self.input_embedding_matrix = input_embedding_matrix
        self.target_embedding_matrix = target_embedding_matrix
        # Fixed sizes; the embedding matrices and padded data must match them.
        self.embedding_size = 300
        self.input_pad_len = 80
        self.target_pad_len = 60

        # ---- Encoder ----
        encoder_input_tensor = Input(shape = (self.input_pad_len, ))
        # CustomDropout comes from the wildcard import of `funkcije`.
        # NOTE(review): presumably replaces input tokens with the value 1.0 (the <Unknown> id) at the given rate — confirm.
        modified_input = CustomDropout(1.0, custom_dropout_rate)(encoder_input_tensor)

        encoder_embedding_layer = Embedding(input_dim = num_input_words + 1, output_dim = self.embedding_size, mask_zero = True, weights = [input_embedding_matrix], trainable = True)
        encoder_embedding = encoder_embedding_layer(modified_input)
        # First GRU block; the Dense layer projects back to embedding_size so the residual Add is shape-compatible.
        outputs = GRU(units = latent_dim, return_sequences = True, dropout = dropout_rate)(encoder_embedding)
        outputs = Dense(units = self.embedding_size, activation = 'relu')(outputs)
        outputs = LayerNormalization()(outputs)
        #outputs = Dropout(dropout_rate)(outputs)
        next_inputs = Add()([encoder_embedding, outputs])
        # Second GRU block with its own residual connection.
        outputs = GRU(units = latent_dim, return_sequences = True, dropout = dropout_rate)(next_inputs)
        outputs = Dense(units = self.embedding_size, activation = 'relu')(outputs)
        outputs = LayerNormalization()(outputs)
        main_inputs = Add()([next_inputs, outputs])
        main_inputs = LayerNormalization()(main_inputs)
        # Third (bidirectional) GRU: only the final forward/backward states are kept.
        _, forward_state, backward_state = Bidirectional(GRU(units = latent_dim, return_state = True, dropout = dropout_rate))(main_inputs)
        state_h = Concatenate(axis=-1)([forward_state, backward_state])

        self.encoder = Model(encoder_input_tensor, state_h)

        # ---- Decoder ----
        # Variable-length token input, so the same sub-model serves teacher-forced training and step-by-step decoding.
        decoder_input_tensor = Input(shape = (None, ))
        decoder_starting_state = Input(shape = (latent_dim*2,))
        decoder_embedding_layer = Embedding(input_dim = num_target_words + 1, output_dim = self.embedding_size, mask_zero = True, weights = [target_embedding_matrix], trainable = True)
        decoder_embedding = decoder_embedding_layer(decoder_input_tensor)
        # Both decoder GRUs are initialised from the same starting state.
        decoder_outputs = GRU(units = latent_dim*2, return_sequences = True, return_state = False, dropout = dropout_rate)(decoder_embedding, initial_state = decoder_starting_state)
        decoder_outputs = Dense(units = self.embedding_size, activation = 'relu')(decoder_outputs)
        main_inputs = Add()([decoder_embedding, decoder_outputs])
        main_inputs = LayerNormalization()(main_inputs)
        # Only the state of this second GRU is exposed as a decoder output.
        decoder_outputs, decoder_state = GRU(units = latent_dim*2, return_sequences = True, return_state = True, dropout = dropout_rate)(main_inputs, initial_state = decoder_starting_state) 
        output = Dropout(dropout_rate)(decoder_outputs)
        output = Dense(units = num_target_words + 1, activation = 'softmax')(output)

        self.decoder = Model([decoder_input_tensor, decoder_starting_state], [output, decoder_state])

    def call(self, x):
        """Teacher-forced forward pass.

        Args:
            x: pair ``(encoder tokens, decoder input tokens)``.

        Returns:
            The decoder's softmax output; the decoder state is discarded.
        """
        encoder_output = self.encoder(x[0])
        decoder_output,_ = self.decoder([x[1],encoder_output])
        return decoder_output

    #Encodes encoder_input and then decodes sequentially, with decoder_input as the starting input
    def translate(self, encoder_input, decoder_input):
        """Greedy step-by-step decoding.

        Args:
            encoder_input: padded input token batch; ``shape[0]`` is used as the batch size.
            decoder_input: first decoder token for every sample
                (presumably shape ``(data_size, 1)`` — TODO confirm against callers).

        Returns:
            NumPy array of shape ``(data_size, target_pad_len - 1)`` holding the
            argmax token index chosen at every step.
        """
        decoder_state = self.encoder(encoder_input)
        data_size = encoder_input.shape[0]
        decoder_output = np.zeros((data_size, self.target_pad_len - 1))
        # One decoder call per output position; decoding never stops early.
        for i in range(self.target_pad_len - 1):
            # NOTE(review): the returned state is that of the second decoder GRU only, yet it
            # re-initialises both decoder GRUs on the next step.
            decoder_output_temp, decoder_state = self.decoder.predict([decoder_input, decoder_state], verbose = 0, batch_size = 128)
            next_words = np.argmax(decoder_output_temp, axis = -1)
            decoder_input = next_words
            decoder_output[:, i] = next_words.reshape((data_size,))
        return decoder_output
  

In [4]:
def train_and_evaluate(train_data, val_data, epochs = 200, batch_size = 128, learning_rate = 0.001, latent_dim = 256, dropout_rate = 0.5, embedding_learning_rate = 0.001):
    """Train a GRU_Translation_Model on train_data and evaluate it on val_data.

    The embedding layers get their own learning rate (embedding_learning_rate)
    because they start from pretrained values. Training runs until the
    module-level `early_stopping` callback fires (val_loss stops improving),
    which also restores the weights of the best epoch.

    This lives in its own function (it is used by cv_evaluate) because running
    cross-validation inline led to GPU memory exhaustion.

    Args:
        train_data: DataFrame; column 1 holds the input texts and column 0 the
            target texts (accessed positionally via iloc).
        val_data: DataFrame with the same column layout.
        epochs: maximum number of training epochs.
        batch_size: training batch size.
        learning_rate: Adam learning rate for all non-embedding layers.
        latent_dim: latent dimension passed to the model.
        dropout_rate: dropout rate passed to the model.
        embedding_learning_rate: Adam learning rate for the embedding layers.

    Returns:
        Tuple (best_epoch, best_loss, wer, smooth_bleu4, smooth_bleu3,
        smooth_bleu2, smooth_bleu1), where best_epoch is the 1-based epoch with
        the lowest val_loss and the remaining metrics come from `evaluate`
        (imported from `funkcije`) on the validation texts.
    """
    # Every call builds a fresh model; drop the Keras global state left by the
    # previous one so memory does not keep growing across folds/configurations.
    K.clear_session()

    input_pad_len = 80
    target_pad_len = 60

    # Vocabulary and embeddings are derived from the training split only.
    input_texts, target_texts = clean_texts(train_data.iloc[:,1], train_data.iloc[:,0])
    input_word_index, target_word_index, _, _ = analyse_texts(input_texts, target_texts)
    num_input_words = len(input_word_index) - 1
    num_target_words = len(target_word_index) - 1
    inverted_input_word_index = {value: key for key, value in input_word_index.items()}
    inverted_target_word_index = {value: key for key, value in target_word_index.items()}
    input_embedding_matrix, target_embedding_matrix = load_embedding_data_get_matrices(inverted_input_word_index, inverted_target_word_index)
    print('Embeddings loaded.')
    encoder_input_data, decoder_input_data, decoder_output_data = create_model_data(input_texts, target_texts, input_word_index, target_word_index, input_pad_len, target_pad_len)

    # Validation data is encoded with the training vocabulary.
    input_texts_val, target_texts_val = clean_texts(val_data.iloc[:,1], val_data.iloc[:,0])
    encoder_input_data_val, decoder_input_data_val, decoder_output_data_val = create_model_data(input_texts_val, target_texts_val, input_word_index, target_word_index, input_pad_len, target_pad_len)

    model_gru = GRU_Translation_Model(num_input_words, num_target_words, input_embedding_matrix, target_embedding_matrix, latent_dim = latent_dim, dropout_rate = dropout_rate)

    # Split the layers into embedding / non-embedding groups by type rather than
    # by position, so the grouping survives changes to the model architecture.
    all_layers = model_gru.encoder.layers + model_gru.decoder.layers
    embedding_layers = [layer for layer in all_layers if isinstance(layer, Embedding)]
    other_layers = [layer for layer in all_layers if not isinstance(layer, Embedding)]

    optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers = [(Adam(learning_rate), other_layers), (Adam(embedding_learning_rate), embedding_layers)])
    model_gru.compile(optimizer, loss = 'sparse_categorical_crossentropy', metrics = ['acc'])
    history = model_gru.fit([encoder_input_data, decoder_input_data], decoder_output_data, validation_data = ([encoder_input_data_val, decoder_input_data_val], decoder_output_data_val), epochs = epochs, batch_size = batch_size, callbacks = [early_stopping], verbose = 1)

    val_losses = history.history['val_loss']
    best_epoch = np.argmin(val_losses) + 1  # epochs are reported 1-based
    best_loss = np.min(val_losses)

    wer, smooth_bleu4, smooth_bleu3, smooth_bleu2, smooth_bleu1 = evaluate(model_gru, input_texts_val, target_texts_val, input_word_index, target_word_index, inverted_target_word_index, input_pad_len, target_pad_len)
    return best_epoch, best_loss, wer, smooth_bleu4, smooth_bleu3, smooth_bleu2, smooth_bleu1
 

In [5]:
def cv_evaluate(train_val_data = None, df_folds = None, folds = 5, epochs = 200, batch_size = 128, learning_rate = 0.001, latent_dim = 256, dropout_rate = 0.5, embedding_learning_rate = None):
    """Cross-validate one hyperparameter configuration.

    Trains one model per fold via train_and_evaluate and collects, for every
    fold, the WER, smooth BLEU-1..4, the best val_loss and the number of epochs
    to convergence. The per-fold values are returned unaggregated so the caller
    can process them further.

    Args:
        train_val_data: DataFrame to split into folds; only used when df_folds
            is None.
        df_folds: optional list of pre-built folds (2-D arrays); lets several
            configurations be compared on identical splits.
        folds: number of folds.
        epochs, batch_size, learning_rate, latent_dim, dropout_rate: forwarded
            to train_and_evaluate.
        embedding_learning_rate: learning rate of the embedding layers;
            defaults to learning_rate when None.

    Returns:
        Tuple of seven lists with one entry per fold:
        (best_epochs, losses, wers, smooth_bleu4s, smooth_bleu3s,
        smooth_bleu2s, smooth_bleu1s).

    Raises:
        ValueError: if neither train_val_data nor df_folds is given.
    """
    if embedding_learning_rate is None:
        embedding_learning_rate = learning_rate
    if df_folds is None:
        if train_val_data is None:
            raise ValueError('Either train_val_data or df_folds must be provided.')
        # Shuffle a copy: to_numpy() may return a view of the frame's data, in
        # which case an in-place shuffle would reorder (or fail on) the caller's frame.
        df_np = train_val_data.to_numpy(copy = True)
        np.random.shuffle(df_np)
        total_size = df_np.shape[0]
        fold_size = total_size/folds
        df_folds = [df_np[int(k*fold_size):int((k+1)*fold_size),] for k in range(folds)]
    # Embeddings are loaded inside train_and_evaluate; loading them here caused memory exhaustion.
    losses = []
    best_epochs = []
    wers = []
    smooth_bleu1s = []
    smooth_bleu2s = []
    smooth_bleu3s = []
    smooth_bleu4s = []
    for i in range(folds):
        # Fold i is the validation split; all remaining folds form the training split.
        train_folds_pd = [pd.DataFrame(data = fold) for j, fold in enumerate(df_folds) if j != i]
        train_data = pd.concat(train_folds_pd)
        val_data = pd.DataFrame(df_folds[i])
        print('Current Latent Dim:', latent_dim)
        print('Current Dropout Rate: ', dropout_rate)
        print('Current Fold: {}/{}'.format(i+1, folds))
        print('Current Learning Rate: ', learning_rate)
        print('Current Learning Rate Multiplier: ', embedding_learning_rate/learning_rate)

        best_epoch, best_loss, wer, smooth_bleu4, smooth_bleu3, smooth_bleu2, smooth_bleu1 = train_and_evaluate(train_data, val_data, epochs = epochs, batch_size = batch_size, learning_rate = learning_rate, latent_dim = latent_dim, dropout_rate = dropout_rate, embedding_learning_rate = embedding_learning_rate)
        best_epochs.append(best_epoch)
        losses.append(best_loss)
        wers.append(wer)
        smooth_bleu4s.append(smooth_bleu4)
        smooth_bleu3s.append(smooth_bleu3)
        smooth_bleu2s.append(smooth_bleu2)
        smooth_bleu1s.append(smooth_bleu1)
    return best_epochs, losses, wers, smooth_bleu4s, smooth_bleu3s, smooth_bleu2s, smooth_bleu1s


In [6]:
def cv_grid_search(df, dropout_rates, latent_dims, epochs = 200, learning_rate = 0.0002, folds = 5):
    """Cross-validated grid search over latent_dim and dropout_rate.

    The data is shuffled and split into folds once, and the same folds are used
    for every configuration. Each metric is stored in a 3-D array indexed as
    [latent_dim index, dropout_rate index, fold]; averaging over the last axis
    yields the cross-validated value. The number of epochs to convergence is
    stored the same way.

    Args:
        df: DataFrame with the full training + validation data.
        dropout_rates: sequence of dropout rates to try.
        latent_dims: sequence of latent dimensions to try.
        epochs: maximum number of epochs per model.
        learning_rate: learning rate forwarded to cv_evaluate.
        folds: number of cross-validation folds.

    Returns:
        Dict with keys 'loss', 'epoch', 'wer', 'smooth_bleu4', 'smooth_bleu3',
        'smooth_bleu2', 'smooth_bleu1', each mapping to an array of shape
        (len(latent_dims), len(dropout_rates), folds).
    """
    # Shuffle a copy: to_numpy() may return a view of the frame's data, in which
    # case an in-place shuffle would reorder (or fail on) the caller's frame.
    df_np = df.to_numpy(copy = True)
    np.random.shuffle(df_np)
    total_size = df_np.shape[0]
    fold_size = total_size/folds
    df_folds = [df_np[int(k*fold_size):int((k+1)*fold_size),] for k in range(folds)]

    result_shape = (len(latent_dims), len(dropout_rates), folds)
    loss_matrix = np.zeros(result_shape)
    epoch_matrix = np.zeros(result_shape)
    wer_matrix = np.zeros(result_shape)
    smooth_bleu4_matrix = np.zeros(result_shape)
    smooth_bleu3_matrix = np.zeros(result_shape)
    smooth_bleu2_matrix = np.zeros(result_shape)
    smooth_bleu1_matrix = np.zeros(result_shape)
    for i, latent_dim in enumerate(latent_dims):
        for j, dropout_rate in enumerate(dropout_rates):
            best_epochs, losses, wers, smooth_bleu4s, smooth_bleu3s, smooth_bleu2s, smooth_bleu1s = cv_evaluate(df_folds = df_folds, folds = folds, epochs = epochs, learning_rate = learning_rate, latent_dim = latent_dim, dropout_rate = dropout_rate)
            print(losses)
            print(best_epochs)
            loss_matrix[i,j,:] = losses
            epoch_matrix[i,j,:] = best_epochs
            wer_matrix[i,j,:] = wers
            smooth_bleu4_matrix[i,j,:] = smooth_bleu4s
            smooth_bleu3_matrix[i,j,:] = smooth_bleu3s
            smooth_bleu2_matrix[i,j,:] = smooth_bleu2s
            smooth_bleu1_matrix[i,j,:] = smooth_bleu1s
    # Results are packed into a dictionary so callers can pick a metric by name.
    metrics_dict = {'loss': loss_matrix, 'epoch': epoch_matrix, 'wer': wer_matrix, 'smooth_bleu4': smooth_bleu4_matrix, 'smooth_bleu3': smooth_bleu3_matrix, 'smooth_bleu2': smooth_bleu2_matrix, 'smooth_bleu1': smooth_bleu1_matrix }
    return metrics_dict

In [7]:
# Metadata columns that are not needed for translation; only the two text columns remain.
# 'orth' is the glossed sentence and 'translation' is the original spoken-language sentence
# (the original note calls it English — TODO confirm; PHOENIX-2014-T is a German corpus).
_unused_columns = ['name', 'video', 'start', 'end', 'speaker']

df_train = pd.read_csv('data/PHOENIX-2014-T.train.corpus.csv', sep = '|').drop(columns = _unused_columns)
train_size = len(df_train)

df_val = pd.read_csv('data/PHOENIX-2014-T.dev.corpus.csv', sep = '|').drop(columns = _unused_columns)
val_size = len(df_val)

df_test = pd.read_csv('data/PHOENIX-2014-T.test.corpus.csv', sep = '|').drop(columns = _unused_columns)
test_size = len(df_test)

# Train + dev is the pool used for cross-validation; df_full additionally contains the test split.
df_train_val = pd.concat([df_train, df_val])
df_full = pd.concat([df_train_val, df_test])

In [8]:
# Cross-validation setup shared by the searches below.
folds = 5
learning_rate = 2e-4

# Hyperparameters to optimise: latent dimension and dropout rate.
# A larger latent dimension and dropout rate should be tried too, since the optimum comes out at the largest value.
latent_dims = [256, 512, 1024]
dropout_rates = [0.5, 0.6, 0.7, 0.8, 0.9]

In [9]:
# Run the grid search over the hyperparameters, using cross-validation to estimate performance.
# The metric used for model selection is smooth BLEU4.
metrics = cv_grid_search(df_train_val, dropout_rates, latent_dims, epochs = 200, learning_rate = learning_rate, folds = folds)
average_bleu4 = metrics['smooth_bleu4'].mean(axis = -1)

# Heatmap of the fold-averaged score: rows are latent dimensions, columns are dropout rates.
fig, ax = plt.subplots()
heatmap = ax.imshow(average_bleu4)
ax.set_title('Average smooth BLEU4, crossvalidated')
ax.set_xlabel('Dropout Rate')
ax.set_xticks(range(len(dropout_rates)))
ax.set_xticklabels(dropout_rates)
ax.set_ylabel('Latent Dim')
ax.set_yticks(range(len(latent_dims)))
ax.set_yticklabels(latent_dims)
fig.colorbar(heatmap, ax = ax)
plt.show()



Current Latent Dim: 256
Current Dropout Rate:  0.5
Current Fold: 1/5
Current Learning Rate:  0.0002
Current Learning Rate Multiplier:  1.0
Embeddings loaded.


2023-09-14 11:35:53.214843: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-14 11:35:53.215025: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-14 11:35:53.215105: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

Epoch 1/200


2023-09-14 11:36:07.432629: W tensorflow/core/common_runtime/type_inference.cc:339] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_INT32
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_INT8
    }
  }
}

	for Tuple type infernce function 0
	while inferring type of node 'cond_41/output/_22'
2023-09-14 11:36:07.608365: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8902
2023-09-14 11:36:07.658976: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-09-14 11:36:08.46826

Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200


Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 75: early stopping
Current Latent Dim: 256
Current Dropout Rate:  0.5
Current Fold: 2/5
Current Learning Rate:  0.0002
Current Learning Rate Multiplier:  1.0
Embeddings loaded.
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200


Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 76: early stopping
Current Latent Dim: 256
Current Dropout Rate:  0.5
Current Fold: 3/5
Current Learning Rate:  0.0002
Current Learning Rate Multiplier:  1.0
Embeddings loaded.
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200


Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 76: early stopping
Current Latent Dim: 256
Current Dropout Rate:  0.5
Current Fold: 4/5
Current Learning Rate:  0.0002
Current Learning Rate Multiplier:  1.0
Embeddings loaded.
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200


Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200


Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 80: early stopping
Current Latent Dim: 256
Current Dropout Rate:  0.5
Current Fold: 5/5
Current Learning Rate:  0.0002
Current Learning Rate Multiplier:  1.0
Embeddings loaded.
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200


Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 84: early stopping
[2.469913959503174, 2.4958345890045166, 2.469052791595459, 2.5213451385498047, 2.4956765174865723]
[65, 66, 66, 70, 74]
Current Latent Dim: 256
Current Dropout Rate:  0.6
Current Fold: 1/5
Current Learning Rate:  0.0002
Current Learning Rate Multiplier:  1.0
Embeddings loaded.
Epoch 1/200
Epoch 2/200

KeyboardInterrupt: 

In [None]:
# Locate the (latent_dim, dropout_rate) cell with the highest fold-averaged smooth BLEU4.
best_flat_index = average_bleu4.argmax()
best_config_index = np.unravel_index(best_flat_index, average_bleu4.shape)
best_latent_index, best_dropout_index = best_config_index

best_latent_dim = latent_dims[best_latent_index]
best_dropout_rate = dropout_rates[best_dropout_index]
print("Best latent dimension: ", best_latent_dim)
print("Best dropout rate: ", best_dropout_rate)

In [None]:
# Larger values should be tried as well, since 3 comes out as optimal (this is unexpected).
learning_rate_multipliers = [0.03, 0.1, 0.3, 1, 3, 10, 30]
#learning_rate_multipliers = [100]

# Build the folds once and reuse them for every multiplier, so all candidates are
# compared on identical splits (the same protocol as cv_grid_search). A copy is
# shuffled so that df_train_val itself is left untouched.
df_np = df_train_val.to_numpy(copy = True)
np.random.shuffle(df_np)
total_size = df_np.shape[0]
fold_size = total_size/folds
df_folds = [df_np[int(k*fold_size):int((k+1)*fold_size),] for k in range(folds)]

best_epoch_array = []
bleu4_array = []
for multiplier in learning_rate_multipliers:
    best_epochs, _, _, smooth_bleu4s, _, _, _ = cv_evaluate(df_folds = df_folds, folds = folds, epochs = 200, learning_rate = learning_rate, embedding_learning_rate = multiplier*learning_rate, latent_dim = best_latent_dim, dropout_rate = best_dropout_rate)
    best_epoch_array.append(best_epochs)
    bleu4_array.append(smooth_bleu4s)

# Rows: multipliers, columns: folds.
bleu4_array = np.array(bleu4_array)
best_epoch_array = np.array(best_epoch_array)
best_multiplier_index = np.argmax(np.mean(bleu4_array, -1))
bleu4_avg = np.mean(bleu4_array[best_multiplier_index])
print("Average BLEU4(smooth) on validation: ", bleu4_avg)
best_multiplier = learning_rate_multipliers[best_multiplier_index]
print("Best Multiplier for Embedding Learning Rates:", best_multiplier)
best_multiplier_epochs = best_epoch_array[best_multiplier_index]
# Shows how many epochs each fold needed to converge.
print('Epochs to convergence on all folds: ', best_multiplier_epochs)
# The mean is used as the number of training epochs for the model trained on train + validation data.
epoch_avg = np.mean(best_multiplier_epochs)

In [None]:
# One-row heatmap: fold-averaged smooth BLEU4 for each embedding learning-rate multiplier.
mean_bleu4_per_multiplier = bleu4_array.mean(axis = 1)[np.newaxis, :]

fig, ax = plt.subplots()
heatmap = ax.imshow(mean_bleu4_per_multiplier)
ax.set_title('Average smooth BLEU4 for different embedding learning rates')
ax.set_xlabel('Learning Rate Multiplier')
ax.set_yticks([])
ax.set_xticks(range(len(learning_rate_multipliers)))
ax.set_xticklabels(learning_rate_multipliers)
fig.colorbar(heatmap, ax = ax)
plt.show()

In [None]:
# Shuffle a copy so df_train_val itself is not reordered: to_numpy() may return
# a view of the frame's data.
df_train_val_np = df_train_val.to_numpy(copy = True)
np.random.shuffle(df_train_val_np)
split_size = 300 # Number of samples held out for validation; possibly more are needed.
# Split point derived from the array itself (equals train_size + val_size - split_size).
split_at = df_train_val_np.shape[0] - split_size
train_data = df_train_val_np[:split_at,]
val_data = df_train_val_np[split_at:,]

# Vocabulary and embeddings are built from the training part only.
input_texts, target_texts = clean_texts(train_data[:,1], train_data[:,0])
input_word_index, target_word_index, max_input_seq_len, max_target_seq_len = analyse_texts(input_texts, target_texts)
input_pad_len = 80
target_pad_len = 60
num_input_words = len(input_word_index) - 1
num_target_words = len(target_word_index) - 1
inverted_input_word_index = {value: key for key,value in input_word_index.items()}
inverted_target_word_index = {value: key for (key,value) in target_word_index.items()}
input_embedding_matrix, target_embedding_matrix = load_embedding_data_get_matrices(inverted_input_word_index, inverted_target_word_index)
encoder_input_data, decoder_input_data, decoder_output_data = create_model_data(input_texts, target_texts, input_word_index, target_word_index, input_pad_len, target_pad_len)

# Validation and test data are encoded with the training vocabulary.
input_texts_val, target_texts_val = clean_texts(val_data[:,1], val_data[:,0])
encoder_input_data_val, decoder_input_data_val, decoder_output_data_val = create_model_data(input_texts_val, target_texts_val, input_word_index, target_word_index, input_pad_len, target_pad_len)

input_texts_test, target_texts_test = clean_texts_df(df_test)
encoder_input_data_test, decoder_input_data_test, decoder_output_data_test = create_model_data(input_texts_test, target_texts_test, input_word_index, target_word_index, input_pad_len, target_pad_len)

In [None]:
# Build the final model with the hyperparameters selected by the grid search
# (previously the constructor defaults were used, ignoring the search result).
model_for_evaluation = GRU_Translation_Model(num_input_words, num_target_words, input_embedding_matrix, target_embedding_matrix, latent_dim = best_latent_dim, dropout_rate = best_dropout_rate)

# Split the layers into embedding / non-embedding groups by type rather than by
# position, so the grouping survives changes to the model architecture.
all_layers = model_for_evaluation.encoder.layers + model_for_evaluation.decoder.layers
embedding_layers = [layer for layer in all_layers if isinstance(layer, Embedding)]
other_layers = [layer for layer in all_layers if not isinstance(layer, Embedding)]
optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers = [(Adam(learning_rate), other_layers), (Adam(best_multiplier*learning_rate), embedding_layers)])
model_for_evaluation.compile(optimizer, loss = 'sparse_categorical_crossentropy', metrics = ['acc'])

# start_from_epoch = int(epoch_avg*0.7) used to be passed here, but the argument was not recognised.
early_stopping_safe = EarlyStopping(patience = 20, restore_best_weights = True, monitor = 'val_loss', mode = 'min', verbose = 1)
history = model_for_evaluation.fit([encoder_input_data, decoder_input_data], decoder_output_data, validation_data = ([encoder_input_data_val, decoder_input_data_val], decoder_output_data_val), epochs = 200, batch_size = 128, verbose = 1, callbacks = [early_stopping_safe])
model_for_evaluation.summary()

# Learning curves: loss on top, accuracy below; validation curves are dashed.
epoch_counter = range(len(history.history['loss']))
fig, (ax1, ax2) = plt.subplots(2,1)
ax1.plot(epoch_counter, history.history['loss'], label = 'Train Loss')
ax1.plot(epoch_counter, history.history['val_loss'], label = 'Validation Loss', linestyle = 'dashed')
ax1.set_ylabel('Loss')
ax1.legend()
ax2.plot(epoch_counter, history.history['acc'], label = 'Train accuracy')
ax2.plot(epoch_counter, history.history['val_acc'], label = 'Validation Accuracy', linestyle = 'dashed')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy')
ax2.legend()
plt.show()

In [None]:
# Score the final model on the held-out test split.
test_metrics = evaluate(model_for_evaluation, input_texts_test, target_texts_test, input_word_index, target_word_index, inverted_target_word_index, input_pad_len, target_pad_len)
wer, smooth_bleu4, smooth_bleu3, smooth_bleu2, smooth_bleu1 = test_metrics

# Values noted in the original notebook from an earlier run:
# WER 76, BLEU4 11.7, BLEU3 17.1, BLEU2 25.2, BLEU1 36.
metric_labels = ('Word Error Rate: ', 'BLEU4(smooth): ', 'BLEU3(smooth): ', 'BLEU2(smooth): ', 'BLEU1(smooth): ')
print('Results on test data:')
for metric_label, metric_value in zip(metric_labels, (wer, smooth_bleu4, smooth_bleu3, smooth_bleu2, smooth_bleu1)):
    print(metric_label, metric_value)