In [28]:
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt

from tensorflow import keras
from keras.models import Model
from keras import layers
from keras.layers import *
from keras.utils import pad_sequences
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
# from nltk.translate.bleu_score import sentence_bleu

from tensorflow.keras import backend 
from tensorflow.keras import utils

from keras_nlp.layers import SinePositionEncoding

import pydot

from funkcije import *

In [29]:
# Training callbacks and the embedding width used by the model definition below.
# NOTE: the checkpoint callback is no longer used.
checkpoint = ModelCheckpoint(
    'model_weights_{epoch}.h5',
    monitor = 'val_loss',
    mode = 'min',
    save_best_only = False,
    save_weights_only = True,
)
# Stops a fit after 10 epochs without val_loss improvement and restores the best weights.
early_stopping = EarlyStopping(
    monitor = 'val_loss',
    mode = 'min',
    patience = 10,
    restore_best_weights = True,
    verbose = 1,
)

embedding_size = 300

In [30]:
class Transformer_Translation_Model(Model):
    """Encoder-decoder transformer for sequence-to-sequence translation.

    The constructor builds three functional Keras models:

    * ``self.encoder``     - token ids (length ``input_pad_len``) -> encoded sequence
    * ``self.decoder``     - [target token ids, encoded sequence] -> per-position
                             softmax over ``num_target_words + 1`` classes
    * ``self.transformer`` - the encoder and decoder chained end to end

    The number of attention heads and the number of stacked transformer layers
    are read from the module-level names ``num_heads`` and
    ``num_transformer_layers``, which must be defined before instantiation.

    Args:
        num_input_words: size of the source vocabulary (the embedding gets one extra row).
        num_target_words: size of the target vocabulary (embedding and output layer
            get one extra row/class).
        input_embedding_matrix: pretrained weights for the frozen encoder embedding.
        target_embedding_matrix: stored on the instance; the decoder embedding is
            trained from scratch and does not load it.
        latent_dim: width of the hidden feed-forward layer in every transformer block.
        dropout_rate: dropout applied right before the output softmax.
        custom_dropout_rate: rate passed to ``CustomDropout`` on the encoder token ids.
    """

    def __init__(self, num_input_words, num_target_words, input_embedding_matrix, target_embedding_matrix, latent_dim = 256, dropout_rate = 0.5, custom_dropout_rate = 0.05):
        super(Transformer_Translation_Model, self).__init__()

        self.latent_dim = latent_dim
        self.dropout_rate = dropout_rate
        self.custom_dropout_rate = custom_dropout_rate
        self.num_input_words = num_input_words
        self.num_target_words = num_target_words
        self.input_embedding_matrix = input_embedding_matrix
        self.target_embedding_matrix = target_embedding_matrix
        self.embedding_size = 300
        self.input_pad_len = 80
        self.target_pad_len = 60

        # ---------------------------------------------------------- encoder
        encoder_inputs = Input(shape=(self.input_pad_len,))
        x = CustomDropout(1.0, custom_dropout_rate)(encoder_inputs)
        # Use the instance attribute (not the notebook-level global) so the width
        # always agrees with the residual connections and the decoder input below.
        encoder_embedding = Embedding(input_dim = num_input_words + 1, output_dim = self.embedding_size, mask_zero = True, weights = [input_embedding_matrix], trainable = False)(x)
        encoder_pos_encoding = SinePositionEncoding()(encoder_embedding)
        x = encoder_embedding + encoder_pos_encoding
        for _ in range(num_transformer_layers):
            # Self-attention sub-block with residual connection.
            xt = MultiHeadAttention(num_heads=num_heads, key_dim=self.embedding_size)(x, x, x)
            x = LayerNormalization()(x + xt)
            # Position-wise feed-forward sub-block with residual connection.
            xt = Dense(self.latent_dim, activation="relu")(x)
            xt = Dense(self.embedding_size)(xt)
            x = LayerNormalization()(x + xt)
        encoder_outputs = x
        self.encoder = keras.Model(encoder_inputs, encoder_outputs)

        # ---------------------------------------------------------- decoder
        decoder_inputs = Input(shape=(self.target_pad_len,))
        encoded_seq_inputs = Input(shape=(self.input_pad_len, self.embedding_size))
        # The decoder embeds TARGET tokens, so it is sized by the target vocabulary
        # (previously it was sized by the source vocabulary).
        decoder_embedding = Embedding(input_dim = num_target_words + 1, output_dim = self.embedding_size, mask_zero = True, trainable = True)(decoder_inputs)
        decoder_pos_encoding = SinePositionEncoding()(decoder_embedding)
        x = decoder_embedding + decoder_pos_encoding
        for _ in range(num_transformer_layers):
            # Masked self-attention: position i may only attend to positions <= i.
            causal_mask = self.get_causal_attention_mask(x)
            xt = MultiHeadAttention(num_heads=num_heads, key_dim=self.embedding_size)(x, x, x, attention_mask=causal_mask)
            x = LayerNormalization()(x + xt)
            # Cross-attention over the encoder output.
            xt = MultiHeadAttention(num_heads=num_heads, key_dim=self.embedding_size)(x, encoded_seq_inputs, encoded_seq_inputs)
            x = LayerNormalization()(x + xt)
            # Position-wise feed-forward sub-block.
            xt = Dense(latent_dim, activation="relu")(x)
            xt = Dense(self.embedding_size)(xt)
            x = LayerNormalization()(x + xt)
        x = layers.Dropout(self.dropout_rate)(x)
        decoder_outputs = layers.Dense(num_target_words + 1, activation="softmax")(x)
        self.decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

        # ------------------------------------------------------- full model
        decoder_outputs = self.decoder([decoder_inputs, encoder_outputs])
        self.transformer = keras.Model(
            [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
        )
        # summary() prints by itself and returns None, so it must not be wrapped in print().
        self.transformer.summary(expand_nested=True)
        utils.plot_model(self.transformer, show_shapes=True, expand_nested=True)

    def call(self, x):
        """Forward pass; ``x`` is ``[encoder_token_ids, decoder_token_ids]``."""
        return self.transformer([x[0], x[1]])

    def get_causal_attention_mask(self, inputs):
        """Build a lower-triangular attention mask for ``inputs``.

        Returns an int32 tensor of shape (batch, seq_len, seq_len), where batch and
        seq_len are the first two dimensions of ``inputs``; entry [b, i, j] is 1
        when i >= j and 0 otherwise.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        # Repeat the single (1, L, L) mask once per batch element.
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

    def translate(self, encoder_input, decoder_input, batch_size = 128):
        """Run the full model on already-encoded inputs and return its output tensor.

        Inputs with more than ``batch_size`` rows are processed in consecutive
        chunks and the chunk outputs are concatenated along axis 0, which bounds the
        size of the attention intermediates. The notebook's saved output shows an
        out-of-memory failure for a single 1523-row forward pass (presumably an
        un-batched evaluation call - TODO confirm that evaluate() goes through this
        method). Pass ``batch_size=None`` to force a single forward pass.
        """
        shape = getattr(encoder_input, "shape", None)
        total = shape[0] if shape is not None and len(shape) > 0 else None
        if batch_size is None or total is None or total <= batch_size:
            return self.transformer([encoder_input, decoder_input])
        chunk_outputs = [
            self.transformer([encoder_input[start:start + batch_size], decoder_input[start:start + batch_size]])
            for start in range(0, total, batch_size)
        ]
        return tf.concat(chunk_outputs, axis=0)

In [31]:
def train_and_evaluate(train_data, val_data, epochs = 200, batch_size = 128, learning_rate = 0.001, latent_dim = 256, dropout_rate = 0.5, embedding_learning_rate = 0.001):
    """Train one transformer on ``train_data`` and evaluate it on ``val_data``.

    Column 1 of each frame is used as the input text and column 0 as the target
    text. Vocabulary and embedding matrices are built from the training split
    only. Training uses the module-level ``early_stopping`` callback, so it runs
    until ``val_loss`` stops improving and the best-epoch weights are restored.

    This lives in its own function (called from ``cv_evaluate``) because running
    cross-validation inline exhausted GPU memory.

    Args:
        train_data, val_data: DataFrames holding (target, input) text columns.
        epochs: upper bound on the number of training epochs.
        batch_size: mini-batch size for ``fit``.
        learning_rate: Adam learning rate for all layers in ``other_layers``.
        latent_dim, dropout_rate: forwarded to ``Transformer_Translation_Model``.
        embedding_learning_rate: separate Adam learning rate intended for the
            embedding layers, since those start from pretrained values.
            NOTE: ``embedding_layers`` is currently empty, so this has no effect
            until the layer split below is adapted to the transformer.

    Returns:
        ``(best_epoch, best_loss, wer, smooth_bleu4, smooth_bleu3, smooth_bleu2,
        smooth_bleu1)`` where ``best_epoch`` is the 1-based epoch with the lowest
        ``val_loss`` and the remaining metrics come from ``evaluate`` on the
        validation split.
    """
    # Release the Keras global state (graph, layer-name counters) left behind by
    # the model of the previous fold / grid point, so repeated calls inside the
    # cross-validation loops do not keep accumulating memory.
    tf.keras.backend.clear_session()

    input_pad_len = 80
    target_pad_len = 60

    # --- training split: vocabulary, embeddings, padded id arrays
    input_texts, target_texts = clean_texts(train_data.iloc[:,1], train_data.iloc[:,0])
    input_word_index, target_word_index, _, _ = analyse_texts(input_texts, target_texts)
    num_input_words = len(input_word_index) - 1
    num_target_words = len(target_word_index) - 1
    inverted_input_word_index = {value: key for key, value in input_word_index.items()}
    inverted_target_word_index = {value: key for key, value in target_word_index.items()}
    input_embedding_matrix, target_embedding_matrix = load_embedding_data_get_matrices(inverted_input_word_index, inverted_target_word_index)
    print('Embeddings loaded.')
    encoder_input_data, decoder_input_data, decoder_output_data = create_model_data(input_texts, target_texts, input_word_index, target_word_index, input_pad_len, target_pad_len)

    # --- validation split, encoded with the training vocabulary
    input_texts_val, target_texts_val = clean_texts(val_data.iloc[:,1], val_data.iloc[:,0])
    encoder_input_data_val, decoder_input_data_val, decoder_output_data_val = create_model_data(input_texts_val, target_texts_val, input_word_index, target_word_index, input_pad_len, target_pad_len)

    model_transformer = Transformer_Translation_Model(num_input_words, num_target_words, input_embedding_matrix, target_embedding_matrix, latent_dim = latent_dim, dropout_rate = dropout_rate)
    other_layers = model_transformer.layers  # must be adapted for the transformer
    embedding_layers = []  # Caution: has to be adapted every time the model changes

    optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers = [(Adam(learning_rate), other_layers), (Adam(embedding_learning_rate), embedding_layers)])
    model_transformer.compile(optimizer, loss = 'sparse_categorical_crossentropy', metrics = ['acc'])
    history = model_transformer.fit([encoder_input_data, decoder_input_data], decoder_output_data, validation_data = ([encoder_input_data_val, decoder_input_data_val], decoder_output_data_val), epochs = epochs, batch_size = batch_size, callbacks = [early_stopping], verbose = 1)

    val_losses = history.history['val_loss']
    best_epoch = np.argmin(val_losses) + 1  # epochs are reported 1-based
    best_loss = np.min(val_losses)

    wer, smooth_bleu4, smooth_bleu3, smooth_bleu2, smooth_bleu1 = evaluate(model_transformer, input_texts_val, target_texts_val, input_word_index, target_word_index, inverted_target_word_index, input_pad_len, target_pad_len)
    return best_epoch, best_loss, wer, smooth_bleu4, smooth_bleu3, smooth_bleu2, smooth_bleu1
 

In [32]:
def cv_evaluate(train_val_data = None, df_folds = None, folds = 5, epochs = 200, batch_size = 128, learning_rate = 0.001, latent_dim = 256, dropout_rate = 0.5, embedding_learning_rate = None):
    """Cross-validate one hyper-parameter setting.

    Trains one model per fold via ``train_and_evaluate`` and collects WER,
    smooth BLEU (1-4), the best ``val_loss`` and the epoch at which it was
    reached. The per-fold values are returned unaggregated so the caller can
    post-process them.

    Args:
        train_val_data: DataFrame to split into ``folds`` shuffled folds. Only
            used when ``df_folds`` is not given.
        df_folds: optional pre-computed list of per-fold arrays; fold ``i`` is
            the validation split of iteration ``i``.
        folds: number of folds to iterate over.
        epochs, batch_size, learning_rate, latent_dim, dropout_rate: forwarded
            to ``train_and_evaluate``.
        embedding_learning_rate: forwarded to ``train_and_evaluate``; defaults
            to ``learning_rate`` when ``None``.

    Returns:
        ``(best_epochs, losses, wers, smooth_bleu4s, smooth_bleu3s,
        smooth_bleu2s, smooth_bleu1s)`` - seven lists with one entry per fold.

    Raises:
        ValueError: if neither ``train_val_data`` nor ``df_folds`` is provided.
    """
    if embedding_learning_rate is None:
        embedding_learning_rate = learning_rate
    if df_folds is None:
        if train_val_data is None:
            raise ValueError('Either train_val_data or df_folds must be provided.')
        # copy=True: to_numpy() may return a view, and shuffling a view in place
        # would silently reorder the caller's DataFrame.
        df_np = train_val_data.to_numpy(copy = True)
        np.random.shuffle(df_np)
        total_size = df_np.shape[0]
        fold_size = total_size/folds
        df_folds = [df_np[int(i*fold_size):int((i+1)*fold_size),] for i in range(folds)]

    # One list per value returned by train_and_evaluate, in the same order:
    # best_epoch, best_loss, wer, smooth_bleu4, smooth_bleu3, smooth_bleu2, smooth_bleu1
    per_metric = [[] for _ in range(7)]
    for i in range(folds):
        # Every fold except the i-th one goes into the training split.
        train_folds_pd = [pd.DataFrame(data = fold) for j, fold in enumerate(df_folds) if j != i]
        train_data = pd.concat(train_folds_pd)
        val_data = pd.DataFrame(df_folds[i])
        print('Current Latent Dim:', latent_dim)
        print('Current Dropout Rate: ', dropout_rate)
        print('Current Fold: {}/{}'.format(i+1, folds))
        print('Current Learning Rate: ', learning_rate)
        print('Current Learning Rate Multiplier: ', embedding_learning_rate/learning_rate)

        fold_result = train_and_evaluate(train_data, val_data, epochs = epochs, batch_size = batch_size, learning_rate = learning_rate, latent_dim = latent_dim, dropout_rate = dropout_rate, embedding_learning_rate = embedding_learning_rate)
        for collected, value in zip(per_metric, fold_result):
            collected.append(value)

    best_epochs, losses, wers, smooth_bleu4s, smooth_bleu3s, smooth_bleu2s, smooth_bleu1s = per_metric
    return best_epochs, losses, wers, smooth_bleu4s, smooth_bleu3s, smooth_bleu2s, smooth_bleu1s


In [33]:
def cv_grid_search(df, dropout_rates, latent_dims, epochs = 200, learning_rate = 0.0002, folds = 5):
    """Cross-validated grid search over ``latent_dims`` x ``dropout_rates``.

    The data is shuffled and split into ``folds`` folds once, and the same folds
    are reused for every grid point. Each metric is stored in a 3-D array of
    shape ``(len(latent_dims), len(dropout_rates), folds)``; the last axis holds
    the per-fold values, so averaging over it gives the cross-validated metric.
    The number of epochs until convergence is stored the same way.

    Args:
        df: DataFrame with the combined train/validation data.
        dropout_rates: dropout rates to try (second axis of every result array).
        latent_dims: latent dimensions to try (first axis of every result array).
        epochs, learning_rate, folds: forwarded to ``cv_evaluate``.

    Returns:
        dict mapping ``'loss'``, ``'epoch'``, ``'wer'``, ``'smooth_bleu4'``,
        ``'smooth_bleu3'``, ``'smooth_bleu2'`` and ``'smooth_bleu1'`` to their
        result arrays.
    """
    # copy=True: to_numpy() may return a view, and shuffling a view in place
    # would silently reorder the caller's DataFrame.
    df_np = df.to_numpy(copy = True)
    np.random.shuffle(df_np)
    total_size = df_np.shape[0]
    fold_size = total_size/folds
    df_folds = [df_np[int(i*fold_size):int((i+1)*fold_size),] for i in range(folds)]

    # Results are packed into a dictionary so callers can pick metrics by name.
    grid_shape = (len(latent_dims), len(dropout_rates), folds)
    metric_names = ('loss', 'epoch', 'wer', 'smooth_bleu4', 'smooth_bleu3', 'smooth_bleu2', 'smooth_bleu1')
    metrics_dict = {name: np.zeros(grid_shape) for name in metric_names}

    for i, latent_dim in enumerate(latent_dims):
        for j, dropout_rate in enumerate(dropout_rates):
            best_epochs, losses, wers, smooth_bleu4s, smooth_bleu3s, smooth_bleu2s, smooth_bleu1s = cv_evaluate(df_folds = df_folds, folds = folds, epochs = epochs, learning_rate = learning_rate, latent_dim = latent_dim, dropout_rate = dropout_rate)
            print(losses)
            print(best_epochs)
            per_fold = {
                'loss': losses,
                'epoch': best_epochs,
                'wer': wers,
                'smooth_bleu4': smooth_bleu4s,
                'smooth_bleu3': smooth_bleu3s,
                'smooth_bleu2': smooth_bleu2s,
                'smooth_bleu1': smooth_bleu1s,
            }
            for name, values in per_fold.items():
                metrics_dict[name][i, j, :] = values
    return metrics_dict

In [34]:
# Metadata columns that are dropped from every split right after loading.
# `orth` is the glossed sentence, `translation` is the original sentence
# (the original note describes it as English - TODO confirm).
metadata_columns = ['name', 'video', 'start', 'end', 'speaker']

df_train = pd.read_csv('data/PHOENIX-2014-T.train.corpus.csv', sep = '|').drop(columns = metadata_columns)
train_size = df_train.shape[0]

df_val = pd.read_csv('data/PHOENIX-2014-T.dev.corpus.csv', sep = '|').drop(columns = metadata_columns)
val_size = df_val.shape[0]

df_test = pd.read_csv('data/PHOENIX-2014-T.test.corpus.csv', sep = '|').drop(columns = metadata_columns)
test_size = df_test.shape[0]

# Combined frames: train + dev for cross-validation, and all three splits together.
df_train_val = pd.concat([df_train, df_val])
df_full = pd.concat([df_train_val, df_test])

In [35]:
# Fixed architecture settings, read as globals by Transformer_Translation_Model.
num_heads = 8
num_transformer_layers = 1

# Cross-validation / optimiser settings.
folds = 5
learning_rate = 0.0002

# Hyper-parameters being optimised: dropout rate and latent dimension.
# A larger latent dimension and dropout rate should also be tried,
# since the optimum so far falls on the largest value.
latent_dims = [256, 512, 1024]
dropout_rates = [0.5, 0.6, 0.7, 0.8, 0.9]

In [36]:
# Run the grid search over the hyper-parameters, using cross-validation to
# evaluate performance. Model selection is based on smooth BLEU4.

metrics = cv_grid_search(df_train_val, dropout_rates, latent_dims, epochs = 200, learning_rate = learning_rate, folds = folds)
# Average over the fold axis to get one score per (latent_dim, dropout_rate) pair.
average_bleu4 = metrics['smooth_bleu4'].mean(axis = -1)

fig, ax = plt.subplots()
heatmap = ax.imshow(average_bleu4)
ax.set_title('Average smooth BLEU4, crossvalidated')
ax.set_xlabel('Dropout Rate')
ax.set_xticks(range(len(dropout_rates)))
ax.set_xticklabels(dropout_rates)
ax.set_ylabel('Latent Dim')
ax.set_yticks(range(len(latent_dims)))
ax.set_yticklabels(latent_dims)
fig.colorbar(heatmap, ax = ax)
plt.show()

Current Latent Dim: 256
Current Dropout Rate:  0.5
Current Fold: 1/5
Current Learning Rate:  0.0002
Current Learning Rate Multiplier:  1.0
Embeddings loaded.
Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 80)]                 0         []                            
                                                                                                  
 custom_dropout_1 (CustomDr  (None, 80)                   0         ['input_4[0][0]']             
 opout)                                                                                           
                                                                                                  
 embedding_2 (Embedding)     (None, 80, 300)              588000    ['custom_dropout_1[0][0]']    
                             

| 8 (SlicingOpLambda)                                                                            |
|                                                                                                |
| tf.__operators__.getitem_  ()                           0         []                           |
| 9 (SlicingOpLambda)                                                                            |
|                                                                                                |
| tf.expand_dims_1 (TFOpLam  (1,)                         0         []                           |
| bda)                                                                                           |
|                                                                                                |
| tf.reshape_1 (TFOpLambda)  (1, 60, 60)                  0         []                           |
|                                                                                                |
| tf.conca

2023-09-14 12:10:06.009442: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-09-14 12:10:06.064503: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8902
2023-09-14 12:10:06.086892: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f4019871a30 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-09-14 12:10:06.086907: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3080 Ti, Compute Capability 8.6
2023-09-14 12:10:06.089859: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-09-14 12:10:06.187361: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of th

Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 43: early stopping


2023-09-14 12:12:19.318423: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.09GiB (rounded to 1169664000)requested by op Einsum
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-09-14 12:12:19.318449: I tensorflow/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2023-09-14 12:12:19.318455: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 48, Chunks in use: 48. 12.0KiB allocated for chunks. 12.0KiB in use in bin. 240B client-requested in use in bin.
2023-09-14 12:12:19.318458: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2023-09-14 12:12:19.318462: I tensorflow/tsl/framework/bfc_allocator.cc:10

ResourceExhaustedError: Exception encountered when calling layer 'multi_head_attention_3' (type MultiHeadAttention).

{{function_node __wrapped__Einsum_N_2_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[1523,8,80,300] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Einsum] name: 

Call arguments received by layer 'multi_head_attention_3' (type MultiHeadAttention):
  • query=tf.Tensor(shape=(1523, 80, 300), dtype=float32)
  • value=tf.Tensor(shape=(1523, 80, 300), dtype=float32)
  • key=tf.Tensor(shape=(1523, 80, 300), dtype=float32)
  • attention_mask=None
  • return_attention_scores=False
  • training=None
  • use_causal_mask=False

rflow/tsl/framework/bfc_allocator.cc:1095] InUse at 7f3eba3b1700 of size 2880000 next 89
2023-09-14 12:12:19.318934: I tensorflow/tsl/framework/bfc_allocator.cc:1095] InUse at 7f3eba670900 of size 2880000 next 91
2023-09-14 12:12:19.318936: I tensorflow/tsl/framework/bfc_allocator.cc:1095] InUse at 7f3eba92fb00 of size 2880000 next 93
2023-09-14 12:12:19.318940: I tensorflow/tsl/framework/bfc_allocator.cc:1095] InUse at 7f3ebabeed00 of size 2880000 next 97
2023-09-14 12:12:19.318942: I tensorflow/tsl/framework/bfc_allocator.cc:1095] InUse at 7f3ebaeadf00 of size 2880000 next 99
2023-09-14 12:12:19.318944: I tensorflow/tsl/framework/bfc_allocator.cc:1095] InUse at 7f3ebb16d100 of size 2880000 next 101
2023-09-14 12:12:19.318946: I tensorflow/tsl/framework/bfc_allocator.cc:1095] Free  at 7f3ebb42c300 of size 2880000 next 103
2023-09-14 12:12:19.318948: I tensorflow/tsl/framework/bfc_allocator.cc:1095] InUse at 7f3ebb6eb500 of size 2880000 next 123
2023-09-14 12:12:19.318949: I tensorflow