In [None]:
#@title Initial config for notebooks { display-mode: "form" }
import os
from getpass import getpass
# Enumerate CUDA devices by PCI bus id and expose only the device id the user
# types in; the prompt documents -1 as the way to select the CPU.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID";
os.environ["CUDA_VISIBLE_DEVICES"] = input('Select the GPU ID to work or -1 to CPU: ')

# Auto detecting Colab or server:
# a working directory of /content is taken to mean Colab (SERVER="0"),
# anything else is treated as a server checkout (SERVER="1").
if os.getcwd() == "/content":
    os.environ["SERVER"] = "0"
else:
    os.environ["SERVER"] = "1"

# SERVER holds the string "0"/"1", so it is parsed with int() before testing.
if int(os.getenv('SERVER')):
    # Server: refresh the existing checkout.
    !git pull
else:
    # Install required libraries for Colab
    !pip -q install -U nltk==3.4.5
    # Import the encoder function to URL's
    import urllib.parse
    # Delete folders in /content/
    # Every entry is removed with `rm -rf`. An entry named "drive" raises
    # instead (presumably to avoid deleting a mounted Google Drive); entries
    # listed before it have already been removed at that point.
    for folder in os.listdir('/content/'):
        if folder == "drive":
            raise ValueError('You have the drive folder mounted, reset the '
                'the machine to fabric state to work again.')
        else:
            os.system("rm -rf /content/"+folder)
    # User credentials
    # NOTE(review): this overwrites the USER environment variable and keeps the
    # URL-quoted password in PASS for the rest of the session; the clone URL
    # below embeds both.
    os.environ["USER"] = input('Github username: ')
    os.environ["PASS"] = urllib.parse.quote(getpass('Password: '))
    # Py archives
    # Clone the project into the current directory using the credentials above.
    !git clone "https://$USER:$PASS@github.com/JefeLitman/SignLanguageTranslation_SLT.git" .
#DatasetsLoaderUtils
# Runs on both Colab and server: download the loader helper from the
# VideoDataGenerator repository and move it into utils/.
!wget -q https://raw.githubusercontent.com/JefeLitman/VideoDataGenerator/master/DatasetsLoaderUtils.py -O DatasetsLoaderUtils.py
!mv DatasetsLoaderUtils.py utils/DatasetsLoaderUtils.py

# Imports

## Libraries

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
from IPython.display import display, clear_output
from ipywidgets import interact, IntSlider
import numpy as np
import random
import math
%matplotlib inline
import matplotlib.pyplot as plt
from collections import namedtuple
import tensorflow as tf
import nltk

from utils.preprocess_data import preprocessing_paths, preprocessing_sentences, table_paths_dataset
from utils.DatasetsLoaderUtils import flow_from_tablePaths
from utils.results import save_predictions, calculate_metrics_results
from metrics.losses import SparseCategoricalCrossentropy_mask
from metrics.accuracy import real_acc
from models import compute_features, encoder, decoder, reduce_features, self_attentions, st_attentions, output

In [None]:
# Enable on-demand GPU memory allocation unless the user selected CPU ('-1').
if os.getenv("CUDA_VISIBLE_DEVICES") != '-1':
    gpus = tf.config.experimental.list_physical_devices('GPU')
    # Iterate instead of indexing gpus[0]: the list is empty when no GPU is
    # visible (indexing would raise IndexError), and every visible GPU gets
    # the same setting when more than one id was entered.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
# Keep device-placement logging switched off.
tf.debugging.set_log_device_placement(False)

## Data

In [None]:
# SERVER is the string "0" (Colab) or "1" (server). bool() of either non-empty
# string is True, so the flag must be parsed with int() -- as the setup and
# upload cells do -- otherwise this Colab-only branch never runs.
if not int(os.getenv('SERVER')):
    # Colab only: fetch the dataset and the word-embedding vectors.
    from utils import download_data
    download_data.boston201()
    download_data.embedding_word_vectors()
    # Mount drive to save models and results
    from google.colab import drive
    drive.mount('/content/drive')

# Model SLT

## Network parameters

In [None]:
# Immutable container for every tunable value used by the notebook.
# The field order below is the tuple layout and matches the original declaration.
Args = namedtuple('Args', [
    'max_len_sentence', 'data', 'pretrained', 'prefetch_batch_buffer',
    'unitsEmbedding', 'vocab_size', 'nIters', 'videos_path', 'rnnUnits',
    'dropout', 'recurrent_dropout', 'inputShape', 'optimizer', 'type_frames',
    'batchSize', 'epochs', 'lr', 'momentum', 'decay', 'wDecay', 'path2save',
    'name',
])

args = Args(
    # --- data locations (Colab alternatives in the trailing comments) ---
    data='../DataSets/boston201/data/',  # '/content/data/boston201/data/'
    videos_path='../DataSets/boston201',  # '/content/data/boston201'
    type_frames='jpg/',
    path2save='../Saved_Models/',  # '/content/drive/My Drive/Models/SLT/<experiment_folder>'
    name='SLT_Model',
    # --- input pipeline ---
    max_len_sentence=15,
    inputShape=(32, 112, 112, 3),
    batchSize=2,
    prefetch_batch_buffer=-1,
    # --- network ---
    pretrained=None,  # 'vgg16'
    rnnUnits=256,
    unitsEmbedding=300,
    vocab_size=150,
    dropout=0.2,
    recurrent_dropout=0.2,
    # --- optimisation ---
    optimizer='adam',
    epochs=20,
    nIters=10.0,
    lr=0.001,
    momentum=0.99,
    decay=0.1,
    wDecay=0.0005,
)

## Preparing data

### Setting the seeds for replicability

In [None]:
# One seed value shared by the three random number generators used in the
# notebook: Python's `random`, NumPy and TensorFlow (seeded in that order).
SEED = 8128
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

### Initial data

In [None]:
# Translation files and sign-path listing files under args.data, one per
# split, train first and test second.
paths_translation = [args.data + 'translations.' + split for split in ('train', 'test')]
paths_videos = [args.data + 'pathsigns.' + split for split in ('train', 'test')]

# Preprocess the sentences (this also returns the vocabulary) and the video paths.
preprocessed_sentences, vocab = preprocessing_sentences(
    paths_translation,
    max_len=args.max_len_sentence,
)
preprocessed_paths = preprocessing_paths(
    paths_videos,
    path2videos=args.videos_path,
    type_=args.type_frames,
)

# Combine paths and sentences into the table consumed by the data loader.
table_paths = table_paths_dataset(preprocessed_paths, preprocessed_sentences)

### Building tf.data.Dataset and data pipeline

In [None]:
#@title Callbacks for methods { display-mode: "form" }
from utils.data_augmentation import frame_sampling

def _sampled_pairs(subset_index):
    """Yield decoder training samples for one subset of ``raw_data``.

    Shared implementation of ``train_gen_sampling`` and ``test_gen_sampling``,
    whose bodies were identical apart from the subset index.

    Args:
        subset_index: first argument forwarded to ``raw_data.data_generator``
            (this notebook uses 1 for training and 2 for testing).

    Yields:
        ``((clip, tokens[:-1]), tokens[1:])`` for every clip returned by
        ``frame_sampling``: the sentence without its last token is the decoder
        input and the sentence without its first token is the target.
    """
    for video, label in raw_data.data_generator(subset_index, args.inputShape[-1]):
        # raw_data.to_class[label] is a ", "-separated string of integer ids.
        tokens = np.r_[[int(tok) for tok in (raw_data.to_class[label]).split(", ")]]
        for clip in frame_sampling(video, args.inputShape[0]):
            yield (clip, tokens[:-1]), tokens[1:]

def train_gen_sampling():
    """Generator over the training subset (subset index 1)."""
    yield from _sampled_pairs(1)

def test_gen_sampling():
    """Generator over the test subset (subset index 2)."""
    yield from _sampled_pairs(2)

def scale_0_255(data, label):
    """Divide the first element of ``data`` by 255 and pass the rest through.

    Args:
        data: indexable pair; ``data[0]`` is divided by 255 and ``data[1]`` is
            returned untouched.
        label: returned unchanged.

    Returns:
        ``((data[0] / 255., data[1]), label)``.
    """
    frames, words = data[0], data[1]
    scaled_frames = frames / 255.
    return (scaled_frames, words), label

def scale_minus1_1(data, label):
    """Shift and scale the first element of ``data`` by 127.5.

    Args:
        data: indexable pair; ``data[0]`` is mapped to
            ``(data[0] - 127.5) / 127.5`` and ``data[1]`` is returned untouched.
        label: returned unchanged.

    Returns:
        ``(((data[0] - 127.5) / 127.5, data[1]), label)``.
    """
    frames, words = data[0], data[1]
    centered_frames = frames - 127.5
    return (centered_frames / 127.5, words), label

In [None]:
# Loader over the table of paths; frames are passed through an identity
# callback and sized with the spatial part of args.inputShape.
raw_data = flow_from_tablePaths(table_paths, lambda x: x, args.inputShape[1:3])

def _make_pipeline(generator, shuffle_buffer):
    """Build the cached, scaled, shuffled, batched and prefetched dataset.

    Args:
        generator: callable yielding ``((video, sentence_in), sentence_out)``.
        shuffle_buffer: buffer size handed to ``Dataset.shuffle``.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    # Decoder input and target both drop one token from the padded sentence.
    target_len = args.max_len_sentence - 1
    dataset = tf.data.Dataset.from_generator(
        generator,
        ((tf.float32, tf.int64), tf.int64),
        ((args.inputShape, target_len), target_len))
    # Cache the raw samples, then scale; -1 is forwarded as the second
    # positional argument of `map`.
    dataset = dataset.cache().map(scale_0_255, -1)
    dataset = dataset.shuffle(shuffle_buffer, reshuffle_each_iteration=True)
    return dataset.batch(args.batchSize).prefetch(args.prefetch_batch_buffer)

train_data = _make_pipeline(train_gen_sampling, 318)
test_data = _make_pipeline(test_gen_sampling, 84)

train_data, test_data

## Network instance

In [None]:
# NOTE: this cell rebinds the names `encoder` and `decoder` -- imported above as
# sub-modules of `models` -- to the Keras models built here (the train/eval
# steps rely on those names). The builder modules are therefore imported under
# cell-local aliases so the cell can be re-run without failing on the rebound
# names.
from models import encoder as encoder_builders, decoder as decoder_builders

# Encoder input
input_video = tf.keras.Input(shape=args.inputShape, name="input_video")

# Compute features and reduce features
x = compute_features.compute_features_v1_0(input_video, weight_decay=tf.keras.regularizers.l2(args.wDecay))
x = reduce_features.reduce_features_v1_2(x)

# Encoder module and self attention
x1, lstm1_enc_h, lstm1_enc_c, lstm2_enc_h, lstm2_enc_c = encoder_builders.encoder_v1_1(x, args.rnnUnits, 
    args.unitsEmbedding, args.dropout, args.recurrent_dropout)
x1 = self_attentions.self_attention_v1_0(x1)

# Encoder model: exposes the reduced features, the self-attention output and
# the four LSTM states that seed the decoder.
encoder = tf.keras.Model(inputs=[input_video], 
    outputs=[x, x1, lstm1_enc_h, lstm1_enc_c, lstm2_enc_h, lstm2_enc_c], 
    name="SLT_encoder")

# Decoder inputs
input_word = tf.keras.Input(shape=[1], name="input_word") # The sentence is fed word by word
input_x = tf.keras.Input(shape=x.shape[1:], name="input_feat_enc") # Reduced features coming from the encoder
input_x1 = tf.keras.Input(shape=x1.shape[1:], name="input_sea_enc") # Self-attention output coming from the encoder
input_lstm1_h = tf.keras.Input(shape=[args.rnnUnits], name="input_lstm1_h")
input_lstm1_c = tf.keras.Input(shape=[args.rnnUnits], name="input_lstm1_c")
input_lstm2_h = tf.keras.Input(shape=[args.rnnUnits], name="input_lstm2_h")
input_lstm2_c = tf.keras.Input(shape=[args.rnnUnits], name="input_lstm2_c")

# Decoder module
x2, lstm1_dec_h, lstm1_dec_c, lstm2_dec_h, lstm2_dec_c = decoder_builders.decoder_v1_0(input_word, input_lstm1_h, 
    input_lstm1_c, input_lstm2_h, input_lstm2_c, args.rnnUnits, args.unitsEmbedding, 
    args.vocab_size, args.dropout, args.recurrent_dropout)

# Spatio Temporal attention
x3 = st_attentions.st_attention_v1_4_1(x2, input_x1, input_x)

# Output of the network
x_final = output.output_v1_0(x2, x3, args.vocab_size)

# Decoder model: returns the word prediction plus the updated LSTM states so
# they can be fed back in on the next step.
decoder = tf.keras.Model(inputs=[input_word, input_lstm1_h, input_lstm1_c, input_lstm2_h, input_lstm2_c, 
    input_x, input_x1], 
    outputs=[x_final, lstm1_dec_h, lstm1_dec_c, lstm2_dec_h, lstm2_dec_c], 
    name="SLT_decoder")

In [None]:
# Render the decoder graph, including tensor shapes, to SLT_decoder.png.
tf.keras.utils.plot_model(decoder, to_file='SLT_decoder.png', show_shapes=True)

## Training

### Optimizer and metrics

In [None]:
# Optimizer
# Build the optimizer named by args.optimizer, using args.lr as learning rate.
if args.optimizer == 'adam':
    opt = tf.keras.optimizers.Adam(
        lr=args.lr, 
        beta_1=0.9, 
        beta_2=0.999, 
        epsilon=1e-08, 
        decay=0.0, 
        clipnorm=1., 
        clipvalue=5)

elif args.optimizer == 'sgd':
    opt = tf.keras.optimizers.SGD(
        lr=args.lr, 
        decay=0, 
        momentum=args.momentum, 
        nesterov=True, 
        clipnorm=1., 
        clipvalue=0.5)

# 'rmsprop' is the spelling the error message below advertises; the original
# branch only matched the misspelling 'rsmprop', which stays accepted so that
# existing configurations keep working.
elif args.optimizer in ('rmsprop', 'rsmprop'):
    opt = tf.keras.optimizers.RMSprop(lr=args.lr) 
                         #clipnorm=1., 
                         #clipvalue=0.5)      
else:
    raise ValueError('You must specify a valid optimizer for model. The only optmizers available are: '
                    '"adam", "sgd" or "rmsprop". The optmizer given was: '+str(args.optimizer))
# Metrics
# `loss` and `acc` are the global names the train step and training loop use.
loss=SparseCategoricalCrossentropy_mask
#acc = tf.keras.metrics.SparseCategoricalAccuracy(name="acc")
acc = real_acc

### Train and eval step

In [None]:
# The encoder and decoder models must already be defined under those names.
# Likewise, opt, loss, acc and acc_mask must already be defined under those names.
# Finally, vocab must also be defined beforehand for the eval step.

def train_step(video, sentence, target):
    """Run one optimisation step over a batch with the encoder/decoder pair.

    The decoder is fed the input sentence one word position at a time and the
    per-position losses are summed; one gradient update is then applied to
    the trainable variables of both models.

    Args:
        video: The video batched to insert in encoder model.
        sentence: The sentence batched to insert in the decoder model word by word.
        target: The objective batched sentence to predict with the model.

    Returns:
        A pair ``(summed_loss, predictions)`` where ``predictions`` is the
        per-position decoder outputs concatenated along axis 1.
    """
    # Put Keras in training phase.
    tf.keras.backend.set_learning_phase(True)
    step_outputs = []
    total_loss = 0
    with tf.GradientTape() as tape:
        # The encoder returns the reduced features, its attention output and
        # the four LSTM states used to initialise the decoder.
        red_feat, enc_output, *states = encoder(video)
        for position in tf.range(0, sentence.shape[1]):
            current_word = tf.expand_dims(sentence[:, position], axis=1)
            # The decoder hands back updated states for the next position.
            step_pred, *states = decoder([current_word, *states, red_feat, enc_output])

            total_loss = total_loss + loss(target[:, position], step_pred)
            step_outputs.append(step_pred)

    # Both models are optimised jointly.
    trainable = encoder.trainable_variables + decoder.trainable_variables
    opt.apply_gradients(zip(tape.gradient(total_loss, trainable), trainable))

    return total_loss, tf.concat(step_outputs, axis=1)

def eval_step(video):
    """Greedily decode one video with the encoder and decoder models.

    Args:
        video: The video batched to insert in encoder model. The decoder is
            fed a [1, 1] word tensor, so a batch of exactly one video is
            assumed.

    Returns:
        int64 tensor of shape [1, n_words] holding the predicted word ids.
        Decoding stops after the '</s>' token (which is kept) or after
        ``args.max_len_sentence - 1`` steps, whichever comes first.
    """
    # Set the model in inference phase
    tf.keras.backend.set_learning_phase(False)
    predictions = []
    red_feat, enc_output, lstm1_h, lstm1_c, lstm2_h, lstm2_c = encoder(video)
    word = vocab.word_index['<s>']
    for _ in range(args.max_len_sentence - 1):
        # Feed the previous word as an int64 [1, 1] tensor, the dtype the
        # training pipeline uses for sentences.
        prediction, lstm1_h, lstm1_c, lstm2_h, lstm2_c = decoder([tf.constant([[word]], dtype=tf.int64), 
            lstm1_h, lstm1_c, lstm2_h, lstm2_c, red_feat, enc_output])

        # argmax over the last axis, as the accuracy inspection cells do; this
        # assumes the vocabulary is the last axis of `prediction` -- TODO confirm.
        # The id is converted to a plain int because eager tensors cannot be
        # used as dictionary keys.
        word = int(tf.squeeze(tf.argmax(prediction, axis=-1)).numpy())
        predictions.append(word)
        if vocab.index_word[word] == '</s>':
            break

    # The collected ids are scalars, so they are assembled into a [1, n_words]
    # tensor directly (tf.concat along axis 1 is not defined for scalars).
    return tf.constant([predictions], dtype=tf.int64)

### Train and save the model

In [None]:
# model.compile(optimizer=opt, loss=SparseCategoricalCrossentropy_mask, metrics=[real_acc,
#     tf.keras.metrics.SparseCategoricalAccuracy(name="acc")])

In [None]:
for epoch in range(args.epochs):
    # Train phase
    step = 1
    # NOTE(review): take(1) restricts every epoch to a single batch -- this
    # looks like a debugging leftover; confirm before a real training run.
    for xy, y_true in train_data.take(1):
        video_batch, sentence_batch = xy[0], xy[1]
        batch_loss, batch_predictions = train_step(video_batch, sentence_batch, y_true)

        # Replace the previous progress report instead of appending to it.
        clear_output(wait=True)
        samples_seen = step*args.batchSize
        print("Step:", samples_seen, "Learning rate:", opt.lr.numpy())
        print("Epoch:", epoch+1, "Train batch:", step)
        print("Train_Loss: ",batch_loss.numpy())
        print("Train_Accuracy: ",acc(y_true, batch_predictions).numpy())
        step = step + 1

In [None]:
# Scratch cell: re-run train_step on the last batch left over from the training
# loop (`xy`, `y_true`) and keep only the predictions.
# NOTE(review): train_step applies gradients, so this performs one extra
# optimizer update as a side effect.
batch_predictions = train_step(xy[0], xy[1], y_true)[1]

In [None]:
# Scratch cell: accuracy of the predictions from the cell above.
# NOTE(review): batch_predictions is already the concatenated tensor returned
# by train_step, so the extra tf.concat here looks redundant -- confirm.
acc(y_true, tf.concat(batch_predictions, axis=1)).numpy()

In [None]:
# Scratch cell: step-by-step masked comparison of predictions and targets,
# kept as separate statements so each intermediate can be inspected.
# Index of the highest score along the last axis of the predictions.
pred_indexes = tf.argmax(batch_predictions, axis=-1)
    
# Mask is 1 where the target is non-zero and 0 where it is zero.
mask = tf.math.logical_not(tf.math.equal(y_true, 0))
mask = tf.cast(mask, dtype=pred_indexes.dtype)

# Zero the predictions wherever the target is 0, so those positions compare
# equal to the target below.
pred_indexes = tf.multiply(pred_indexes, mask)

# Element-wise match between targets and masked predictions.
equals = tf.math.equal(tf.cast(y_true, pred_indexes.dtype), pred_indexes)
# return tf.math.reduce_mean(tf.cast(equals, tf.float32))

In [None]:
# Scratch cell: display the mask built in the previous cell.
mask

In [None]:
# Scratch cell: display the unmasked argmax of the predictions along the last axis.
tf.argmax(batch_predictions, axis=-1)

In [None]:
# Scratch cell: same argmax as the previous cell, but after passing the
# predictions through tf.concat along axis 1 (for comparison with that cell).
tf.argmax(tf.concat(batch_predictions, axis=1), axis=-1)

In [None]:
# Scratch cell: fetch the symbolic outputs of two decoder layers by name --
# "dense_qk_sta" and the "input_feat_enc" input layer -- and display them.
a = decoder.get_layer("dense_qk_sta").output
b = decoder.get_layer("input_feat_enc").output
a, b

In [None]:
# Scratch cell: element-wise product of the two layer outputs fetched above;
# presumably a check that their shapes are compatible -- confirm.
tf.multiply(a, b)

In [None]:
# Make sure the target directory exists before writing: depending on the
# TensorFlow version, saving to HDF5 may not create missing parent
# directories -- verify for the version in use.
os.makedirs(args.path2save, exist_ok=True)
# Save both models as HDF5 files without optimizer state.
encoder.save(os.path.join(args.path2save, "trained_encoder.h5"), include_optimizer=False)
# NOTE(review): "trained_decider.h5" looks like a typo for "trained_decoder.h5";
# the name is kept because code outside this notebook may load it -- confirm
# before renaming.
decoder.save(os.path.join(args.path2save, "trained_decider.h5"), include_optimizer=False)

### Fine-Tuning (Optional to the data)

In [None]:
# The Boston dataset doesn't have a dev division, so there is no fine-tuning.

## Results

In [None]:
# Generators for the results stage. They reuse the names of the training
# generators (the original comment says this is needed for compatibility) but
# yield (clip, sentence, path) triples instead of training pairs.
def _results_generator(subset_index, subset_name):
    """Yield one ``(clip, tokens, path)`` triple per video of a subset.

    Shared implementation of the two generators below, whose bodies were
    identical apart from the subset index and name.

    Args:
        subset_index: first argument forwarded to ``raw_data.data_generator``
            (1 for train, 2 for test in this notebook).
        subset_name: value matched against column 1 of ``table_paths`` to
            select the rows of the subset ("train" or "test").

    Yields:
        The first clip returned by ``frame_sampling``, the full token
        sequence, and column 0 of the i-th row of the subset. Pairing the
        i-th generated video with the i-th row assumes both are in the same
        order -- TODO confirm against the loader.
    """
    subset_gen = raw_data.data_generator(subset_index, args.inputShape[-1])
    rows = table_paths[table_paths[:,1] == subset_name]

    for i, (video, label) in enumerate(subset_gen):
        tokens = np.r_[[int(tok) for tok in (raw_data.to_class[label]).split(", ")]]
        path = rows[i][0]
        # Only the first sampled clip of each video is used for the results.
        first_clip = frame_sampling(video, args.inputShape[0])[0]
        yield first_clip, tokens, path

def train_gen_sampling():
    """Result generator over the training subset."""
    yield from _results_generator(1, "train")
train_data = tf.data.Dataset.from_generator(train_gen_sampling, (tf.float32, tf.int64, tf.string)).batch(1)

def test_gen_sampling():
    """Result generator over the test subset."""
    yield from _results_generator(2, "test")
test_data = tf.data.Dataset.from_generator(test_gen_sampling, (tf.float32, tf.int64, tf.string)).batch(1)

In [None]:
# Run the project helper that produces predictions for both result datasets.
# NOTE(review): `model` is not defined anywhere in the visible notebook (only
# `encoder` and `decoder` are), so this raises NameError on a fresh
# Restart & Run All -- confirm what save_predictions expects here.
results = save_predictions(model, args.path2save, vocab, args, train_data, test_data)

In [None]:
# Download the NLTK 'wordnet' corpus before computing metrics (presumably
# required by a metric inside calculate_metrics_results -- confirm).
nltk.download('wordnet')
# Compute the metrics from the results produced in the previous cell.
calculate_metrics_results(results)

______

In [None]:
#@title Upload your changes { display-mode: "form" }
# On Colab (SERVER == "0") set the git identity from the USER environment
# variable entered in the setup cell.
if not int(os.getenv('SERVER')):
    !git config --global user.email "$USER@github.com"
    !git config --global user.name "$USER"
# Stage everything, ask for a commit message, then commit and push.
!git add -A *
os.environ["COMMIT"] = input("Insert the name for your changes: ")
!git commit -m  "$COMMIT"
!git push