In [1]:
import os
import librosa
import numpy as np
from numpy.random import seed
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from tqdm import tqdm
import warnings
from keras.utils.all_utils import plot_model, to_categorical
from keras.utils.np_utils  import normalize
import random
import keras.backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.initializers import Constant


warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning)

seed(555)
tf.random.set_seed(555)

data_dir = '/home/mabikana/Documents/PhD/SER Code/RECOLA Processing/'
input_data = data_dir + 'melspectrograms.pickle'
arousal_labels = data_dir + 'arousal.pickle'
valence_labels = data_dir + 'valence.pickle'
input_text = data_dir + "text.pickle"
tokenizer = data_dir + "tokenizer_file.pickle"

sr = 44100
dur = 4

In [2]:
def split_data(data_x, text_data_x, arousal_y, valence_y):

    
    train_x, test_x, train_text_x, test_text_x, \
        train_arousal_y, test_arousal_y, train_valence_y, test_valence_y = train_test_split(data_x, text_data_x, 
                                                                                            arousal_y, valence_y,
                                                                                            train_size=0.7, shuffle=True)

    print("Training audio data shape : ", train_x.shape)
    print("Training text data shape : ", train_text_x.shape)

    print("Testing audio data shape : ", test_x.shape)
    print("Testing text data shape : ", test_text_x.shape)

    print("Arousal Training data shape : ", train_arousal_y.shape)
    print("Arousal Testing data shape : ", test_arousal_y.shape)
    
    print("Valence Training data shape : ", train_valence_y.shape)
    print("Valence Testing data shape : ", test_valence_y.shape)
    
    return train_x, train_text_x, test_x, test_text_x, train_arousal_y, test_arousal_y, train_valence_y, test_valence_y

In [3]:
data_x = np.load(input_data, allow_pickle=True)
data_text_x = np.load(input_text, allow_pickle=True)
arousal_y = np.load(arousal_labels, allow_pickle=True)
valence_y = np.load(valence_labels, allow_pickle=True)
tokenizer = np.load(tokenizer, allow_pickle=True)
word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index) + 1
embedding_size = 300
max_text_len = 20

train_x, train_text_x, test_x, test_text_x, train_arousal_y,\
    test_arousal_y, train_valence_y, test_valence_y = split_data(data_x, data_text_x, arousal_y, valence_y)

Training audio data shape :  (2398, 128, 345, 1)
Training text data shape :  (2398, 1, 20)
Testing audio data shape :  (1029, 128, 345, 1)
Testing text data shape :  (1029, 1, 20)
Arousal Training data shape :  (2398, 1)
Arousal Testing data shape :  (1029, 1)
Valence Training data shape :  (2398, 1)
Valence Testing data shape :  (1029, 1)


In [4]:
def split_list(a_list):
    half = len(a_list)//2
    return a_list[:half], a_list[half:]

In [5]:
# Concordance correlation coefficient (CCC)-based loss function - using non-inductive statistics
def ccc(gold, pred):
    gold       = K.squeeze(gold, axis=-1)
    pred       = K.squeeze(pred, axis=-1)
    gold_mean  = K.mean(gold, axis=-1, keepdims=True)
    pred_mean  = K.mean(pred, axis=-1, keepdims=True)
    covariance = (gold-gold_mean)*(pred-pred_mean)
    gold_var   = K.mean(K.square(gold-gold_mean), axis=-1,  keepdims=True)
    pred_var   = K.mean(K.square(pred-pred_mean), axis=-1, keepdims=True)
    ccc        = K.constant(2.) * covariance / (gold_var + pred_var + K.square(gold_mean - pred_mean) + K.epsilon())
    return ccc


def ccc_loss(gold, pred):  
    # input (num_batches, seq_len, 1)
    ccc_loss   = K.constant(1.) - ccc(gold, pred)

    return ccc_loss

def get_embedding_matrix(vocab_size, word_index, embedding_dim):
    path_to_glove_file = '/home/mabikana/Documents/PhD/SER Code/EmergencyOutcomePrediction/cc.fr.300.vec'

    embeddings_index = {}
    with open(path_to_glove_file, encoding="utf8") as f:
        for line in f:
            splited_line = line.split(" ")
            word = splited_line[0]
            coefs = ' '.join(splited_line[1:])
            coefs = np.fromstring(coefs, "f", sep=" ")
            embeddings_index[word] = coefs

    print("Found %s word vectors." % len(embeddings_index))

    num_tokens = vocab_size 
    hits = 0
    misses = 0

    # Prepare embedding matrix
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            # This includes the representation for "padding" and "OOV"
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            misses += 1
    #Converted 6791 words (869 misses)
    print("Converted %d words (%d misses)" % (hits, misses))
    return embedding_matrix

In [6]:
def create_model(input_shape):

    audio_in = tf.keras.layers.Input(shape=input_shape, dtype='float32')

    #LFLB1
    model = tf.keras.layers.Conv2D(filters=64,
                            kernel_size=3,
                            strides=1,
                            padding='same',
                            input_shape=input_shape)(audio_in)
    model = tf.keras.layers.BatchNormalization()(model)
    model = tf.keras.layers.Activation('elu')(model)
    model = tf.keras.layers.MaxPooling2D(pool_size=2, strides=2)(model)


    #LFLB2
    model = tf.keras.layers.Conv2D(filters=64,
                            kernel_size=3,
                            strides=1,
                            padding='same')(model)
    model = tf.keras.layers.BatchNormalization()(model)
    model = tf.keras.layers.Activation('elu')(model)
    model = tf.keras.layers.MaxPooling2D(pool_size=4, strides=4)(model)
    
    #LFLB3
    model = tf.keras.layers.Conv2D(filters=128,
                            kernel_size=3,
                            strides=1,
                            padding='same')(model)
    model = tf.keras.layers.BatchNormalization()(model)
    model = tf.keras.layers.Activation('elu')(model)
    model = tf.keras.layers.MaxPooling2D(pool_size=4, strides=4)(model)

     #LFLB4
    model = tf.keras.layers.Conv2D(filters=128,
                            kernel_size=3,
                            strides=1,
                            padding='same')(model)
    model = tf.keras.layers.BatchNormalization()(model)
    model = tf.keras.layers.Activation('elu')(model)
    model = tf.keras.layers.MaxPooling2D(pool_size=4, strides=4)(model)
    model = tf.keras.layers.Reshape((-1, 128))(model)

    #LSTM
    model = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=False))(model) 

    
    # Output
    target_names = ('v', 'a')
    model_combined = [tf.keras.layers.Dense(1, name=name)(model) for name in target_names]
    
    model = tf.keras.Model(audio_in, model_combined) 
    opt = tf.keras.optimizers.Adam(learning_rate=0.0006, decay=1e-6)
    
    model.compile(loss=ccc_loss, optimizer=opt, metrics=[ccc])

#     model.summary()
    
    plot_model(
        model, to_file=data_dir + 'model.png', show_shapes=False, 
        show_layer_names=True, rankdir='TB', expand_nested=False, dpi=96
    )

    return model

In [7]:
def create_multimodal_model(input_shape, max_text_len, embedding_size, vocab_size, word_index):

    audio_in = tf.keras.layers.Input(shape=input_shape, dtype='float32')

    #LFLB1
    model = tf.keras.layers.Conv2D(filters=64,
                            kernel_size=3,
                            strides=1,
                            padding='same',
                            input_shape=input_shape)(audio_in)
    model = tf.keras.layers.BatchNormalization()(model)
    model = tf.keras.layers.Activation('elu')(model)
    model = tf.keras.layers.MaxPooling2D(pool_size=2, strides=2)(model)


    #LFLB2
    model = tf.keras.layers.Conv2D(filters=64,
                            kernel_size=3,
                            strides=1,
                            padding='same')(model)
    model = tf.keras.layers.BatchNormalization()(model)
    model = tf.keras.layers.Activation('elu')(model)
    model = tf.keras.layers.MaxPooling2D(pool_size=4, strides=4)(model)
    
    #LFLB3
    model = tf.keras.layers.Conv2D(filters=128,
                            kernel_size=3,
                            strides=1,
                            padding='same')(model)
    model = tf.keras.layers.BatchNormalization()(model)
    model = tf.keras.layers.Activation('elu')(model)
    model = tf.keras.layers.MaxPooling2D(pool_size=4, strides=4)(model)

     #LFLB4
    model = tf.keras.layers.Conv2D(filters=128,
                            kernel_size=3,
                            strides=1,
                            padding='same')(model)
    model = tf.keras.layers.BatchNormalization()(model)
    model = tf.keras.layers.Activation('elu')(model)
    model = tf.keras.layers.MaxPooling2D(pool_size=4, strides=4)(model)
    model = tf.keras.layers.Reshape((-1, 128))(model)

    #LSTM
    model = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=False))(model) 
    

    inputs_text = tf.keras.layers.Input(shape=(max_text_len, ), dtype='float32')

    net = tf.keras.layers.Embedding(vocab_size,
                    embedding_size,
                    input_length=max_text_len,
                    embeddings_initializer=Constant(get_embedding_matrix(vocab_size, word_index, embedding_size)),
                    trainable = False
                  )(inputs_text)
    net = tf.keras.layers.LSTM(256, return_sequences=True)(net)
    net = tf.keras.layers.LSTM(256, return_sequences=True)(net)
    net = tf.keras.layers.LSTM(256, return_sequences=False)(net)
    net = tf.keras.layers.Dropout(0.3)(net)

    final_model = tf.keras.layers.concatenate([model, net])

    
    # Output
    target_names = ('v', 'a')
    model_combined = [tf.keras.layers.Dense(1, name=name)(final_model) for name in target_names]
    
    model = tf.keras.Model([audio_in, inputs_text], model_combined) 
    opt = tf.keras.optimizers.Adam(learning_rate=0.0006, decay=1e-6)
    
    model.compile(loss=ccc_loss, optimizer=opt, metrics=[ccc])

#     model.summary()
    
    plot_model(
        model, to_file=data_dir + 'model.png', show_shapes=False, 
        show_layer_names=True, rankdir='TB', expand_nested=False, dpi=96
    )

    return model

In [8]:
def create_cross_attention_model(input_shape, rnn_speech,rnn_text,hidden_lstm_speech,
                                    hidden_con,hidden_lstm_text,hidden_dim, max_text_len,
                                    vocab_size, embedding_size, word_index):
   
    ##### Speech BiLSTM
    speech_input = tf.keras.layers.Input(shape=input_shape, dtype='float32', name='input_speech')
    
    #LFLB1
    model = tf.keras.layers.Conv2D(filters=64,
                            kernel_size=3,
                            strides=1,
                            padding='same',
                            input_shape=input_shape)(speech_input)
    model = tf.keras.layers.BatchNormalization()(model)
    model = tf.keras.layers.Activation('elu')(model)
    model = tf.keras.layers.MaxPooling2D(pool_size=2, strides=2)(model)


    #LFLB2
    model = tf.keras.layers.Conv2D(filters=64,
                            kernel_size=3,
                            strides=1,
                            padding='same')(model)
    model = tf.keras.layers.BatchNormalization()(model)
    model = tf.keras.layers.Activation('elu')(model)
    model = tf.keras.layers.MaxPooling2D(pool_size=4, strides=4)(model)

    #LFLB3
    model = tf.keras.layers.Conv2D(filters=128,
                            kernel_size=3,
                            strides=1,
                            padding='same')(model)
    model = tf.keras.layers.BatchNormalization()(model)
    model = tf.keras.layers.Activation('elu')(model)
    model = tf.keras.layers.MaxPooling2D(pool_size=4, strides=4)(model)

     #LFLB4
    model = tf.keras.layers.Conv2D(filters=128,
                            kernel_size=3,
                            strides=1,
                            padding='same')(model)
    model = tf.keras.layers.BatchNormalization()(model)
    model = tf.keras.layers.Activation('elu')(model)
    model = tf.keras.layers.MaxPooling2D(pool_size=4, strides=4)(model)
   
     #LFLB5
    model = tf.keras.layers.Conv2D(filters=64,
                            kernel_size=3,
                            strides=1,
                            padding='same')(model)
    model = tf.keras.layers.BatchNormalization()(model)
    model = tf.keras.layers.Activation('elu')(model)
    model = tf.keras.layers.MaxPooling2D(pool_size=1, strides=1)(model)

    model = tf.keras.layers.Reshape((-1, 128))(model)

    
    speech_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hidden_lstm_speech,return_sequences=True))(model)
    speech_att   = tf.keras.layers.Dense(hidden_con, activation='tanh')(speech_layer)
    
    speech_att = tf.keras.layers.Reshape((1, hidden_con))(speech_att)
    speech_att_source= np.zeros((len(rnn_speech),hidden_con))
    speech_att_input = tf.keras.layers.Input(shape=(hidden_con,),dtype='float32')
    speech_att_vec   = tf.keras.layers.Dense(hidden_con, activation='relu')(speech_att_input)
    speech_att_vec   = tf.keras.layers.Lambda(lambda x: K.batch_dot(*x, axes=(1,2)))([speech_att_vec,speech_att])
    speech_att_vec   = tf.keras.layers.Dense(len(rnn_speech[0]),activation='softmax')(speech_att_vec)
    speech_att_vec   = tf.keras.layers.Reshape((len(rnn_speech[0]),1))(speech_att_vec)
    speech_output= tf.keras.layers.multiply([speech_att_vec,speech_layer])
    speech_output= tf.keras.layers.Lambda(lambda x: K.sum(x, axis=1))(speech_output)
    speech_output= tf.keras.layers.Dense(hidden_dim, activation='relu')(speech_output)

    ##### Text BiLSTM
    ##### The attention source is simply the final hidden layer, not the weight summed sequence
    ##### This kind of implementation was done empirically, upon the performance
    

    text_input = tf.keras.layers.Input(shape=(max_text_len, ), dtype='float32', name='input_text')
    net = tf.keras.layers.Embedding(
                    input_dim=vocab_size,
                    output_dim = embedding_size,
                    input_length=max_text_len,
                    embeddings_initializer=Constant(get_embedding_matrix(vocab_size, word_index, embedding_size)),
                    trainable = False
                  )(text_input)

    net = tf.keras.layers.LSTM(256, return_sequences=True)(net)
    net = tf.keras.layers.LSTM(256, return_sequences=True)(net)
    net = tf.keras.layers.LSTM(256, return_sequences=True)(net)
    net = tf.keras.layers.Dropout(0.3)(net)

    text_fw, text_fw_h, text_fw_c = tf.keras.layers.LSTM(hidden_lstm_text,  return_sequences=True, return_state=True,)(net)
    text_bw, text_bw_h, text_bw_c = tf.keras.layers.LSTM(hidden_lstm_text,  return_sequences=True,go_backwards=True, return_state=True,)(net)
    text_layer = tf.keras.layers.concatenate([text_fw,text_bw])
    text_final = tf.keras.layers.concatenate([text_fw_h,text_bw_h])
    text_att = tf.keras.layers.Dense(hidden_con, activation='tanh')(text_layer)	
    text_att_source = np.zeros((len(rnn_text),hidden_con, 1))              # Dummy code
    text_att_input  = tf.keras.layers.Input(shape=(hidden_con, 1), dtype='float32')      # Dummy code	

    ##### Exchange phase
    speech_att_hop = tf.keras.layers.Dense(hidden_con, activation='relu')(text_final)	
    speech_att_hop = tf.keras.layers.Lambda(lambda x: K.batch_dot(*x, axes=(1,2)))([speech_att_hop,speech_att])
    speech_att_hop = tf.keras.layers.Dense(len(rnn_speech[0]),activation='softmax')(speech_att_hop)
    speech_att_hop = tf.keras.layers.Reshape((len(rnn_speech[0]),1))(speech_att_hop)	
    speech_output_hop = tf.keras.layers.multiply([speech_att_hop,speech_layer])  # Text-influenced attention for audio
    speech_output_hop = tf.keras.layers.Lambda(lambda x: K.sum(x, axis=1))(speech_output_hop)
    speech_output_hop = tf.keras.layers.Dense(hidden_dim, activation='relu')(speech_output_hop)
    text_att_hop = tf.keras.layers.Dense(hidden_con, activation='relu')(speech_output)	
    text_att_hop = tf.keras.layers.Lambda(lambda x: K.batch_dot(*x, axes=(1,2)))([text_att_hop,text_att])
    text_att_hop = tf.keras.layers.Dense(len(rnn_text[0]),activation='softmax')(text_att_hop)
    text_att_hop = tf.keras.layers.Reshape((len(rnn_text[0]),1))(text_att_hop)	
    text_output_hop = tf.keras.layers.multiply([text_att_hop,text_layer])        # Audio-influenced attention for text
    text_output_hop = tf.keras.layers.Lambda(lambda x: K.sum(x, axis=1))(text_output_hop)
    text_output_hop = tf.keras.layers.Dense(hidden_dim, activation='relu')(text_output_hop)	
    
    ##### Total output
    output    = tf.keras.layers.concatenate([speech_output_hop, text_output_hop])
    output    = tf.keras.layers.Dense(hidden_dim, activation='relu')(output)
    output    = tf.keras.layers.Dropout(0.3)(output)
    output    = tf.keras.layers.Dense(hidden_dim, activation='relu')(output)
    output    = tf.keras.layers.Dropout(0.3)(output)
    target_names = ('v', 'a')
    main_output = [tf.keras.layers.Dense(1, name=name)(output) for name in target_names]
    
    final_model = tf.keras.Sequential()
    final_model = tf.keras.Model([speech_input, speech_att_input, text_input, text_att_input], main_output)

    opt = tf.keras.optimizers.Adam(learning_rate=0.0004, decay=1e-6)

    final_model.compile(loss=ccc_loss,optimizer=opt,metrics=[ccc])

#     final_model.summary()
    plot_model(
        final_model, to_file=data_dir + 'cross_attention_model.png', show_shapes=False, 
        show_layer_names=True, rankdir='TB', expand_nested=False, dpi=96
    )

    return final_model, speech_att_source, text_att_source


In [9]:
def train_multimodal_model(train_x, train_text_x, validation_x, validation_text_x, train_arousal_y, validation_arousal_y, train_valence_y, validation_valence_y, model):
    
    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                mode='min',
                verbose=0,
                patience=10,
                restore_best_weights=True)

    mc = tf.keras.callbacks.ModelCheckpoint(data_dir + 'model.h5',
                            monitor='val_loss',
                            mode='min',
                            verbose=1,
                            save_best_only=True)

    model.fit([train_x, train_text_x], [train_valence_y, train_arousal_y],   
              validation_data=([validation_x, validation_text_x], [validation_valence_y, validation_arousal_y]),
              epochs=100,
              batch_size=16,
              verbose=2,
              callbacks=[es, mc],
              shuffle=True)
      
    return tf.keras.models.load_model(data_dir + 'model.h5', custom_objects={"ccc_loss": ccc_loss, "ccc": ccc})
    


In [10]:
def train_model(train_x, validation_x, train_arousal_y, validation_arousal_y, train_valence_y, validation_valence_y, model):
    
    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                mode='min',
                verbose=0,
                patience=10,
                restore_best_weights=True)

    mc = tf.keras.callbacks.ModelCheckpoint(data_dir + 'model.h5',
                            monitor='val_loss',
                            mode='min',
                            verbose=1,
                            save_best_only=True)

    model.fit([train_x], [train_valence_y, train_arousal_y],   
              validation_data=([validation_x], [validation_valence_y, validation_arousal_y]),
              epochs=100,
              batch_size=16,
              verbose=2,
              callbacks=[es, mc],
              shuffle=True)
      
    return tf.keras.models.load_model(data_dir + 'model.h5', custom_objects={"ccc_loss": ccc_loss, "ccc": ccc})
    


In [11]:
def train_cross_attention_model(train_x, speech_att_source, train_text_x, text_att_source, 
                                validation_x, speech_att_source_val, validation_text_x, text_att_source_val,
                                train_arousal_y, validation_arousal_y, train_valence_y, validation_valence_y, model):
    

    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                mode='min',
                verbose=0,
                patience=10,
                restore_best_weights=True)

    mc = tf.keras.callbacks.ModelCheckpoint(data_dir + 'cross_attention_model_weights.ckpt',
                            monitor='val_loss',
                            mode='min',
                            verbose=1,
                            save_weights_only=True,
                            save_best_only=True)

    original_model = model
    
    model.fit([train_x, speech_att_source, train_text_x, text_att_source], [train_valence_y, train_arousal_y],   
              validation_data=([validation_x, speech_att_source_val, validation_text_x, text_att_source_val], 
                               [validation_valence_y, validation_arousal_y]),
              epochs=100,
              batch_size=16,
              verbose=2,
              callbacks=[es, mc],
              shuffle=True)
      
    return original_model.load_weights(data_dir + 'cross_attention_model_weights.ckpt')
    

In [12]:
def evaluate_multimodal_model(model, test_x, test_train_x, test_valence_y, test_arousal_y):
    return model.evaluate([test_x, test_train_x], 
                         [test_valence_y, test_arousal_y],
                         verbose=0)
    
def evaluate_model(model, test_x, test_valence_y, test_arousal_y):
    return model.evaluate([test_x], 
                         [test_valence_y, test_arousal_y],
                         verbose=0)

def evaluate_cross_attention_model(model, test_x, speech_att_source_test, test_train_x, text_att_source_test, test_valence_y, test_arousal_y):
    return model.evaluate([test_x, speech_att_source_test, test_train_x, text_att_source_test], 
                         [test_valence_y, test_arousal_y],
                         verbose=0)

In [None]:
validation_x, test_x = split_list(test_x)
validation_text_x, test_text_x = split_list(test_text_x)
validation_arousal_y, test_arousal_y = split_list(test_arousal_y)
validation_valence_y, test_valence_y = split_list(test_valence_y)

# new_train_text = train_text_x
# new_validation_text = validation_text_x
# new_test_text = test_text_x
new_train_text = train_text_x.reshape(len(train_text_x), max_text_len)
new_validation_text = validation_text_x.reshape(len(validation_text_x), max_text_len)
new_test_text = test_text_x.reshape(len(test_text_x), max_text_len)

model_architecture = create_model((128, 345, 1))

print("Training on ", len(train_x), " samples")
print("Validating on ", len(validation_x), " samples")
print("Testing on ", len(validation_x), " samples")

model = train_model(train_x, validation_x, train_arousal_y, validation_arousal_y, train_valence_y, validation_valence_y, model_architecture)


Training on  2398  samples
Validating on  514  samples
Testing on  514  samples
Epoch 1/100

Epoch 1: val_loss improved from inf to 1.99564, saving model to /home/mabikana/Documents/PhD/SER Code/RECOLA Processing/model.h5
150/150 - 14s - loss: 1.0598 - v_loss: 0.7311 - a_loss: 0.3287 - v_ccc: 0.2689 - a_ccc: 0.6713 - val_loss: 1.9956 - val_v_loss: 0.9925 - val_a_loss: 1.0031 - val_v_ccc: 0.0075 - val_a_ccc: -3.0880e-03 - 14s/epoch - 92ms/step
Epoch 2/100

Epoch 2: val_loss did not improve from 1.99564
150/150 - 6s - loss: 0.9473 - v_loss: 0.6640 - a_loss: 0.2833 - v_ccc: 0.3360 - a_ccc: 0.7167 - val_loss: 2.0119 - val_v_loss: 0.9959 - val_a_loss: 1.0161 - val_v_ccc: 0.0041 - val_a_ccc: -1.6054e-02 - 6s/epoch - 40ms/step
Epoch 3/100

Epoch 3: val_loss improved from 1.99564 to 1.86875, saving model to /home/mabikana/Documents/PhD/SER Code/RECOLA Processing/model.h5
150/150 - 6s - loss: 0.8393 - v_loss: 0.5687 - a_loss: 0.2707 - v_ccc: 0.4313 - a_ccc: 0.7293 - val_loss: 1.8688 - val_v_los

In [None]:
scores = evaluate_model(model, test_x, test_valence_y, test_arousal_y)
print("loss = ", scores[0], ", valence_loss = ", scores[1], ", arousal_loss = ", scores[2], 
     ", \n valence_ccc = ", scores[3], ", arousal_ccc = ", scores[4])

In [None]:
print("Input shape ", train_x.shape)
model_cross_attention_architecture, speech_att_source, text_att_source = create_cross_attention_model((128, 345, 1), train_x, new_train_text, 256,128,256,128, max_text_len, vocab_size, embedding_size, word_index)
model_cross_attention_architecture_val, speech_att_source_val, text_att_source_val = create_cross_attention_model((128, 345, 1), validation_x, new_validation_text, 256,128,256,128, max_text_len, vocab_size, embedding_size, word_index)

print("Training on ", len(train_x), " samples")
print("Validating on ", len(validation_x), " samples")
print("Testing on ", len(validation_x), " samples")


model_cross_attention = train_cross_attention_model(train_x, speech_att_source, new_train_text, text_att_source, 
                                validation_x, speech_att_source_val, new_validation_text, text_att_source_val,
                                train_arousal_y, validation_arousal_y, train_valence_y, validation_valence_y, 
                                model_cross_attention_architecture)


In [None]:
model_cross_attention_architecture_test, speech_att_source_test, text_att_source_test = create_cross_attention_model((128, 345, 1), test_x, new_test_text, 256,128,256,128, max_text_len, vocab_size, embedding_size, word_index)
model_cross_attention_architecture_test.load_weights(data_dir + 'cross_attention_model_weights.ckpt')

scores_cross_attention = evaluate_cross_attention_model(model_cross_attention_architecture_test, test_x, speech_att_source_test, 
                                                        new_test_text, text_att_source_test, 
                                                        test_valence_y, test_arousal_y)


In [None]:
print("loss = ", scores_cross_attention[0], ", valence_loss = ", scores_cross_attention[1], ", arousal_loss = ", scores_cross_attention[2], 
     ", \n valence_ccc = ", scores_cross_attention[3], ", arousal_ccc = ", scores_cross_attention[4])