# Libraries

In [None]:
import os
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, ConfusionMatrixDisplay

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Masking, InputLayer, Conv3D, MaxPooling3D, Flatten, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint

# Paths

In [None]:
# Directory holding the audio feature arrays
features_audio_input_path = "/kaggle/input/truthlie-audio-dwt-features/audio_20"

# Directory holding the video feature CSVs, plus one path per holdout split
features_video_input_path = "/kaggle/input/truth-lie-features"
train_features_path = f"{features_video_input_path}/train_features.csv"
val_features_path = f"{features_video_input_path}/val_features.csv"
test_features_path = f"{features_video_input_path}/test_features.csv"

# File in which the model weights are saved
model_path = 'binary_model.weights.h5'

# Audio

## Metrics - Audio

In [None]:
# Metric names, in the row order used by the result tables
_audio_metric_names = [
    "Accuracy",
    "Precision (0)", "Precision (1)",
    "Recall (0)", "Recall (1)",
    "F1 (0)", "F1 (1)",
    "AUC",
]

# Holdout metrics: one (initially empty) list of values per metric
metrics_holdout = {name: [] for name in _audio_metric_names}

# Cross-validation metrics: a "Mean ..." and a "Std ..." list for every metric
metrics_cross = {
    f"{stat} {name}": []
    for name in _audio_metric_names
    for stat in ("Mean", "Std")
}

## Features - Audio

In [None]:
def _load_audio_array(name):
    """Load `<name>.npy` from the audio feature directory."""
    return np.load(f"{features_audio_input_path}/{name}.npy")


# Holdout: features and labels of the train / validation / test splits
train_features_audio_holdout = _load_audio_array("train_features")
val_features_audio_holdout = _load_audio_array("val_features")
test_features_audio_holdout = _load_audio_array("test_features")
train_labels_audio_holdout = _load_audio_array("train_labels")
val_labels_audio_holdout = _load_audio_array("val_labels")
test_labels_audio_holdout = _load_audio_array("test_labels")

# Show the (features, labels) shapes of every split
print(train_features_audio_holdout.shape, train_labels_audio_holdout.shape)
print(val_features_audio_holdout.shape, val_labels_audio_holdout.shape)
print(test_features_audio_holdout.shape, test_labels_audio_holdout.shape)

print("\n")

# Cross-validation: one (features, labels) pair per fold, folds 0..3
data_audio_cross = []
for i in range(4):
    fold_features = _load_audio_array(f"fold_{i}_features")
    fold_labels = _load_audio_array(f"fold_{i}_labels")
    print(fold_features.shape, fold_labels.shape)
    data_audio_cross.append((fold_features, fold_labels))

## Model - Audio

In [None]:
def plot_confusion_matrix(conf_matrix):
    """
    Displays a confusion matrix as an annotated heatmap.

    Parameters:
    - conf_matrix: precomputed confusion matrix of integer counts; its first
      row/column is labelled 'Lie' and its second 'Truth'
    """
    class_names = ['Lie', 'Truth']

    plt.figure(figsize=(6, 5))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )

    # Columns are the predictions, rows the ground truth
    plt.xlabel("Predict")
    plt.ylabel("Real")
    plt.title("Confusion Matrix")
    plt.show()

def lstm_model(input_shape, hidden_size, learning_rate):
    """
    Builds and compiles the binary LSTM classifier.

    Parameters:
    - input_shape: shape of one input sample, passed to the InputLayer
    - hidden_size: number of units of the LSTM layer
    - learning_rate: learning rate of the Adam optimizer

    Returns:
    - the compiled Sequential model (binary cross-entropy loss, accuracy metric)
    """
    model = Sequential()
    model.add(InputLayer(shape=input_shape))
    # Masking on the value 0.0 -- presumably so zero-padded timesteps are skipped (TODO confirm)
    model.add(Masking(mask_value=0.0))
    # Only the last LSTM output is kept and fed to the sigmoid unit
    model.add(LSTM(hidden_size, return_sequences=False, use_cudnn=False))
    model.add(Dense(1, activation='sigmoid'))

    adam = Adam(learning_rate=learning_rate)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

    return model

def train_and_evaluate_lstm_sequences(
    train_features, train_labels,
    val_features, val_labels,
    test_features, test_labels,
    hidden_size=128,
    num_epochs=20,
    batch_size=32,
    learning_rate=0.001,
    save_res=True,
    metrics_dict=None
):
    """
    Trains an LSTM model on sequential data and evaluates it on the test set.

    Parameters:
    - train_features: numpy array of training features (num_samples, time_steps, num_features)
    - train_labels: numpy array of training labels
    - val_features: numpy array of validation features
    - val_labels: numpy array of validation labels
    - test_features: numpy array of test features
    - test_labels: numpy array of test labels
    - hidden_size: size of the LSTM hidden state
    - num_epochs: number of training epochs
    - batch_size: batch size
    - learning_rate: learning rate
    - save_res: when True, training progress is shown, the metrics are stored in
      metrics_dict (if one is given) and the test confusion matrix is plotted
    - metrics_dict: dictionary of lists with the keys "Accuracy", "Precision (0)",
      "Precision (1)", "Recall (0)", "Recall (1)", "F1 (0)", "F1 (1)" and "AUC";
      one value is appended to each list. May be None, in which case nothing is stored

    Returns:
    - tuple (accuracy, precision_0, precision_1, recall_0, recall_1, f1_0, f1_1, auc)
      computed on the test set
    """
    # Define the LSTM model; the input shape is (time_steps, num_features)
    model = lstm_model(input_shape=(train_features.shape[1], train_features.shape[2]),
                       hidden_size=hidden_size, learning_rate=learning_rate)

    # Train the model (no callbacks: the validation data only feeds the per-epoch report)
    model.fit(
        train_features, train_labels,
        validation_data=(val_features, val_labels),
        epochs=num_epochs,
        batch_size=batch_size,
        verbose=save_res
    )

    # Predict on the test set; flatten so predictions have the same 1-D layout as the labels
    test_predictions_proba = model.predict(test_features).ravel()
    test_predictions = (test_predictions_proba > 0.5).astype("int32")

    # Compute classification metrics (average=None gives one value per class)
    accuracy = accuracy_score(test_labels, test_predictions)
    precision = precision_score(test_labels, test_predictions, zero_division=0, average=None)
    recall = recall_score(test_labels, test_predictions, zero_division=0, average=None)
    f1 = f1_score(test_labels, test_predictions, zero_division=0, average=None)
    auc = roc_auc_score(test_labels, test_predictions_proba)

    if save_res:
        # Save metrics to the dictionary, when the caller supplied one
        if metrics_dict is not None:
            metrics_dict["Accuracy"].append(accuracy)
            metrics_dict["Precision (0)"].append(precision[0])
            metrics_dict["Precision (1)"].append(precision[1])
            metrics_dict["Recall (0)"].append(recall[0])
            metrics_dict["Recall (1)"].append(recall[1])
            metrics_dict["F1 (0)"].append(f1[0])
            metrics_dict["F1 (1)"].append(f1[1])
            metrics_dict["AUC"].append(auc)

        # Confusion Matrix
        conf_matrix = confusion_matrix(test_labels, test_predictions)
        plot_confusion_matrix(conf_matrix)

    return accuracy, precision[0], precision[1], recall[0], recall[1], f1[0], f1[1], auc

## Holdout - Audio

In [None]:
# Hyperparameters of the audio holdout experiment
hidden_size, num_epochs = 128, 20
batch_size, learning_rate = 32, 0.001

# Train on the holdout split; the test metrics are appended to metrics_holdout
train_and_evaluate_lstm_sequences(
    train_features_audio_holdout, train_labels_audio_holdout,
    val_features_audio_holdout, val_labels_audio_holdout,
    test_features_audio_holdout, test_labels_audio_holdout,
    hidden_size=hidden_size,
    num_epochs=num_epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
    metrics_dict=metrics_holdout,
)

# Results table: one row per metric, one column for this configuration
df = pd.DataFrame.from_dict(
    metrics_holdout,
    orient='index',
    columns=["NOISE FILTER + DWT + MFCC + LSTM"],
)
print(df)

# Start again from empty lists so a later run does not append to these values
metrics_holdout = {name: [] for name in metrics_holdout}

## Cross-Validation - Audio

In [None]:
def cross_validation_lstm_sequences(metrics_dict, features_by_fold, num_epochs=50, hidden_size=128, batch_size=32, learning_rate=0.001):
    """
    Runs k-fold cross-validation of the LSTM model, k being len(features_by_fold).

    For every fold a new model is trained on the other folds and scored on the
    held-out one. The held-out fold is passed to the training routine both as
    validation and as test data.

    Parameters:
    - metrics_dict: dictionary of lists with a "Mean <metric>" and a "Std <metric>"
      key for each of Accuracy, Precision (0), Precision (1), Recall (0),
      Recall (1), F1 (0), F1 (1) and AUC; one value is appended to each list
    - features_by_fold: sequence of (features, labels) pairs, one per fold
    - num_epochs: number of training epochs per fold
    - hidden_size: size of the LSTM hidden state
    - batch_size: batch size
    - learning_rate: learning rate

    Raises:
    - ValueError: if fewer than two folds are given
    """
    # Same order as the tuple returned by train_and_evaluate_lstm_sequences
    metric_names = (
        "Accuracy",
        "Precision (0)", "Precision (1)",
        "Recall (0)", "Recall (1)",
        "F1 (0)", "F1 (1)",
        "AUC",
    )

    num_folds = len(features_by_fold)
    if num_folds < 2:
        raise ValueError("cross-validation needs at least two folds")

    fold_scores = {name: [] for name in metric_names}

    for fold_idx, (test_features, test_labels) in enumerate(features_by_fold):
        print(f"---------- Fold {fold_idx + 1} ----------")

        # Combine data from the other folds for training
        training_folds = [fold for i, fold in enumerate(features_by_fold) if i != fold_idx]
        train_features = np.vstack([features for features, _ in training_folds])
        train_labels = np.hstack([labels for _, labels in training_folds])

        # Train and evaluate the LSTM model (save_res=False: silent, no plot, nothing stored)
        results = train_and_evaluate_lstm_sequences(
            train_features=train_features,
            train_labels=train_labels,
            val_features=test_features,
            val_labels=test_labels,
            test_features=test_features,
            test_labels=test_labels,
            hidden_size=hidden_size,
            num_epochs=num_epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            save_res=False
        )

        # Save metrics for each fold
        for name, value in zip(metric_names, results):
            fold_scores[name].append(value)

        print(f"Accuracy: {results[0]:.2f}")

    # Mean and standard deviation of every metric across all folds
    for name in metric_names:
        metrics_dict[f"Mean {name}"].append(np.mean(fold_scores[name]))
        metrics_dict[f"Std {name}"].append(np.std(fold_scores[name]))

    print(f"\nAvg Accuracy: {np.mean(fold_scores['Accuracy']):.2f}")
    print(f"Std Dev Accuracy: {np.std(fold_scores['Accuracy']):.2f}")

In [None]:
# Cross-validate the audio model; mean/std of every metric go into metrics_cross
cross_validation_lstm_sequences(
    metrics_cross,
    data_audio_cross,
    num_epochs=20,
    hidden_size=128,
    batch_size=32,
    learning_rate=0.001,
)

# Results table: one row per statistic, one column for this configuration
df = pd.DataFrame.from_dict(
    metrics_cross,
    orient='index',
    columns=["NOISE FILTER + DWT + MFCC + LSTM"],
)
print("\n")
print(df)

# Start again from empty lists so a later run does not append to these values
metrics_cross = {name: [] for name in metrics_cross}

# Video

## Metrics - Video

In [None]:
# Metric names, in the row order used by the result tables
_video_metric_names = (
    "Accuracy",
    "Precision (0)", "Precision (1)",
    "Recall (0)", "Recall (1)",
    "F1 (0)", "F1 (1)",
    "AUC",
)


def _new_holdout_metrics():
    """Return a fresh holdout dictionary: one empty list per metric."""
    return {name: [] for name in _video_metric_names}


def _new_cross_metrics():
    """Return a fresh cross-validation dictionary: a "Mean" and a "Std" list per metric."""
    return {
        f"{stat} {name}": []
        for name in _video_metric_names
        for stat in ("Mean", "Std")
    }


# Dictionaries for Holdout metrics: frame level, then one per video-level aggregation rule
frame_metrics_holdout = _new_holdout_metrics()
video_metrics_holdout_mean = _new_holdout_metrics()
video_metrics_holdout_majority = _new_holdout_metrics()
video_metrics_holdout_threshold = _new_holdout_metrics()

# Dictionaries for Cross-Validation metrics, same layout as above
frame_metrics_cross = _new_cross_metrics()
video_metrics_cross_mean = _new_cross_metrics()
video_metrics_cross_majority = _new_cross_metrics()
video_metrics_cross_threshold = _new_cross_metrics()

def calculate_mean(preds):
    """Aggregate the frame predictions of one video into their average."""
    video_score = np.mean(preds)
    return video_score

def calculate_majority(preds):
    """
    Aggregate the frame predictions of one video by majority vote.

    Returns 1 when strictly more than half of the predictions are above 0.5,
    otherwise 0 (a tie therefore gives 0).
    """
    positive_votes = np.sum(preds > 0.5)
    half = len(preds) / 2
    return int(positive_votes > half)

def aggregate_by_threshold(preds):
    """
    Aggregate the frame predictions of one video with a 25% rule.

    Returns 0 as soon as at least a quarter of the predictions are below 0.5,
    otherwise 1.
    """
    below_half = (preds < 0.5).sum()
    if below_half >= 0.25 * len(preds):
        return 0
    return 1

## Features - Video

In [None]:
def upload_dataset(features_path):
    """
    Read a feature CSV and drop every row that contains a NaN.

    :param features_path: Path of the CSV file to read
    :return: DataFrame holding only the complete rows (original index labels kept)
    """
    return pd.read_csv(features_path).dropna()

# Holdout splits, with incomplete rows already removed by upload_dataset
train_dataset = upload_dataset(train_features_path)
val_dataset = upload_dataset(val_features_path)
test_dataset = upload_dataset(test_features_path)

# Cross-validation folds 0..3, read from fold_<i>_features.csv
cross_datasets = [
    upload_dataset(f"{features_video_input_path}/fold_{i}_features.csv")
    for i in range(4)
]

# Distinct values of the 'input' column (one per video) in each split
train_list = train_dataset['input'].unique().tolist()
val_list = val_dataset['input'].unique().tolist()
test_list = test_dataset['input'].unique().tolist()

# Same, for every fold
cross_lists = []
for fold_dataset in cross_datasets:
    cross_lists.append(fold_dataset['input'].unique().tolist())

In [None]:
# Define columns of action units and emotions (selected by position in the CSV)
aus_cols_aux = train_dataset.columns[144:164].tolist()
aus_cols_emo = train_dataset.columns[164:171].tolist()
cols_groups = [aus_cols_aux, aus_cols_emo]
cols_groups_string = "AUX+EMO"


def _most_correlated(columns, how_many):
    """Return the `how_many` columns with the largest absolute correlation to 'label'."""
    label_corr = train_dataset[columns + ['label']].corr()['label'].drop('label')
    ranked = label_corr.abs().sort_values(ascending=False)
    return ranked.head(how_many).index.tolist()


# All groups must have the same size: every group is cut down to the size of
# the smallest one, keeping its columns most correlated with the label
# (each group ends up ordered by decreasing absolute correlation)
min_features = min(map(len, cols_groups))
cols_groups = [_most_correlated(group, min_features) for group in cols_groups]

In [None]:
def mirror_padding(frames, sequence_length):
    """
    Applies mirror padding to a sequence of frames if it's shorter than the required sequence length.

    A sequence that already has `sequence_length` frames or more is returned
    as an unpadded copy.

    :param frames: Numpy array of frames (num_frames, feature_dim)
    :param sequence_length: Desired length of the output sequence
    :return: Numpy array with mirror padding applied
    """
    # Compute the number of padding frames needed; clamp at 0 because np.pad
    # rejects negative widths
    padding_needed = max(sequence_length - len(frames), 0)

    # Split the padding between the beginning and the end; when the amount is
    # odd the extra frame goes to the end
    pad_start = padding_needed // 2
    pad_end = padding_needed - pad_start

    # Apply mirror (reflect) padding along the time axis only
    padded_frames = np.pad(
        frames,
        pad_width=((pad_start, pad_end), (0, 0)),  # Padding along the first dimension (time)
        mode='reflect'
    )

    return padded_frames
    

def gen_sequence(id_df, seq_length, cols_groups, frame_step=1, padding=None):
    """
    Yields fixed-length sliding windows over the rows of a single video.

    :param id_df: DataFrame containing the data of a single video sample
    :param seq_length: Number of rows in every window
    :param cols_groups: List of lists of column names; each sublist is one feature group
    :param frame_step: Offset between the starts of two consecutive windows
    :param padding: How to lengthen a video shorter than seq_length: 'const' adds
                    zero rows, 'mirror' uses reflective padding, anything else
                    leaves it as is (so it yields no window)
    :yield: Tuple of (window of shape (seq_length, num_groups, group_size), label, video_name)
    """
    # One (num_rows, group_size) matrix per column group
    group_arrays = [id_df[columns].values for columns in cols_groups]

    if padding == 'const':
        zero_padded = []
        for arr in group_arrays:
            n_rows, n_cols = arr.shape
            if n_rows < seq_length:
                missing = seq_length - n_rows
                # A third of the zero rows go in front, the rest behind
                n_before = missing // 3
                n_after = missing - n_before
                arr = np.vstack([
                    np.full((n_before, n_cols), 0),
                    arr,
                    np.full((n_after, n_cols), 0),
                ])
            zero_padded.append(arr)
        group_arrays = zero_padded
    elif padding == 'mirror':
        group_arrays = [
            arr if arr.shape[0] >= seq_length else mirror_padding(arr, seq_length)
            for arr in group_arrays
        ]

    # Label and video name are read from the first row
    label = id_df['label'].values[0]
    video_name = id_df['input'].values[0]

    total_rows = group_arrays[0].shape[0]
    for start in range(0, total_rows - seq_length + 1, frame_step):
        stop = start + seq_length
        # Put the groups side by side on a new second axis
        window = np.stack([arr[start:stop, :] for arr in group_arrays], axis=1)
        yield window, label, video_name

In [None]:
sequence_length = 64
frame_step = 1
padding = 'const'


def _split_windows(dataset, video_ids, pad_mode):
    """Run gen_sequence on every video of `dataset` and return one list of
    (sequence, label, video_name) tuples per video, skipping videos that
    produce no window."""
    windows_by_video = []
    for video_id in video_ids:
        video_rows = dataset[dataset['input'] == video_id]
        windows = list(gen_sequence(video_rows, sequence_length, cols_groups,
                                    frame_step, padding=pad_mode))
        if len(windows) > 0:
            windows_by_video.append(windows)
    return windows_by_video


def _field(windows_by_video, position):
    """Collect element `position` of every window tuple, keeping the per-video nesting."""
    return [[window[position] for window in windows] for windows in windows_by_video]


# Generate sequences for train, val and test. Training windows are generated
# without padding, so videos shorter than sequence_length contribute nothing
# there; validation and test use the `padding` mode set above
seq_gen_train = _split_windows(train_dataset, train_list, None)
seq_gen_val = _split_windows(val_dataset, val_list, padding)
seq_gen_test = _split_windows(test_dataset, test_list, padding)

# Flatten the per-video lists into one array per split
seq_array_train = np.concatenate(_field(seq_gen_train, 0)).astype(np.float32)
label_array_train = np.concatenate(_field(seq_gen_train, 1)).astype(np.float32).reshape(-1)
video_array_train = np.concatenate(_field(seq_gen_train, 2))

seq_array_val = np.concatenate(_field(seq_gen_val, 0)).astype(np.float32)
label_array_val = np.concatenate(_field(seq_gen_val, 1)).astype(np.float32).reshape(-1)
video_array_val = np.concatenate(_field(seq_gen_val, 2))

seq_array_test = np.concatenate(_field(seq_gen_test, 0)).astype(np.float32)
label_array_test = np.concatenate(_field(seq_gen_test, 1)).astype(np.float32).reshape(-1)
video_array_test = np.concatenate(_field(seq_gen_test, 2))

# Swap the last two axes and append a trailing axis of size 1
seq_array_train = np.expand_dims(np.transpose(seq_array_train, (0, 1, 3, 2)), axis=-1)
print(seq_array_train.shape, label_array_train.shape)

seq_array_val = np.expand_dims(np.transpose(seq_array_val, (0, 1, 3, 2)), axis=-1)
print(seq_array_val.shape, label_array_val.shape)

seq_array_test = np.expand_dims(np.transpose(seq_array_test, (0, 1, 3, 2)), axis=-1)
print(seq_array_test.shape, label_array_test.shape)

In [None]:
def _fold_arrays(fold_dataset, fold_list, pad_mode):
    """Build the (sequences, labels, video_names) arrays of one fold.

    Every video of the fold goes through gen_sequence with the given padding
    mode; videos that yield no window are skipped.
    """
    windows_by_video = []
    for video_id in fold_list:
        video_rows = fold_dataset[fold_dataset['input'] == video_id]
        windows = list(gen_sequence(video_rows, sequence_length, cols_groups,
                                    frame_step, padding=pad_mode))
        if len(windows) > 0:
            windows_by_video.append(windows)

    # Flatten the per-video lists into arrays
    sequences = np.concatenate(
        [[window[0] for window in windows] for windows in windows_by_video]
    ).astype(np.float32)
    labels = np.concatenate(
        [[window[1] for window in windows] for windows in windows_by_video]
    ).astype(np.float32).reshape(-1)
    video_names = np.concatenate(
        [[window[2] for window in windows] for windows in windows_by_video]
    )

    # Swap the last two axes and append a trailing axis of size 1
    sequences = np.expand_dims(np.transpose(sequences, (0, 1, 3, 2)), axis=-1)
    return sequences, labels, video_names


# Per-fold arrays, built once without padding and once with the `padding` mode
cross_sequences, cross_labels, cross_video_names = [], [], []
cross_sequences_pad, cross_labels_pad, cross_video_names_pad = [], [], []

for i in range(4):
    fold_dataset = cross_datasets[i]
    fold_list = cross_lists[i]

    # Without padding
    fold_sequences, fold_labels, fold_video_names = _fold_arrays(fold_dataset, fold_list, None)
    print(fold_sequences.shape, fold_labels.shape)

    cross_sequences.append(fold_sequences)
    cross_labels.append(fold_labels)
    cross_video_names.append(fold_video_names)

    # With padding
    fold_sequences, fold_labels, fold_video_names = _fold_arrays(fold_dataset, fold_list, padding)
    print(fold_sequences.shape, fold_labels.shape)

    cross_sequences_pad.append(fold_sequences)
    cross_labels_pad.append(fold_labels)
    cross_video_names_pad.append(fold_video_names)

## Model - Video

In [None]:
def create_model_Conv3D(input_shape, learning_rate=1e-4, p=0):
    """
    Builds and compiles the 3D-convolutional binary classifier.

    :param input_shape: Shape of one input sample, passed to the InputLayer
    :param learning_rate: Learning rate of the Adam optimizer
    :param p: When truthy, the model summary is printed
    :return: Compiled Sequential model (binary cross-entropy loss, accuracy metric)
    """
    model = Sequential([
        InputLayer(shape=input_shape),

        # Conv3D Block
        Conv3D(filters=32, kernel_size=(3, 3, 2), activation='relu'),
        BatchNormalization(),
        MaxPooling3D(pool_size=(2, 2, 1)),

        # Flatten and fully connected head
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])

    optimizer = Adam(learning_rate=learning_rate)

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    if p:
        # summary() prints the table itself and returns None; wrapping it in
        # print() would add a stray "None" line to the output
        model.summary()

    return model


def plot_history(history):
    """
    Plots the training curves side by side: accuracy on the left, loss on the right.

    :param history: Object returned by model.fit; its `history` dict must hold
                    'accuracy', 'val_accuracy', 'loss' and 'val_loss'
    """
    plt.figure(figsize=(10, 3))

    # One panel per metric, each with the train and validation curve
    for panel, metric in enumerate(('accuracy', 'loss'), start=1):
        plt.subplot(1, 2, panel)
        plt.plot(history.history[metric])
        plt.plot(history.history['val_' + metric])
        plt.title('model ' + metric)
        plt.ylabel(metric)
        plt.xlabel('epoch')
        plt.legend(['train', 'val'], loc='upper left')

    plt.show()

def plot_confusion_matrix(y_true, y_pred=None):
    """
    Plots a confusion matrix.

    This definition replaces the earlier one-argument helper of the same name,
    so both call forms are accepted:

    - plot_confusion_matrix(y_true, y_pred): the matrix is computed from the labels
    - plot_confusion_matrix(conf_matrix): the single argument is taken as an
      already computed confusion matrix

    :param y_true: Ground-truth labels, or a precomputed confusion matrix when
                   y_pred is omitted
    :param y_pred: Predicted labels (optional)
    """
    if y_pred is None:
        cm = np.asarray(y_true)
    else:
        cm = confusion_matrix(y_true, y_pred)

    _, ax = plt.subplots(figsize=(5, 5))
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)

    cm_display.plot(ax=ax)
    plt.show()

## Holdout - Video

In [None]:
# Training configuration
input_shape_holdout = seq_array_train.shape[1:]
learning_rate = 1e-4
epochs = 20
batch_size = 256

# Keep only the weights of the epoch with the highest validation accuracy
callbacks = [
    ModelCheckpoint(model_path, monitor='val_accuracy', mode='max',
                    save_best_only=True, save_weights_only=True)
]

model_holdout = create_model_Conv3D(input_shape_holdout, learning_rate)

# Fit the network
history_holdout = model_holdout.fit(
    seq_array_train, label_array_train,
    epochs=epochs, batch_size=batch_size,
    validation_data=(seq_array_val, label_array_val),
    callbacks=callbacks,
)

plot_history(history_holdout)

# Evaluate with the checkpointed weights rather than those of the last epoch
model_holdout.load_weights(model_path)


def _score_and_store(store, labels, hard_preds, scores):
    """Compute accuracy, per-class F1 / precision / recall and AUC, append them
    to the lists of `store` and return them.

    `hard_preds` are the binary decisions used for every metric except the
    AUC, which is computed from `scores`.
    """
    accuracy = accuracy_score(labels, hard_preds)
    f1 = f1_score(labels, hard_preds, zero_division=0, average=None)
    precision = precision_score(labels, hard_preds, zero_division=0, average=None)
    recall = recall_score(labels, hard_preds, zero_division=0, average=None)
    auc = roc_auc_score(labels, scores)

    store["Accuracy"].append(accuracy)
    store["Precision (0)"].append(precision[0])
    store["Precision (1)"].append(precision[1])
    store["Recall (0)"].append(recall[0])
    store["Recall (1)"].append(recall[1])
    store["F1 (0)"].append(f1[0])
    store["F1 (1)"].append(f1[1])
    store["AUC"].append(auc)

    return accuracy, f1, precision, recall, auc


def _metrics_table(store):
    """Turn a metrics dictionary into a one-column results table."""
    return pd.DataFrame.from_dict(store, orient='index',
                                  columns=["SL 64 3D CONV " + cols_groups_string])


# Make predictions
pred = model_holdout.predict(seq_array_test)
y_pred = (pred > 0.5).astype(int)
y_true = label_array_test

# Calculate metrics for frames
frame_accuracy, frame_f1, frame_precision, frame_recall, frame_auc = _score_and_store(
    frame_metrics_holdout, y_true, y_pred, pred
)

print("***** FRAME LEVEL RESULTS *****\n")
plot_confusion_matrix(y_true, y_pred)

# Create a dataframe with predictions and video ids
pred_df = pd.DataFrame({
    'video_id': video_array_test,
    'prediction': pred.flatten(),
    'label': label_array_test,
})

per_video = pred_df.groupby('video_id')

# Video-level score under each aggregation rule: average, majority vote, 25% threshold
video_predictions_mean = per_video['prediction'].apply(calculate_mean).values
video_predictions_binary_mean = (video_predictions_mean > 0.5).astype(int)
video_predictions_binary_majority = per_video['prediction'].apply(calculate_majority).values
video_predictions_binary_threshold = per_video['prediction'].apply(aggregate_by_threshold).values

# Label of each video, taken from its first row
video_labels = per_video['label'].first().values

# Calculate metrics for video (mean): the AUC uses the averaged scores
video_accuracy, video_f1, video_precision, video_recall, video_auc = _score_and_store(
    video_metrics_holdout_mean, video_labels,
    video_predictions_binary_mean, video_predictions_mean
)

print("\n***** VIDEO LEVEL RESULTS (MEAN) *****\n")
plot_confusion_matrix(video_labels, video_predictions_binary_mean)

# Calculate metrics for video (majority): the AUC uses the binary decisions
video_accuracy, video_f1, video_precision, video_recall, video_auc = _score_and_store(
    video_metrics_holdout_majority, video_labels,
    video_predictions_binary_majority, video_predictions_binary_majority
)

print("\n***** VIDEO LEVEL RESULTS (MAJORITY) *****\n")
plot_confusion_matrix(video_labels, video_predictions_binary_majority)

# Calculate metrics for video (threshold): the AUC uses the binary decisions
video_accuracy, video_f1, video_precision, video_recall, video_auc = _score_and_store(
    video_metrics_holdout_threshold, video_labels,
    video_predictions_binary_threshold, video_predictions_binary_threshold
)

print("\n***** VIDEO LEVEL RESULTS (THRESHOLD) *****\n")
plot_confusion_matrix(video_labels, video_predictions_binary_threshold)

# Print one results table per level / rule, then start each dictionary again
# from empty lists
df = _metrics_table(frame_metrics_holdout)
print("***** FRAME LEVEL RESULTS *****\n")
print(df)
frame_metrics_holdout = {name: [] for name in frame_metrics_holdout}

df = _metrics_table(video_metrics_holdout_mean)
print("***** VIDEO LEVEL RESULTS (MEAN) *****\n")
print(df)
video_metrics_holdout_mean = {name: [] for name in video_metrics_holdout_mean}

df = _metrics_table(video_metrics_holdout_majority)
print("***** VIDEO LEVEL RESULTS (MAJORITY) *****\n")
print(df)
video_metrics_holdout_majority = {name: [] for name in video_metrics_holdout_majority}

df = _metrics_table(video_metrics_holdout_threshold)
print("***** VIDEO LEVEL RESULTS (THRESHOLD) *****\n")
print(df)
video_metrics_holdout_threshold = {name: [] for name in video_metrics_holdout_threshold}

## Cross-Validation - Video

In [None]:
def _score_fold(y_true, y_pred, y_score):
    """Compute the metrics of one fold for one prediction level.

    Precision, recall and F1 are requested with ``average=None``, so each is an
    array holding one value per class; accuracy and AUC are scalars.

    Args:
        y_true: ground-truth labels.
        y_pred: binary predictions compared against ``y_true``.
        y_score: values handed to ``roc_auc_score`` (continuous scores when
            available, otherwise the binary predictions themselves).

    Returns:
        dict with keys "Accuracy", "F1", "Precision", "Recall", "AUC".
    """
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "F1": f1_score(y_true, y_pred, zero_division=0, average=None),
        "Precision": precision_score(y_true, y_pred, zero_division=0, average=None),
        "Recall": recall_score(y_true, y_pred, zero_division=0, average=None),
        "AUC": roc_auc_score(y_true, y_score),
    }


def _append_fold_summary(metrics_dict, fold_scores):
    """Append the across-fold mean and std of every metric to ``metrics_dict``.

    Args:
        metrics_dict: dict of lists with the keys "Mean <metric>" / "Std <metric>"
            for Accuracy and AUC, and "Mean <metric> (c)" / "Std <metric> (c)"
            for Precision, Recall and F1 with c in {0, 1}.
        fold_scores: list with one ``_score_fold`` result per fold.

    Returns:
        ``(mean_accuracy, std_accuracy)`` so the caller can print them.
    """
    for name in ("Accuracy", "AUC"):
        values = np.asarray([scores[name] for scores in fold_scores])
        metrics_dict[f"Mean {name}"].append(np.mean(values))
        metrics_dict[f"Std {name}"].append(np.std(values))

    for name in ("Precision", "Recall", "F1"):
        # Rows are folds, columns are classes. Selecting the class column makes
        # each statistic run over the folds; indexing the list with [0]/[1]
        # would instead pick fold 0/fold 1 and mix both classes together.
        per_class = np.asarray([scores[name] for scores in fold_scores])
        for class_idx in (0, 1):
            class_values = per_class[:, class_idx]
            metrics_dict[f"Mean {name} ({class_idx})"].append(np.mean(class_values))
            metrics_dict[f"Std {name} ({class_idx})"].append(np.std(class_values))

    return metrics_dict["Mean Accuracy"][-1], metrics_dict["Std Accuracy"][-1]


def cross_validation(frame_metrics_dict, video_metrics_dict_mean,
                     video_metrics_dict_majority, video_metrics_dict_threshold,
                     cross_sequences, cross_labels, cross_video_names,
                     cross_sequences_pad, cross_labels_pad, cross_video_names_pad,
                     num_fold=4, num_epoche=30, batch_size=200, learning_rate=1e-4, 
                     create_model=create_model_Conv3D):
    """Run k-fold cross-validation and record frame- and video-level metrics.

    For every fold a new model is built and trained on the remaining folds, then
    evaluated on the held-out fold. Predictions are scored per sequence
    ("frame" level, threshold 0.5) and per video, where the per-sequence
    predictions of each ``video_id`` are aggregated with three rules: mean,
    majority voting and threshold (``calculate_mean``, ``calculate_majority``,
    ``aggregate_by_threshold``). The mean and standard deviation across folds
    of each metric are appended to the corresponding metrics dictionary.

    Args:
        frame_metrics_dict: dict of lists receiving the frame-level summary
            (keys such as "Mean Accuracy", "Std Precision (0)", "Mean AUC").
        video_metrics_dict_mean: same layout, video level with the mean rule.
        video_metrics_dict_majority: same layout, majority-voting rule.
        video_metrics_dict_threshold: same layout, threshold rule.
        cross_sequences: per-fold training sequences, indexed by fold number;
            the folds other than the held-out one are stacked with ``np.vstack``.
        cross_labels: per-fold training labels, stacked with ``np.hstack``.
        cross_video_names: kept for interface compatibility; not used.
        cross_sequences_pad: per-fold sequences used as validation data.
            NOTE(review): presumably the padded variant of the folds - confirm.
        cross_labels_pad: per-fold labels matching ``cross_sequences_pad``.
        cross_video_names_pad: per-fold video ids, one per validation sequence.
        num_fold: number of folds.
        num_epoche: number of training epochs per fold.
        batch_size: batch size passed to ``model.fit``.
        learning_rate: forwarded to ``create_model``.
        create_model: callable ``create_model(input_shape, learning_rate=...)``
            returning a model exposing ``fit`` and ``predict``.

    Returns:
        None. Results are appended to the metrics dictionaries and printed.
    """
    # One entry per fold for each evaluation level
    frame_fold_scores = []
    video_fold_scores = {"mean": [], "majority": [], "threshold": []}

    # Iteration on the folds of cross-validation
    for fold_idx in range(num_fold):
        print(f"Fold {fold_idx+1}/{num_fold}")

        # Training data: every fold except the held-out one
        train_folds = [i for i in range(num_fold) if i != fold_idx]
        train_sequences = np.vstack([cross_sequences[i] for i in train_folds])
        train_labels = np.hstack([cross_labels[i] for i in train_folds])

        # Validation data: the held-out fold
        val_sequences = cross_sequences_pad[fold_idx]
        val_labels = cross_labels_pad[fold_idx]
        val_video_names = cross_video_names_pad[fold_idx]

        input_shape = train_sequences.shape[1:]

        # Create model
        model = create_model(input_shape, learning_rate=learning_rate)

        # Train model
        history = model.fit(train_sequences, train_labels,
                            validation_data=(val_sequences, val_labels),
                            epochs=num_epoche, batch_size=batch_size, verbose=0)

        plot_history(history)

        # Predictions
        val_predictions = model.predict(val_sequences)
        y_pred = (val_predictions > 0.5).astype(int)
        y_true = val_labels

        plot_confusion_matrix(y_true, y_pred)

        # Create a dataframe with predictions and video ids
        pred_df = pd.DataFrame({'video_id': val_video_names,
                                'prediction': val_predictions.flatten(),
                                'label': val_labels})
        by_video = pred_df.groupby('video_id')

        # Group by video and calculate the average
        video_predictions_mean = by_video['prediction'].apply(calculate_mean).values
        video_predictions_binary_mean = (video_predictions_mean > 0.5).astype(int)

        # Group by video and apply the majority voting rule
        video_predictions_binary_majority = by_video['prediction'].apply(calculate_majority).values

        # Group by video and apply the aggregation rule
        video_predictions_binary_threshold = by_video['prediction'].apply(aggregate_by_threshold).values

        # Group by video and get labels
        video_labels = by_video['label'].first().values

        # Calculate metrics for frames
        frame_scores = _score_fold(y_true, y_pred, val_predictions)
        frame_fold_scores.append(frame_scores)
        print(f"Accuracy fold (frame-based): {frame_scores['Accuracy']}")

        # Calculate metrics for videos: (binary predictions, values used for AUC).
        # Majority and threshold rules only provide binary outputs, so their
        # AUC is computed from those binary predictions.
        video_inputs = {
            "mean": (video_predictions_binary_mean, video_predictions_mean),
            "majority": (video_predictions_binary_majority, video_predictions_binary_majority),
            "threshold": (video_predictions_binary_threshold, video_predictions_binary_threshold),
        }
        for rule, (binary_predictions, auc_scores) in video_inputs.items():
            video_scores = _score_fold(video_labels, binary_predictions, auc_scores)
            video_fold_scores[rule].append(video_scores)
            print(f"Accuracy fold (video-based - {rule}): {video_scores['Accuracy']}")

    # Mean and standard deviation across folds, frame level
    frame_avg_accuracy, frame_std_accuracy = _append_fold_summary(frame_metrics_dict, frame_fold_scores)

    print("\n\nRESULTS on FRAMES:\n")

    print("Mean Accuracy:", frame_avg_accuracy)
    print("Accuracy std:", frame_std_accuracy)

    # Mean and standard deviation across folds, video level (one summary per rule)
    video_targets = (
        ("MEAN", "mean", video_metrics_dict_mean),
        ("MAJORITY", "majority", video_metrics_dict_majority),
        ("THRESHOLD", "threshold", video_metrics_dict_threshold),
    )
    for heading, rule, video_metrics_dict in video_targets:
        video_avg_accuracy, video_std_accuracy = _append_fold_summary(video_metrics_dict, video_fold_scores[rule])

        print(f"\n\nRESULTS on VIDEOS ({heading}):\n")

        print("Mean Accuracy:", video_avg_accuracy)
        print("Accuracy std:", video_std_accuracy)

In [None]:
# Run the 4-fold cross-validation with the 3D-convolution model builder; the
# summaries are appended to the four *_cross metric dictionaries.
cross_validation(
    frame_metrics_cross,
    video_metrics_cross_mean,
    video_metrics_cross_majority,
    video_metrics_cross_threshold,
    cross_sequences,
    cross_labels,
    cross_video_names,
    cross_sequences_pad,
    cross_labels_pad,
    cross_video_names_pad,
    num_fold=4,
    num_epoche=epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
    create_model=create_model_Conv3D,
)

# One column header, built once, labels all four cross-validation tables.
cross_columns = ["SL 64 3D CONV " + cols_groups_string]

# Frame-level table; afterwards the accumulator is rebound to a dict with the
# same keys and fresh empty lists.
df = pd.DataFrame.from_dict(frame_metrics_cross, orient='index', columns=cross_columns)
print("***** FRAME LEVEL RESULTS *****\n")
print(df)
frame_metrics_cross = dict((metric, []) for metric in frame_metrics_cross)

# Video-level table, mean aggregation rule.
df = pd.DataFrame.from_dict(video_metrics_cross_mean, orient='index', columns=cross_columns)
print("***** VIDEO LEVEL RESULTS (MEAN) *****\n")
print(df)
video_metrics_cross_mean = dict((metric, []) for metric in video_metrics_cross_mean)

# Video-level table, majority-voting rule.
df = pd.DataFrame.from_dict(video_metrics_cross_majority, orient='index', columns=cross_columns)
print("***** VIDEO LEVEL RESULTS (MAJORITY) *****\n")
print(df)
video_metrics_cross_majority = dict((metric, []) for metric in video_metrics_cross_majority)

# Video-level table, threshold aggregation rule.
df = pd.DataFrame.from_dict(video_metrics_cross_threshold, orient='index', columns=cross_columns)
print("***** VIDEO LEVEL RESULTS (THRESHOLD) *****\n")
print(df)
video_metrics_cross_threshold = dict((metric, []) for metric in video_metrics_cross_threshold)