# Libraries

In [None]:
import torch
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random

import librosa
import pywt
import scipy
from scipy.signal.windows import hamming 
 
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import InputLayer, LSTM, Conv3D, MaxPooling3D, Flatten, Dense, Masking, Dropout, Input, Concatenate, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_fscore_support, roc_auc_score, ConfusionMatrixDisplay
from tensorflow.keras.callbacks import ModelCheckpoint

from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

# Path Video

In [None]:
# File where the best video-model weights are checkpointed during training
model_path = 'binary_model.weights.h5'

# Root folder of the video feature CSV files
input_video_path = "/kaggle/input/truth-lie-features"

# One feature CSV per holdout split
train_video_path = f"{input_video_path}/train_features.csv"
val_video_path = f"{input_video_path}/val_features.csv"
test_video_path = f"{input_video_path}/test_features.csv"

# Path Audio

In [None]:
# Root of the holdout audio dataset, with one sub-folder per split
holdout_audio_path = '/kaggle/input/truthlie-clean-split/TruthLie_Holdout_Stratified'
train_audio_path = f'{holdout_audio_path}/train'
val_audio_path = f'{holdout_audio_path}/val'
test_audio_path = f'{holdout_audio_path}/test'

# Root of the cross-validation audio dataset
cross_audio_path = '/kaggle/input/truthlie-clean-crossvalidation/TruthLie_CrossVal_Stratified'

# Metrics

In [None]:
# Dictionaries for Holdout metrics
# Every holdout dictionary tracks the same metrics, each starting as an empty list.
_HOLDOUT_METRIC_NAMES = (
    "Accuracy",
    "Precision (0)", "Precision (1)",
    "Recall (0)", "Recall (1)",
    "F1 (0)", "F1 (1)",
    "AUC",
)


def _new_holdout_metrics():
    """Return a fresh metric-name -> empty-list mapping (no list is shared)."""
    return {metric_name: [] for metric_name in _HOLDOUT_METRIC_NAMES}


metrics_holdout = _new_holdout_metrics()

frame_metrics_holdout = _new_holdout_metrics()
video_metrics_holdout_mean = _new_holdout_metrics()
video_metrics_holdout_majority = _new_holdout_metrics()
video_metrics_holdout_threshold = _new_holdout_metrics()

# Dictionaries for Cross-Validation metrics
# Each metric is stored as a (mean, std) pair of lists aggregated over the folds.
_CROSS_METRIC_NAMES = (
    "Mean Accuracy", "Std Accuracy",
    "Mean Precision (0)", "Std Precision (0)",
    "Mean Precision (1)", "Std Precision (1)",
    "Mean Recall (0)", "Std Recall (0)",
    "Mean Recall (1)", "Std Recall (1)",
    "Mean F1 (0)", "Std F1 (0)",
    "Mean F1 (1)", "Std F1 (1)",
    "Mean AUC", "Std AUC",
)


def _new_cross_metrics():
    """Return a fresh metric-name -> empty-list mapping (no list is shared)."""
    return {metric_name: [] for metric_name in _CROSS_METRIC_NAMES}


metrics_cross = _new_cross_metrics()

frame_metrics_cross = _new_cross_metrics()
video_metrics_cross_mean = _new_cross_metrics()
video_metrics_cross_majority = _new_cross_metrics()
video_metrics_cross_threshold = _new_cross_metrics()

# Function to aggregate frames considering the average
def calculate_mean(preds):
    """Collapse a group of predictions into a single score by averaging them.

    :param preds: Array-like of prediction scores
    :return: Result of ``np.mean`` over ``preds``
    """
    average_score = np.mean(preds)
    return average_score

# Function to aggregate frames considering the majority
def calculate_majority(preds):
    """Aggregate predictions by strict majority vote.

    A prediction votes for class 1 when its score is strictly above 0.5.

    :param preds: Array-like of prediction scores (numpy array, pandas Series,
                  list or tuple)
    :return: 1 if more than half of the predictions vote for class 1, else 0
             (ties and empty input give 0)
    """
    # Convert first so plain Python sequences support the element-wise comparison
    scores = np.asarray(preds)
    positive_votes = np.sum(scores > 0.5)
    return int(positive_votes > len(scores) / 2)

# Function to aggregate frames considering a threshold
def aggregate_by_threshold(preds, zero_fraction=0.25, cutoff=0.5):
    """Aggregate predictions, returning 0 once enough of them vote for class 0.

    A prediction votes for class 0 when its score is strictly below ``cutoff``.

    :param preds: Array-like of prediction scores (numpy array, pandas Series,
                  list or tuple)
    :param zero_fraction: Fraction of the predictions that must vote for class 0
                          for the aggregate to be 0
    :param cutoff: Score below which a prediction counts as a class-0 vote
    :return: 0 if the number of class-0 votes reaches ``zero_fraction * len(preds)``,
             otherwise 1 (empty input gives 0 because 0 votes >= 0 required)
    """
    # Convert first so plain Python sequences support the element-wise comparison
    scores = np.asarray(preds)
    required_zero_votes = zero_fraction * len(scores)
    zero_votes = (scores < cutoff).sum()
    return 0 if zero_votes >= required_zero_votes else 1

# Preprocessing 

## VIDEO

In [None]:
def upload_dataset(features_path):
    """
    Reads a feature CSV file and discards every row that has a missing value.

    :param features_path: Path of the CSV file to load
    :return: DataFrame without the rows that contain NaN (original index labels kept)
    """
    return pd.read_csv(features_path).dropna()

def mirror_padding(frames, sequence_length):
    """
    Applies mirror padding to a sequence of frames if it's shorter than the required sequence length.

    The missing frames are split between the start and the end of the sequence
    (the end receives the extra frame when the amount is odd). A sequence that
    already has at least ``sequence_length`` frames is returned unchanged.

    :param frames: Numpy array of frames (num_frames, feature_dim)
    :param sequence_length: Desired length of the output sequence
    :return: Numpy array with mirror padding applied
    """
    # Compute the number of padding frames needed
    padding_needed = sequence_length - len(frames)

    # Nothing to pad: np.pad would reject a negative pad width
    if padding_needed <= 0:
        return frames

    # Split the padding between the beginning and the end
    pad_start = padding_needed // 2
    pad_end = padding_needed - pad_start

    # Reflect along the time axis only; the feature axis is left untouched
    return np.pad(
        frames,
        pad_width=((pad_start, pad_end), (0, 0)),
        mode='reflect'
    )
    

def gen_sequence(id_df, seq_length, cols_groups, frame_step=1, padding=None):
    """
    Yields fixed-length sliding windows built from the rows of a single sample.

    :param id_df: DataFrame containing the data of a single video sample
                  (must also hold the 'label' and 'input' columns)
    :param seq_length: Number of rows in every yielded window
    :param cols_groups: List of lists, each sublist containing the column names of one group
    :param frame_step: Step between the start rows of consecutive windows
    :param padding: How to extend samples shorter than seq_length: 'const' (zeros),
                    'mirror' (reflection); any other value leaves the sample as is,
                    so a too-short sample yields nothing
    :yield: Tuple of (window of shape (seq_length, num_groups, num_features), label, video_name)
    """
    # One (num_rows, num_features) matrix per column group
    group_matrices = [id_df[columns].values for columns in cols_groups]

    if padding == 'const':
        zero_padded = []
        for matrix in group_matrices:
            missing_rows = seq_length - matrix.shape[0]
            if missing_rows > 0:
                # A third of the missing rows go in front, the remainder at the end
                rows_before = missing_rows // 3
                rows_after = missing_rows - rows_before
                front = np.full((rows_before, matrix.shape[1]), 0)
                back = np.full((rows_after, matrix.shape[1]), 0)
                matrix = np.vstack([front, matrix, back])
            zero_padded.append(matrix)
        group_matrices = zero_padded
    elif padding == 'mirror':
        mirrored = []
        for matrix in group_matrices:
            if matrix.shape[0] < seq_length:
                matrix = mirror_padding(matrix, seq_length)
            mirrored.append(matrix)
        group_matrices = mirrored

    # Label and name are constant within a sample, so the first row is enough
    label = id_df['label'].values[0]
    video_name = id_df['input'].values[0]

    total_rows = group_matrices[0].shape[0]
    for start in range(0, total_rows - seq_length + 1, frame_step):
        stop = start + seq_length
        # Put the groups side by side on a new axis right after the time axis
        window = np.stack([matrix[start:stop, :] for matrix in group_matrices], axis=1)
        yield window, label, video_name

In [None]:
# Load the three holdout splits (rows with NaN are dropped by upload_dataset)
train_dataset, val_dataset, test_dataset = (
    upload_dataset(split_path)
    for split_path in (train_video_path, val_video_path, test_video_path)
)

# Feature columns are picked by position in the CSV
# NOTE(review): presumably action-unit and emotion columns, judging by the names - confirm against the CSV schema
aus_cols_aux = train_dataset.columns[144:164].tolist()
aus_cols_emotions = train_dataset.columns[164:171].tolist()

# Distinct video identifiers of every split
train_list = train_dataset['input'].unique().tolist()
val_list = val_dataset['input'].unique().tolist()
test_list = test_dataset['input'].unique().tolist()

# Shuffle the video order in place (train, then val, then test)
# NOTE(review): no seed is set in this cell, so the order changes between runs unless seeded elsewhere
for split_videos in (train_list, val_list, test_list):
    random.shuffle(split_videos)

# Windowing configuration
sequence_length = 64
frame_step = 1
padding = 'const'
cols_groups = [aus_cols_aux, aus_cols_emotions]

# All groups must have the same number of columns to be stacked: every group keeps
# only the `min_features` columns most correlated (in absolute value) with the label,
# measured on the training split
min_features = min(len(group) for group in cols_groups)


def _most_label_correlated(columns, keep):
    """Return the `keep` columns with the highest absolute correlation with 'label'."""
    label_corr = train_dataset[columns + ['label']].corr()['label'].drop('label')
    ranked = label_corr.abs().sort_values(ascending=False)
    return ranked.head(keep).index.tolist()


cols_groups = [_most_label_correlated(group, min_features) for group in cols_groups]

# Generate sequences for train, val and test (one list of windows per video)
# The training split is generated without padding, so a training video with fewer
# than `sequence_length` rows produces no window and is removed by the filter below
seq_gen_train = [
    list(gen_sequence(train_dataset[train_dataset['input'] == video_id],
                      sequence_length, cols_groups, frame_step,
                      padding=None))
    for video_id in train_list
]

# Validation and test videos are padded with the configured `padding` mode
seq_gen_val = [
    list(gen_sequence(val_dataset[val_dataset['input'] == video_id],
                      sequence_length, cols_groups, frame_step,
                      padding))
    for video_id in val_list
]

seq_gen_test = [
    list(gen_sequence(test_dataset[test_dataset['input'] == video_id],
                      sequence_length, cols_groups, frame_step,
                      padding))
    for video_id in test_list
]

# Remove the videos that produced no window at all
seq_gen_train = [windows for windows in seq_gen_train if windows]
seq_gen_val = [windows for windows in seq_gen_val if windows]
seq_gen_test = [windows for windows in seq_gen_test if windows]

# Split every (window, label, video_name) triple into three parallel nested lists
seq_array_train = [[window for window, _, _ in video] for video in seq_gen_train]
label_array_train = [[label for _, label, _ in video] for video in seq_gen_train]
video_array_train = [[name for _, _, name in video] for video in seq_gen_train]

seq_array_val = [[window for window, _, _ in video] for video in seq_gen_val]
label_array_val = [[label for _, label, _ in video] for video in seq_gen_val]
video_array_val = [[name for _, _, name in video] for video in seq_gen_val]

seq_array_test = [[window for window, _, _ in video] for video in seq_gen_test]
label_array_test = [[label for _, label, _ in video] for video in seq_gen_test]
video_array_test = [[name for _, _, name in video] for video in seq_gen_test]

# Flatten the per-video lists into one array per split (windows of all videos in a row)
seq_array_train = np.concatenate(seq_array_train).astype(np.float32)
train_labels_video_holdout = np.concatenate(label_array_train).astype(np.float32).ravel()
train_video_names_holdout = np.concatenate(video_array_train)

seq_array_val = np.concatenate(seq_array_val).astype(np.float32)
val_labels_video_holdout = np.concatenate(label_array_val).astype(np.float32).ravel()
val_video_names_holdout = np.concatenate(video_array_val)

seq_array_test = np.concatenate(seq_array_test).astype(np.float32)
test_labels_video_holdout = np.concatenate(label_array_test).astype(np.float32).ravel()
test_video_names_holdout = np.concatenate(video_array_test)

# Swap the last two axes (groups <-> features) and append a trailing singleton axis,
# giving (num_windows, sequence_length, num_features, num_groups, 1)
train_features_video_holdout = seq_array_train.transpose(0, 1, 3, 2)[..., np.newaxis]
print(train_features_video_holdout.shape, train_labels_video_holdout.shape)

val_features_video_holdout = seq_array_val.transpose(0, 1, 3, 2)[..., np.newaxis]
print(val_features_video_holdout.shape, val_labels_video_holdout.shape)

test_features_video_holdout = seq_array_test.transpose(0, 1, 3, 2)[..., np.newaxis]
print(test_features_video_holdout.shape, test_labels_video_holdout.shape)

# Distinct video names of every split, in order of first appearance among the windows.
# np.unique reports the position of each name's first occurrence; reading the names
# back at those positions in increasing order restores the appearance order.
_, indices = np.unique(train_video_names_holdout, return_index=True)
new_train_list = train_video_names_holdout[np.sort(indices)].tolist()
_, indices = np.unique(val_video_names_holdout, return_index=True)
new_val_list = val_video_names_holdout[np.sort(indices)].tolist()
_, indices = np.unique(test_video_names_holdout, return_index=True)
new_test_list = test_video_names_holdout[np.sort(indices)].tolist()

In [None]:
# Ordered video names per fold, without and with padding (filled by the fold loop)
new_cross_list, new_cross_list_pad = [], []

# Load the feature CSV of each of the four folds
cross_video_datasets = []
for i in range(4):
    fold_features_path = f"{input_video_path}/fold_{i}_features.csv"
    cross_video_datasets.append(upload_dataset(fold_features_path))

# Distinct video identifiers of every fold, shuffled in place fold by fold
cross_video_lists = []
for fold_dataset in cross_video_datasets:
    cross_video_lists.append(fold_dataset['input'].unique().tolist())
for fold_videos in cross_video_lists:
    random.shuffle(fold_videos)

# Per-fold windows, labels and video names: unpadded first, padded second
data_video_cross, labels_video_cross, names_video_cross = [], [], []
data_video_cross_pad, labels_video_cross_pad, names_video_cross_pad = [], [], []

def _fold_windows_to_arrays(fold_dataset, fold_list, padding_mode):
    """Build the window, label and video-name arrays of one fold.

    :param fold_dataset: DataFrame with the rows of the fold
    :param fold_list: Video identifiers of the fold, in processing order
    :param padding_mode: Padding mode forwarded to gen_sequence (None disables padding)
    :return: Tuple (windows transposed and expanded for the model, labels, video names)
    """
    per_video = [
        list(gen_sequence(fold_dataset[fold_dataset['input'] == video_id],
                          sequence_length, cols_groups, frame_step,
                          padding=padding_mode))
        for video_id in fold_list
    ]
    # Videos that produced no window are skipped
    per_video = [windows for windows in per_video if len(windows) > 0]

    # Split the (window, label, video_name) triples and flatten them across videos
    sequences = np.concatenate(
        [[window for window, _, _ in windows] for windows in per_video]
    ).astype(np.float32)
    labels = np.concatenate(
        [[label for _, label, _ in windows] for windows in per_video]
    ).astype(np.float32).reshape(-1)
    names = np.concatenate([[name for _, _, name in windows] for windows in per_video])

    # Transpose and expand
    sequences = np.expand_dims(np.transpose(sequences, (0, 1, 3, 2)), axis=-1)
    return sequences, labels, names


for i in range(4):
    fold_dataset = cross_video_datasets[i]
    fold_list = cross_video_lists[i]

    # First pass without padding, second pass with the configured padding mode
    for padding_mode, data_out, labels_out, names_out, ordered_names_out in (
        (None, data_video_cross, labels_video_cross, names_video_cross, new_cross_list),
        (padding, data_video_cross_pad, labels_video_cross_pad, names_video_cross_pad, new_cross_list_pad),
    ):
        fold_sequences, fold_labels, fold_video_names = _fold_windows_to_arrays(
            fold_dataset, fold_list, padding_mode
        )

        print(fold_sequences.shape, fold_labels.shape)

        data_out.append(fold_sequences)
        labels_out.append(fold_labels)
        names_out.append(fold_video_names)

        # Distinct video names of the fold, in order of first appearance
        fold_video_names, indices = np.unique(fold_video_names, return_index=True)
        fold_video_names = fold_video_names[np.argsort(indices)].tolist()
        ordered_names_out.append(fold_video_names)
        print(len(ordered_names_out[i]))

## AUDIO

In [None]:
def load_audio_data(audio_dataset_path, videos_list):
    """
    Loads raw audio signals and their corresponding labels from a dataset directory.

    Every ``.xlsx`` metadata file found under ``audio_dataset_path`` is read first
    (columns 'label', 'audio name' and 'video name'); the audio files are looked up
    in ``<audio_dataset_path>/Audio``. Only once all metadata has been indexed are
    the requested videos matched, so each video contributes exactly one entry no
    matter how many metadata files exist.

    :param audio_dataset_path: Path to the directory containing audio data and metadata (.xlsx)
    :param videos_list: List of video file paths to match with audio entries
                        (only the last path component is used as the lookup key)
    :return: Tuple (list of (signal, sample_rate), numpy array of labels), in the
             order of ``videos_list``; both are empty when no metadata file is found
    :raises KeyError: If metadata was found but a requested video has no loaded audio
    """
    audio_dict = {}
    metadata_found = False

    # Index every available audio clip by the name of its video
    for root, _dirs, files in os.walk(audio_dataset_path):
        for file in files:
            if not file.endswith('.xlsx'):
                continue
            metadata_found = True

            # Path to the Excel metadata file
            df = pd.read_excel(os.path.join(root, file))

            label_list = df['label'].tolist()
            audio_names = df['audio name'].tolist()
            video_names = df['video name'].tolist()

            for audio_name, video_name, label in zip(audio_names, video_names, label_list):
                audio_path = os.path.join(audio_dataset_path, 'Audio', audio_name)

                if os.path.exists(audio_path):
                    # Load the raw audio at its native sampling rate, without preprocessing
                    signal, sample_rate = librosa.load(audio_path, sr=None)
                    audio_dict[video_name] = ((signal, sample_rate), label)
                else:
                    print(f"Audio file not found: {audio_path}")

    audios = []
    audio_labels = []

    # Without any metadata file there is nothing to match against
    if not metadata_found:
        return audios, np.array(audio_labels)

    # Match audio to each video in the list, preserving the order of the list
    for video in videos_list:
        video_name = video.split('/')[-1]
        if video_name not in audio_dict:
            raise KeyError(f"No audio loaded for video: {video_name}")
        audio_entry, label = audio_dict[video_name]
        audios.append(audio_entry)
        audio_labels.append(label)

    return audios, np.array(audio_labels)


def calculate_min_length(wav2vec_model):
    """
    Calculates the minimum input length required for a Wav2Vec2 model based on its convolutional layers.

    The length is the number of input samples needed for the convolutional stack to
    emit one output frame. It is obtained by walking the layers from the last one
    back to the first: a layer with kernel ``k`` and stride ``s`` needs
    ``(n - 1) * s + k`` inputs to produce ``n`` outputs.

    :param wav2vec_model: Pretrained Wav2Vec2 model (its ``config.conv_kernel`` and
                          ``config.conv_stride`` are read)
    :return: Minimum number of audio samples required
    """
    kernel_sizes = wav2vec_model.config.conv_kernel
    strides = wav2vec_model.config.conv_stride

    # Start from a single output frame and propagate the requirement backwards;
    # going first-to-last instead would give a wrong value for non-uniform stacks
    min_length = 1
    layers = list(zip(kernel_sizes, strides))
    for kernel, stride in reversed(layers):
        min_length = (min_length - 1) * stride + kernel
    return min_length


def pad_audio(signal, target_length):
    """
    Appends zeros to an audio signal until it reaches a target length.

    :param signal: Input 1D audio signal
    :param target_length: Desired length
    :return: Signal followed by as many zeros as needed; a signal that is already
             at least target_length long gets no padding
    """
    missing_samples = target_length - len(signal)
    if missing_samples < 0:
        missing_samples = 0
    return np.pad(signal, (0, missing_samples), mode="constant")


def preprocess_audio_sequential_fixed_segments(signal, sample_rate, num_segments=20, n_mfcc=13):
    """
    Preprocesses an audio signal into fixed-length segments and extracts DWT and MFCC features.

    The signal is denoised by zeroing the STFT bins whose magnitude does not exceed
    the median magnitude, peak-normalized, cut into ``num_segments`` equal windows,
    and every window is tapered with a Hamming window before feature extraction.

    :param signal: Raw audio signal
    :param sample_rate: Sampling rate of the signal
    :param num_segments: Number of fixed-length segments to divide the signal into
    :param n_mfcc: Number of MFCC coefficients to extract
    :return: Numpy array with one feature row per segment: five statistics (energy,
             entropy-like term, kurtosis, skewness, standard deviation) for every DWT
             coefficient array, followed by the time-averaged MFCCs
    """
    # Noise filtering using STFT thresholding. The mask is computed on the magnitude
    # but applied to the complex spectrogram: inverting the bare magnitude would
    # discard the phase and reconstruct a different waveform.
    stft = librosa.stft(signal)
    magnitude = np.abs(stft)
    threshold = np.median(magnitude)
    stft_filtered = np.where(magnitude > threshold, stft, 0)
    signal_filtered = librosa.istft(stft_filtered)

    # Normalize signal between -1 and 1; an all-zero signal is left as is to
    # avoid a division by zero that would fill every feature with NaN
    peak = np.max(np.abs(signal_filtered))
    normalized_signal = signal_filtered / peak if peak > 0 else signal_filtered

    # Trailing samples that do not fill a whole window are dropped
    window_length = len(normalized_signal) // num_segments

    # Segment the signal into equal-length windows
    segmented_signal = []
    for i in range(num_segments):
        start_idx = i * window_length
        end_idx = start_idx + window_length
        segment = normalized_signal[start_idx:end_idx]
        segment = pad_audio(segment, window_length)
        segmented_signal.append(segment)
    segmented_signal = np.array(segmented_signal)

    # Apply Hamming window (broadcast over all segments)
    hamming_window = hamming(segmented_signal.shape[1])
    windowed_signal = segmented_signal * hamming_window

    # Extract features from each segment
    sequence_features = []
    for segment in windowed_signal:
        segment_features = []

        # DWT feature extraction: five statistics per coefficient array
        coeffs = pywt.wavedec(segment, 'db4', level=5)
        for coeff in coeffs:
            # The variance itself is not a feature; it only guards the two ratios below
            variance = np.var(coeff)
            energy = np.sum(np.square(coeff))
            # Entropy-like term; the small constant keeps log2 finite for zero coefficients
            entropy = -np.sum(coeff * np.log2(np.abs(coeff) + 1e-12))
            kurtosis = 0 if variance == 0 else np.mean((coeff - np.mean(coeff))**4) / (variance**2)
            skewness = 0 if variance == 0 else np.mean((coeff - np.mean(coeff))**3) / (variance**1.5)
            std_dev = np.std(coeff)
            segment_features.extend([energy, entropy, kurtosis, skewness, std_dev])

        # MFCC feature extraction; the FFT size is the largest power of two that
        # fits in the segment, clamped to the range [256, 2048]
        n_fft = min(2048, max(256, 2 ** int(np.floor(np.log2(len(segment))))))
        mfccs = librosa.feature.mfcc(y=segment, sr=sample_rate, n_mfcc=n_mfcc, n_fft=n_fft)
        # Average every coefficient over the time frames of the segment
        mfcc_features = np.mean(mfccs, axis=1)
        segment_features.extend(mfcc_features.tolist())

        sequence_features.append(segment_features)

    return np.array(sequence_features)


def audio_features_extract(audio_data, num_segments=20, n_mfcc=13):
    """
    Computes the sequential audio features (DWT + MFCC) of every signal in a list.

    :param audio_data: List of (signal, sample_rate) tuples
    :param num_segments: Number of segments each audio is divided into
    :param n_mfcc: Number of MFCC coefficients to compute
    :return: Numpy array stacking the per-audio feature matrices,
             i.e. (num_samples, num_segments, num_features)
    """
    # tqdm only wraps the iteration to display a progress bar
    progress = tqdm(audio_data, desc="Extracting audio features")
    per_audio_features = [
        preprocess_audio_sequential_fixed_segments(
            signal, sample_rate, num_segments=num_segments, n_mfcc=n_mfcc
        )
        for signal, sample_rate in progress
    ]
    return np.array(per_audio_features)

In [None]:
# Load the audio of every holdout split, in the same video order as the video windows
train_audio_holdout, train_labels_audio_holdout = load_audio_data(
    audio_dataset_path=train_audio_path, videos_list=new_train_list)
val_audio_holdout, val_labels_audio_holdout = load_audio_data(
    audio_dataset_path=val_audio_path, videos_list=new_val_list)
test_audio_holdout, test_labels_audio_holdout = load_audio_data(
    audio_dataset_path=test_audio_path, videos_list=new_test_list)

# Extract the sequential features of each audio
train_features_audio_holdout = audio_features_extract(audio_data=train_audio_holdout)
val_features_audio_holdout = audio_features_extract(audio_data=val_audio_holdout)
test_features_audio_holdout = audio_features_extract(audio_data=test_audio_holdout)

# Print the dimensions of features and labels, split by split
for split_features, split_labels in (
    (train_features_audio_holdout, train_labels_audio_holdout),
    (val_features_audio_holdout, val_labels_audio_holdout),
    (test_features_audio_holdout, test_labels_audio_holdout),
):
    print(split_features.shape, split_labels.shape)

In [None]:
# One audio directory per cross-validation fold
fold_dirs = [f"{cross_audio_path}/fold_{fold_idx}" for fold_idx in range(4)]

# Per-fold features and labels, matched to the unpadded / padded video lists
data_audio_cross, labels_audio_cross = [], []
data_audio_cross_pad, labels_audio_cross_pad = [], []

# Loading and feature extraction for each fold
for i, fold_dir in enumerate(fold_dirs):
    print(f"Processing fold {i}...")

    # First the videos kept without padding, then the videos kept with padding
    for fold_videos, features_out, labels_out in (
        (new_cross_list[i], data_audio_cross, labels_audio_cross),
        (new_cross_list_pad[i], data_audio_cross_pad, labels_audio_cross_pad),
    ):
        # Load the audio matching this selection of videos
        fold_audio, fold_audio_labels = load_audio_data(fold_dir, fold_videos)

        # Extract audio features
        fold_audio_features = audio_features_extract(fold_audio)

        print(fold_audio_features.shape, fold_audio_labels.shape)

        # Save into the respective lists
        features_out.append(fold_audio_features)
        labels_out.append(fold_audio_labels)

# Models

In [None]:
# Function to create video model
def video_Conv3D_model(input_shape, learning_rate=1e-4, p=0):
    """
    Builds and compiles the Conv3D binary classifier used on the video features.

    :param input_shape: Shape of one input sample (without the batch dimension)
    :param learning_rate: Learning rate of the Adam optimizer
    :param p: If truthy, print the model summary
    :return: Compiled Keras Sequential model with a single sigmoid output
    """
    model = Sequential([
        InputLayer(shape=input_shape),

        # Conv3D block
        Conv3D(filters=32, kernel_size=(3, 3, 2), activation='relu'),
        BatchNormalization(),
        MaxPooling3D(pool_size=(2, 2, 1)),  # Pool over the first two axes only (size 1 on the last)

        # Flatten and fully connected head
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')  # Output for binary classification
    ])

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    if p:
        # summary() prints the table itself and returns None, so wrapping it in
        # print() would add a stray "None" line
        model.summary()

    return model

In [None]:
def audio_lstm_model(input_shape, hidden_size, learning_rate):
    """
    Builds and compiles the LSTM binary classifier used on the audio features.

    :param input_shape: Shape of one input sample (without the batch dimension)
    :param hidden_size: Number of units of the LSTM layer
    :param learning_rate: Learning rate of the Adam optimizer
    :return: Compiled Keras Sequential model with a single sigmoid output
    """
    layer_stack = [
        InputLayer(shape=input_shape),
        # Timesteps whose features are all 0.0 are masked out for the LSTM
        Masking(mask_value=0.0),
        # Only the output of the last timestep is passed on
        LSTM(hidden_size, return_sequences=False, use_cudnn=False),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layer_stack)

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model

def train_and_evaluate_audio_lstm_model(
    train_features, train_labels,
    val_features, val_labels,
    test_features, test_labels,
    hidden_size=128,
    num_epochs=20,
    batch_size=32,
    learning_rate=0.001,
    view_res=True
):
    """
    Trains the audio LSTM model and returns it.

    :param train_features: Training features; axes 1 and 2 define the model input shape
    :param train_labels: Training labels
    :param val_features: Validation features monitored during training
    :param val_labels: Validation labels
    :param test_features: Accepted for interface compatibility; not used by this function
    :param test_labels: Accepted for interface compatibility; not used by this function
    :param hidden_size: Number of units of the LSTM layer
    :param num_epochs: Number of training epochs
    :param batch_size: Training batch size
    :param learning_rate: Learning rate of the optimizer
    :param view_res: Passed to Keras as ``verbose``; when truthy the training curves are also plotted
    :return: The trained Keras model
    """
    # The model input is one (timesteps, features) matrix per sample
    timesteps, num_features = train_features.shape[1], train_features.shape[2]
    model = audio_lstm_model(
        input_shape=(timesteps, num_features),
        hidden_size=hidden_size,
        learning_rate=learning_rate,
    )

    # Model training
    fit_options = dict(
        validation_data=(val_features, val_labels),
        epochs=num_epochs,
        batch_size=batch_size,
        verbose=view_res,
    )
    history = model.fit(train_features, train_labels, **fit_options)

    if view_res:
        plot_history(history)

    return model

In [None]:
def plot_history(history):
    """
    Draws the accuracy and loss curves (training and validation) side by side.

    :param history: Keras History object returned by model.fit()
    """
    curves = history.history
    panels = (
        # (training key, validation key, training legend, validation legend, y label, title)
        ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
         'Accuracy', 'Training and Validation Accuracy'),
        ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
         'Loss', 'Training and Validation Loss'),
    )

    plt.figure(figsize=(14, 5))

    # Left panel: accuracy, right panel: loss
    for position, (train_key, val_key, train_legend, val_legend, y_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(curves[train_key], label=train_legend)
        plt.plot(curves[val_key], label=val_legend)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()
        plt.title(title)

    plt.tight_layout()
    plt.show()


def plot_confusion_matrix(conf_matrix):
    """
    Shows a confusion matrix as an annotated heatmap.

    :param conf_matrix: 2D array-like confusion matrix (e.g. from sklearn.metrics.confusion_matrix)
    """
    # Tick labels used on both axes, in class-index order
    class_names = ['Lie', 'Truth']

    plt.figure(figsize=(6, 5))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.show()


def plot_confusion_matrix_2(y_true, y_pred):
    """
    Builds the confusion matrix from the labels and shows it with sklearn's ConfusionMatrixDisplay.

    :param y_true: Ground truth labels
    :param y_pred: Predicted labels
    """
    figure, axis = plt.subplots(figsize=(5, 5))
    matrix = confusion_matrix(y_true, y_pred)
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot(ax=axis)
    plt.show()

## Late Fusion (Mean):
The best models from the video and audio classification tasks are used. The probabilities they predict at the video level are then combined linearly (as a weighted average) to obtain a single fused prediction.

In [None]:
def combine_predictions_weighted(video_predictions, audio_predictions, audio_weight=0.5):
    """
    Fuses video and audio probabilities with a weighted average.

    :param video_predictions: Array-like of video probabilities
    :param audio_predictions: Array-like of audio probabilities; it must have the same
                              shape as ``video_predictions``, otherwise numpy
                              broadcasting applies
    :param audio_weight: Weight of the audio probabilities; the video probabilities
                         get ``1 - audio_weight``
    :return: Tuple (predicted classes as ints, fused probabilities); a fused
             probability must be strictly above 0.5 to be classified as 1
    """
    # Convert first so plain Python sequences can be scaled element-wise
    video_scores = np.asarray(video_predictions)
    audio_scores = np.asarray(audio_predictions)

    video_weight = 1 - audio_weight
    combined_predictions = (video_scores * video_weight) + (audio_scores * audio_weight)
    combined_classes = (combined_predictions > 0.5).astype(int)
    return combined_classes, combined_predictions

In [None]:
# Video model training and prediction
print("\nVIDEO TRAINING\n")

# Training configuration
input_shape_video_holdout = train_features_video_holdout.shape[1:]
learning_rate = 1e-4
epochs = 20
batch_size = 256

# Keep on disk only the weights of the epoch with the best validation accuracy
best_weights_checkpoint = ModelCheckpoint(
    model_path,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)
callbacks = [best_weights_checkpoint]

video_model_holdout_weight = video_Conv3D_model(input_shape_video_holdout, learning_rate)

# Fit the network
history_video_holdout_weight = video_model_holdout_weight.fit(
    train_features_video_holdout,
    train_labels_video_holdout,
    validation_data=(val_features_video_holdout, val_labels_video_holdout),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=callbacks,
)

plot_history(history_video_holdout_weight)

# Restore the checkpointed (best) weights before predicting
video_model_holdout_weight.load_weights(model_path)

# Make predictions (one probability per window)
pred = video_model_holdout_weight.predict(test_features_video_holdout)

# Create a dataframe with predictions and video ids (one row per window)
pred_df = pd.DataFrame({
    'video_id': test_video_names_holdout,
    'prediction': pred.flatten(),
    'label': test_labels_video_holdout,
})

# Group by video and average the window predictions with the calculate_mean helper
# defined with the other aggregation functions. sort=False keeps the videos in order
# of first appearance, which is the order the test audio was loaded in.
# The result is reshaped to a column so it has one row per video.
video_predictions_holdout_weight = (
    pred_df.groupby('video_id', sort=False)['prediction']
    .apply(calculate_mean)
    .values
    .reshape(-1, 1)
)


# Audio model training and prediction
print("\n\n\nAUDIO TRAINING\n")
audio_model_holdout_weight = train_and_evaluate_audio_lstm_model(
    train_features=train_features_audio_holdout, train_labels=train_labels_audio_holdout,
    val_features=val_features_audio_holdout, val_labels=val_labels_audio_holdout,
    test_features=test_features_audio_holdout, test_labels=test_labels_audio_holdout,
    hidden_size=128,
    num_epochs=20,
    batch_size=32,
    learning_rate=0.001,
    view_res=True,
)
# Predicted probabilities (one per video)
audio_predictions_holdout_weight = audio_model_holdout_weight.predict(test_features_audio_holdout)


# Combine predictions with specific weights
print("\n\n\nCOMBINING RESULTS\n")
# Weight assigned to audio predictions
# NOTE(review): with a weight of 0 the video weight is 1, so the fused scores are
# exactly the video scores and the audio model has no influence - confirm this is intended
audio_weight = 0
combined_classes_holdout_weight, combined_probs_holdout_weight = combine_predictions_weighted(
    video_predictions_holdout_weight,
    audio_predictions_holdout_weight,
    audio_weight=audio_weight,
)

# Metrics calculation (video-level labels against the fused predictions)
# average=None returns one value per class; undefined ratios count as 0
per_class_options = dict(zero_division=0, average=None)
accuracy_holdout_weight = accuracy_score(test_labels_audio_holdout, combined_classes_holdout_weight)
precision_holdout_weight = precision_score(
    test_labels_audio_holdout, combined_classes_holdout_weight, **per_class_options)
recall_holdout_weight = recall_score(
    test_labels_audio_holdout, combined_classes_holdout_weight, **per_class_options)
f1_holdout_weight = f1_score(
    test_labels_audio_holdout, combined_classes_holdout_weight, **per_class_options)
auc_holdout_weight = roc_auc_score(test_labels_audio_holdout, combined_probs_holdout_weight)

# Results visualization
conf_matrix_holdout_weight = confusion_matrix(test_labels_audio_holdout, combined_classes_holdout_weight)
plot_confusion_matrix(conf_matrix_holdout_weight)

# Save metrics into the dictionary
for metric_name, metric_value in (
    ("Accuracy", accuracy_holdout_weight),
    ("Precision (0)", precision_holdout_weight[0]),
    ("Precision (1)", precision_holdout_weight[1]),
    ("Recall (0)", recall_holdout_weight[0]),
    ("Recall (1)", recall_holdout_weight[1]),
    ("F1 (0)", f1_holdout_weight[0]),
    ("F1 (1)", f1_holdout_weight[1]),
    ("AUC", auc_holdout_weight),
):
    metrics_holdout[metric_name].append(metric_value)

# Dataframe creation (one row per metric, a single column for this experiment)
df = pd.DataFrame.from_dict(metrics_holdout, orient='index', columns=["LATE FUSION (Mean)"])
print(df)

# Reset the dictionary so the next experiment starts from empty lists
metrics_holdout = {metric_name: [] for metric_name in metrics_holdout}

### Cross-Validation

In [None]:
def cross_validation_video_audio(metrics_dict, data_video, data_video_pad, data_audio, data_audio_pad,
                                 labels_video, labels_video_pad, labels_audio, labels_audio_pad,
                                 names_video, names_video_pad, num_folds=4):
    """
    Cross-validate the weighted late fusion of the video and audio classifiers.

    For each fold ``i`` a video Conv3D model and an audio LSTM model are trained on
    the other folds (taken from the non-``_pad`` inputs) and evaluated on fold ``i``
    of the ``_pad`` inputs. Frame-sequence video predictions are averaged per video,
    fused with the audio predictions (audio weight 0.5) and scored against the audio
    labels of the held-out fold.

    Args:
        metrics_dict: dict of lists with keys ``"Mean <metric>"`` and ``"Std <metric>"``
            for Accuracy, Precision (0/1), Recall (0/1), F1 (0/1) and AUC; one value is
            appended to each list.
        data_video, labels_video: per-fold video features / labels used for training.
        data_video_pad, labels_video_pad, names_video_pad: per-fold video features,
            labels and video ids used for evaluation.
        data_audio, labels_audio: per-fold audio features / labels used for training.
        data_audio_pad, labels_audio_pad: per-fold audio features / labels used for
            evaluation.
        names_video: unused; kept for backward compatibility of the signature.
        num_folds: number of folds to iterate over.

    Returns:
        None. Results are appended to ``metrics_dict`` and a summary is printed.

    Note:
        The held-out fold is also passed as ``validation_data`` to the base models.
        The video model's training history is not used here.
    """
    metric_names = ("Accuracy", "Precision (0)", "Precision (1)", "Recall (0)", "Recall (1)",
                    "F1 (0)", "F1 (1)", "AUC")
    fold_scores = {name: [] for name in metric_names}

    # Hyper-parameters of the video model (identical for every fold)
    learning_rate = 1e-4
    epochs = 20
    batch_size = 256

    for i in range(num_folds):
        print(f'\nFold {i}')

        # Held-out fold used for evaluation
        test_features_video = data_video_pad[i]
        test_labels_video = labels_video_pad[i]
        test_video_names = names_video_pad[i]
        test_features_audio, test_labels_audio = np.array(data_audio_pad[i]), np.array(labels_audio_pad[i])

        # Remaining folds used for training
        train_folds = [j for j in range(num_folds) if j != i]
        train_features_video = np.vstack([data_video[j] for j in train_folds])
        train_labels_video = np.hstack([labels_video[j] for j in train_folds])
        train_features_audio = np.array([item for idx, fold in enumerate(data_audio) if idx != i for item in fold])
        train_labels_audio = np.array([label for idx, fold in enumerate(labels_audio) if idx != i for label in fold])

        # Create and train the model for video features
        input_shape = train_features_video.shape[1:]
        video_model = video_Conv3D_model(input_shape, learning_rate=learning_rate)
        video_model.fit(train_features_video, train_labels_video,
                        validation_data=(test_features_video, test_labels_video),
                        epochs=epochs, batch_size=batch_size, verbose=0)
        video_test_predictions = video_model.predict(test_features_video)
        video_pred_df = pd.DataFrame({'video_id': test_video_names,
                                      'prediction': video_test_predictions.flatten(),
                                      'label': test_labels_video})

        # Average the per-sequence predictions of each video; sort=False keeps the
        # videos in their order of first appearance
        per_video = video_pred_df.groupby('video_id', sort=False)
        video_predictions = per_video['prediction'].apply(np.mean).values.reshape(-1, 1)
        video_labels = per_video['label'].first().values

        # Create and train the model for audio features
        audio_model = train_and_evaluate_audio_lstm_model(
            train_features_audio, train_labels_audio,
            test_features_audio, test_labels_audio,
            test_features_audio, test_labels_audio,
            hidden_size=128, num_epochs=20, batch_size=32, learning_rate=0.001, view_res=False
        )
        audio_predictions = audio_model.predict(test_features_audio)  # Audio probabilities

        # Combine predictions with weights
        audio_weight = 0.5
        combined_predictions, combined_probs = combine_predictions_weighted(video_predictions, audio_predictions, audio_weight)

        # Compute metrics for the current fold (average=None -> one score per class).
        # zero_division=0 is passed to all three per-class metrics so that a class
        # that is never predicted scores 0 without emitting a warning.
        accuracy = accuracy_score(test_labels_audio, combined_predictions)
        precision = precision_score(test_labels_audio, combined_predictions, zero_division=0, average=None)
        recall = recall_score(test_labels_audio, combined_predictions, zero_division=0, average=None)
        f1 = f1_score(test_labels_audio, combined_predictions, zero_division=0, average=None)
        auc = roc_auc_score(test_labels_audio, combined_probs)

        # Store metrics for this fold
        fold_scores["Accuracy"].append(accuracy)
        fold_scores["Precision (0)"].append(precision[0])
        fold_scores["Precision (1)"].append(precision[1])
        fold_scores["Recall (0)"].append(recall[0])
        fold_scores["Recall (1)"].append(recall[1])
        fold_scores["F1 (0)"].append(f1[0])
        fold_scores["F1 (1)"].append(f1[1])
        fold_scores["AUC"].append(auc)

        print(f"Fold {i} - Accuracy: {accuracy:.2f}")

        # Predictions using only video
        combined_predictions, _ = combine_predictions_weighted(video_predictions, audio_predictions, 0)
        print(f"only-video acc: {accuracy_score(video_labels, combined_predictions)}")

        # Predictions using only audio
        combined_predictions, _ = combine_predictions_weighted(video_predictions, audio_predictions, 1)
        print(f"only-audio acc: {accuracy_score(test_labels_audio, combined_predictions)}")

    # Average metrics across all folds
    for name in metric_names:
        metrics_dict[f"Mean {name}"].append(np.mean(fold_scores[name]))

    # Standard deviation of the metrics across all folds
    for name in metric_names:
        metrics_dict[f"Std {name}"].append(np.std(fold_scores[name]))

    avg_accuracy = np.mean(fold_scores["Accuracy"])
    std_accuracy = np.std(fold_scores["Accuracy"])
    print(f"\nAvg Accuracy: {avg_accuracy:.2f}")
    print(f"Std Dev Accuracy: {std_accuracy:.2f}")

In [None]:
# Run the 4-fold cross-validation of the weighted late fusion; the mean/std of every
# metric is appended to metrics_cross by the function itself
cross_validation_video_audio(
    metrics_dict=metrics_cross,
    data_video=data_video_cross,
    data_video_pad=data_video_cross_pad,
    data_audio=data_audio_cross,
    data_audio_pad=data_audio_cross_pad,
    labels_video=labels_video_cross,
    labels_video_pad=labels_video_cross_pad,
    labels_audio=labels_audio_cross,
    labels_audio_pad=labels_audio_cross_pad,
    names_video=names_video_cross,
    names_video_pad=names_video_cross_pad,
    num_folds=4,
)

# Show the collected metrics as a one-column table
df = pd.DataFrame.from_dict(
    metrics_cross,
    orient='index',
    columns=["LATE FUSION (Mean)"],
)
print("\n")
print(df)

# Start the next experiment with empty metric lists
metrics_cross = {key: [] for key in metrics_cross}

## Late Fusion (Meta Learner):
The predictions of the Video and Audio classifiers are collected
and then given as input features to a meta-learner, which produces the final classification.

In [None]:
def generate_meta_features(video_features, audio_features, video_labels, audio_labels, video_names,
                           n_splits=10):
    """
    Build the training set of the stacking meta-learner from out-of-fold predictions.

    The audio samples are split with a stratified K-fold. For each fold a video
    Conv3D model and an audio LSTM model are trained on the training part and
    predict the validation part; the per-video mean video prediction and the audio
    prediction form one 2-column meta-feature row per validation sample.

    Args:
        video_features: array of video sequences, indexable by integer arrays.
        audio_features: array of audio features, one entry per video.
            Assumes entry ``k`` belongs to the ``k``-th distinct name in ``video_names``
            (order of first appearance) — TODO confirm against the caller.
        video_labels: array with one label per video sequence.
        audio_labels: array with one label per audio entry.
        video_names: array with the video id of every video sequence.
        n_splits: number of stratified folds.

    Returns:
        Tuple ``(video_model, audio_model, meta_features, meta_labels)``. The two
        models are the ones trained in the LAST fold only (i.e. not on all the
        data); ``meta_features`` has columns ``[video_pred, audio_pred]`` and
        ``meta_labels`` holds the matching audio labels.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    meta_features = []
    meta_labels = []

    # Distinct video names in order of first appearance (np.unique sorts, so undo it)
    set_video_names, indices = np.unique(video_names, return_index=True)
    set_video_names = set_video_names[np.argsort(indices)]

    # Hyper-parameters of the video model (identical for every fold)
    learning_rate = 1e-4
    epochs = 20
    batch_size = 256

    for i, (train_index, val_index) in enumerate(skf.split(audio_features, audio_labels)):
        print(f'Fold_{i}:')

        # Map the fold's videos to the positions of their sequences, in one pass
        train_video_index = np.flatnonzero(np.isin(video_names, set_video_names[train_index]))
        val_video_index = np.flatnonzero(np.isin(video_names, set_video_names[val_index]))

        train_video, val_video = video_features[train_video_index], video_features[val_video_index]
        val_video_names = video_names[val_video_index]  # one name per validation sequence
        train_video_labels, val_video_labels = video_labels[train_video_index], video_labels[val_video_index]
        train_audio, val_audio = audio_features[train_index], audio_features[val_index]
        train_audio_labels, val_audio_labels = audio_labels[train_index], audio_labels[val_index]

        print(train_video.shape, train_video_labels.shape)
        print(val_video.shape, val_video_labels.shape)
        print(train_audio.shape, train_audio_labels.shape)
        print(val_audio.shape, val_audio_labels.shape)

        # Base models training
        input_shape = train_video.shape[1:]
        video_model = video_Conv3D_model(input_shape, learning_rate=learning_rate)
        video_model.fit(train_video, train_video_labels,
                        validation_data=(val_video, val_video_labels),
                        epochs=epochs, batch_size=batch_size, verbose=0)

        audio_model = train_and_evaluate_audio_lstm_model(
            train_audio, train_audio_labels,
            val_audio, val_audio_labels,
            val_audio, val_audio_labels,
            hidden_size=128, num_epochs=20, batch_size=32, learning_rate=0.001, view_res=False
        )

        # Predictions on validation fold
        video_val_predictions = video_model.predict(val_video)
        video_pred_df = pd.DataFrame({'video_id': val_video_names,
                                      'prediction': video_val_predictions.flatten(),
                                      'label': val_video_labels})

        # Average the per-sequence predictions of each video; sort=False keeps the
        # videos in their order of first appearance
        per_video = video_pred_df.groupby('video_id', sort=False)
        video_preds = per_video['prediction'].apply(np.mean).values.reshape(-1, 1)
        audio_preds = audio_model.predict(val_audio).flatten().reshape(-1, 1)

        # Printed side by side so the audio/video label alignment can be checked by eye
        val_video_labels = per_video['label'].first().values
        print(val_audio_labels)
        print(val_video_labels)

        # Saving predictions and labels
        meta_features.append(np.hstack((video_preds, audio_preds)))
        meta_labels.append(val_audio_labels)

    # Concatenate results of all folds
    meta_features = np.vstack(meta_features)
    meta_labels = np.hstack(meta_labels)

    return video_model, audio_model, meta_features, meta_labels

In [None]:
# Rebuild the training sequences WITH padding so their layout matches the validation
# data they are merged with below
seq_gen_train = [
    list(gen_sequence(train_dataset[train_dataset['input'] == video_id],
                      sequence_length, cols_groups, frame_step, padding))
    for video_id in train_list
]
# Drop the videos that produced no sequence at all
seq_gen_train = [sequences for sequences in seq_gen_train if sequences]

# Every generated item is indexed as (features, label, video name)
seq_array_train = [[item[0] for item in sequences] for sequences in seq_gen_train]
label_array_train = [[item[1] for item in sequences] for sequences in seq_gen_train]
video_array_train = [[item[2] for item in sequences] for sequences in seq_gen_train]

seq_array_train = np.concatenate(seq_array_train).astype(np.float32)
train_labels_video_holdout_stack = np.concatenate(label_array_train).astype(np.float32).reshape(-1)
train_video_names_holdout_stack = np.concatenate(video_array_train)

# Swap the last two axes, then append a trailing singleton axis
train_features_video_holdout_stack = seq_array_train.transpose(0, 1, 3, 2)
train_features_video_holdout_stack = train_features_video_holdout_stack[..., np.newaxis]

# Distinct video names in order of first appearance (np.unique returns them sorted)
new_train_list_stack, indices = np.unique(train_video_names_holdout_stack, return_index=True)
new_train_list_stack = new_train_list_stack[np.argsort(indices)].tolist()

# Audio data and features of the same videos, in the same order
train_audio_holdout_stack, train_labels_audio_holdout_stack = load_audio_data(train_audio_path, new_train_list_stack)
train_features_audio_holdout_stack = audio_features_extract(train_audio_holdout_stack)

# Append the validation split to the training split (features, labels and names)
train_features_video_holdout_stack = np.vstack([train_features_video_holdout_stack, val_features_video_holdout])
train_features_audio_holdout_stack = np.vstack([train_features_audio_holdout_stack, val_features_audio_holdout])
train_video_labels_holdout_stack = np.hstack([train_labels_video_holdout_stack, val_labels_video_holdout])
train_audio_labels_holdout_stack = np.hstack([train_labels_audio_holdout_stack, val_labels_audio_holdout])
train_video_names_holdout_stack = np.hstack([train_video_names_holdout_stack, val_video_names_holdout])

# Report the resulting dimensions (train vs test)
print(train_features_video_holdout_stack.shape, test_features_video_holdout.shape, sep="\n")
print()

print(train_features_audio_holdout_stack.shape, test_features_audio_holdout.shape, sep="\n")
print()

print(train_video_labels_holdout_stack.shape, test_labels_video_holdout.shape, sep="\n")

print()
print(train_video_names_holdout_stack.shape)

In [None]:
# Dataset generation for the meta-learner
# NOTE: generate_meta_features returns the base models trained in its LAST fold only,
# so the video/audio models used on the test set below have not seen the full
# train+val data.
video_model_holdout_stack, audio_model_holdout_stack, meta_features_holdout, meta_labels_holdout = generate_meta_features( 
    video_features=train_features_video_holdout_stack, 
    audio_features=train_features_audio_holdout_stack,
    video_labels=train_video_labels_holdout_stack,
    audio_labels=train_audio_labels_holdout_stack,
    video_names=train_video_names_holdout_stack,
    n_splits=5
)

# Training the meta-learner
# Input columns are [video prediction, audio prediction] built from out-of-fold predictions
stacking_model_holdout = LogisticRegression()
stacking_model_holdout.fit(meta_features_holdout, meta_labels_holdout)

# Evaluation on the test set
video_test_preds_holdout_stack = video_model_holdout_stack.predict(test_features_video_holdout)
video_pred_df = pd.DataFrame({'video_id': test_video_names_holdout, 'prediction': video_test_preds_holdout_stack.flatten(), 'label': test_labels_video_holdout})

def calculate_mean(preds):
    # Aggregation applied to the predictions of all sequences of one video
    return np.mean(preds)

# One averaged prediction per video; sort=False keeps the videos in order of first appearance
video_test_preds_holdout_stack = video_pred_df.groupby('video_id', sort=False)['prediction'].apply(calculate_mean).values
video_test_preds_holdout_stack = np.array([[x] for x in video_test_preds_holdout_stack])
video_test_preds_holdout_stack = video_test_preds_holdout_stack.flatten().reshape(-1, 1)
audio_test_preds_holdout_stack = audio_model_holdout_stack.predict(test_features_audio_holdout).flatten().reshape(-1, 1)
# Same column layout as the meta-learner training set: [video, audio]
test_meta_features_holdout_stack = np.hstack((video_test_preds_holdout_stack, audio_test_preds_holdout_stack))

# Prediction with the meta-learner
stacking_preds_holdout = stacking_model_holdout.predict(test_meta_features_holdout_stack)  # Predicted classes
stacking_probs_holdout = stacking_model_holdout.predict_proba(test_meta_features_holdout_stack)[:, 1]  # Probability for the positive class

# Print audio labels and per-video labels so their alignment can be checked by eye
test_labels_video_holdout_stack = video_pred_df.groupby('video_id', sort=False)['label'].first().values
print(test_labels_audio_holdout)
print(test_labels_video_holdout_stack)

# Metrics calculation
# average=None with labels=[0, 1] returns per-class arrays indexed below as [0] and [1]
accuracy_holdout_stack = accuracy_score(test_labels_audio_holdout, stacking_preds_holdout)
precision_holdout_stack, recall_holdout_stack, f1_holdout_stack, _ = precision_recall_fscore_support(
    test_labels_audio_holdout, stacking_preds_holdout, average=None, labels=[0, 1]
)
auc_holdout_stack = roc_auc_score(test_labels_audio_holdout, stacking_probs_holdout)

# Save metrics to the dictionary
metrics_holdout["Accuracy"].append(accuracy_holdout_stack)
metrics_holdout["Precision (0)"].append(precision_holdout_stack[0])
metrics_holdout["Precision (1)"].append(precision_holdout_stack[1])
metrics_holdout["Recall (0)"].append(recall_holdout_stack[0])
metrics_holdout["Recall (1)"].append(recall_holdout_stack[1])
metrics_holdout["F1 (0)"].append(f1_holdout_stack[0])
metrics_holdout["F1 (1)"].append(f1_holdout_stack[1])
metrics_holdout["AUC"].append(auc_holdout_stack)

# Dataframe creation
df = pd.DataFrame.from_dict(metrics_holdout, orient='index', columns=["LATE FUSION (Meta-Learner)"])
print(df)
# Reset every metric list so the next experiment starts from an empty dictionary
metrics_holdout = {key: [] for key in metrics_holdout}

## Intermediate Fusion:
We extract sequences of 64 frames from each video, then extract the audio corresponding to that part of the video and organize it into sequences too. Finally, we train a model consisting of a first part that works separately on the video and audio features, and then finish with a single MLP network for final binary classification. The commented parts were used to extract the new features, which were then downloaded so that the extraction process would not have to be repeated.

In [None]:
def load_audio_data_segmented(audio_dataset_path, video_dataset, seq_len=sequence_length*frame_step, frame_step=frame_step, padding=None):
    """
    Load every audio file of a dataset split and cut it into segments aligned with
    the video frame sequences.

    Every ``.xlsx`` file found under ``audio_dataset_path`` is read; its
    ``label``, ``audio name`` and ``video name`` columns describe the samples.
    The audio is loaded from ``<audio_dataset_path>/Audio/<audio name>`` and the
    number of frames of the matching video is the number of rows of
    ``video_dataset`` whose ``input`` column equals
    ``<audio_dataset_path>/Statements/<video name>``.

    Args:
        audio_dataset_path: root folder of the split.
        video_dataset: DataFrame with one row per video frame and an ``input`` column.
        seq_len: length of one sequence, in video frames.
        frame_step: stride between the starts of consecutive sequences, in video frames.
        padding: if truthy, an audio too short to yield any segment contributes the
            whole signal as a single segment instead of being dropped.

    Returns:
        Tuple ``(audios, labels, names)``: a list of ``(segment, sample_rate)``
        tuples, and two arrays with the label and the audio file name of every segment.

    Note:
        Samples whose audio file is missing, whose video has no frame in
        ``video_dataset`` or whose audio is empty are skipped with a message
        (the frame rate cannot be computed for them).
    """
    audios = []
    audio_labels = []
    audio_names_list = []

    # Extract audio data
    for root, _dirs, files in os.walk(audio_dataset_path):
        for file in files:
            if not file.endswith('.xlsx'):
                continue

            # Annotation sheet: one row per (label, audio, video) sample
            df = pd.read_excel(os.path.join(root, file))
            label_list = df['label'].tolist()
            audio_names = df['audio name'].tolist()
            video_names = df['video name'].tolist()

            # Segment the corresponding audio files
            for label, audio_name, video_name in zip(label_list, audio_names, video_names):
                audio_path = os.path.join(audio_dataset_path, 'Audio', audio_name)
                if not os.path.exists(audio_path):
                    print(f"Audio file not found: {audio_path}")
                    continue

                video_path = os.path.join(audio_dataset_path, 'Statements', video_name)
                total_frames = len(video_dataset[video_dataset['input'] == video_path])

                signal, sample_rate = librosa.load(audio_path, sr=None)  # Load audio with original sample rate
                total_duration_ms = librosa.get_duration(y=signal, sr=sample_rate) * 1000  # Total audio duration in milliseconds

                # Without frames or without audio the frame rate below would be 0
                # (or undefined) and the segment duration a division by zero
                if total_frames == 0 or total_duration_ms <= 0:
                    print(f"Skipping {audio_name}: no video frames or empty audio")
                    continue

                frame_rate = total_frames / total_duration_ms  # video frames per millisecond
                seq_duration_ms = seq_len / frame_rate
                step_duration_ms = frame_step / frame_rate

                # Extract audio segments corresponding to video sequences.
                # Rounding to 3 decimals absorbs floating-point noise at the last window.
                segments = []
                start_time_ms = 0
                while np.round(start_time_ms + seq_duration_ms, 3) <= np.round(total_duration_ms, 3):
                    start_sample = int((start_time_ms / 1000) * sample_rate)
                    end_sample = int(((start_time_ms + seq_duration_ms) / 1000) * sample_rate)
                    segments.append(signal[start_sample:end_sample])
                    start_time_ms += step_duration_ms

                # Audio shorter than one sequence: keep it whole when padding is enabled
                if not segments and padding:
                    segments.append(signal)

                for segment in segments:
                    audios.append((segment, sample_rate))
                    audio_labels.append(label)
                    audio_names_list.append(audio_name)

    return audios, np.array(audio_labels), np.array(audio_names_list)

num_segments = 20

# Segment the audio of every holdout split so that it lines up with the video sequences.
# The training split is loaded without padding; validation and test use `padding`.
train_audio_holdout_inter, train_labels_audio_holdout_inter, train_audio_names_holdout_inter = load_audio_data_segmented(
    train_audio_path, train_dataset, sequence_length*frame_step, frame_step, padding=None
)
val_audio_holdout_inter, val_labels_audio_holdout_inter, val_audio_names_holdout_inter = load_audio_data_segmented(
    val_audio_path, val_dataset, sequence_length*frame_step, frame_step, padding
)
test_audio_holdout_inter, test_labels_audio_holdout_inter, test_audio_names_holdout_inter = load_audio_data_segmented(
    test_audio_path, test_dataset, sequence_length*frame_step, frame_step, padding
)

"""# Feature extraction for each audio
train_features_audio_holdout_inter = audio_features_extract(train_audio_holdout_inter, num_segments=num_segments)
val_features_audio_holdout_inter = audio_features_extract(val_audio_holdout_inter, num_segments=num_segments)
test_features_audio_holdout_inter = audio_features_extract(test_audio_holdout_inter, num_segments=num_segments)

# Print shapes
print(train_features_audio_holdout_inter.shape, train_labels_audio_holdout_inter.shape)
print(val_features_audio_holdout_inter.shape, val_labels_audio_holdout_inter.shape)
print(test_features_audio_holdout_inter.shape, test_labels_audio_holdout_inter.shape)"""

# One directory per cross-validation fold
fold_dirs = [f"{cross_audio_path}/fold_{index}" for index in range(4)]
data_audio_cross_inter, labels_audio_cross_inter, names_audio_cross_inter = [], [], []
data_audio_cross_pad_inter, labels_audio_cross_pad_inter, names_audio_cross_pad_inter = [], [], []
 
# Load and extract features for each fold
# NOTE: with the feature extraction commented out, only the audio names of each segment
# are collected here; the feature/label lists stay empty in this cell.
for i in range(4):
    print(f"Processing fold {i}...")

    # No padding
    fold_audio, fold_audio_labels, fold_audio_names = load_audio_data_segmented(fold_dirs[i], cross_video_datasets[i], padding=None)
    #fold_audio_features = audio_features_extract(fold_audio, num_segments=num_segments)
    #print(fold_audio_features.shape, fold_audio_labels.shape)
    #data_audio_cross_inter.append(fold_audio_features)
    #labels_audio_cross_inter.append(fold_audio_labels)
    names_audio_cross_inter.append(fold_audio_names)

    # Padding
    # Second pass over the same fold, this time with `padding` passed to the loader
    fold_audio, fold_audio_labels, fold_audio_names = load_audio_data_segmented(fold_dirs[i], cross_video_datasets[i], padding=padding)
    #fold_audio_features = audio_features_extract(fold_audio, num_segments=num_segments)
    #print(fold_audio_features.shape, fold_audio_labels.shape)
    #data_audio_cross_pad_inter.append(fold_audio_features)
    #labels_audio_cross_pad_inter.append(fold_audio_labels)
    names_audio_cross_pad_inter.append(fold_audio_names)

"""os.makedirs("/kaggle/working/inter_array", exist_ok=True)

# Save arrays for later use
np.save("inter_array/train_features_audio.npy", train_features_audio_holdout_inter)
np.save("inter_array/val_features_audio.npy", val_features_audio_holdout_inter)
np.save("inter_array/test_features_audio.npy", test_features_audio_holdout_inter)
np.save("inter_array/train_labels_audio.npy", train_labels_audio_holdout_inter)
np.save("inter_array/val_labels_audio.npy", val_labels_audio_holdout_inter)
np.save("inter_array/test_labels_audio.npy", test_labels_audio_holdout_inter)
for i in range(4):
    np.save(f"inter_array/fold_{i}_features_audio.npy", data_audio_cross_inter[i])
    np.save(f"inter_array/fold_{i}_labels_audio.npy", labels_audio_cross_inter[i])
    np.save(f"inter_array/fold_{i}_features_audio_pad.npy", data_audio_cross_pad_inter[i])
    np.save(f"inter_array/fold_{i}_labels_audio_pad.npy", labels_audio_cross_pad_inter[i])

import shutil

# Path to the dataset folder
dataset_dir = "/kaggle/working/inter_array"
 
# Path to save the zip file in the Kaggle output directory
output_zip = f"/kaggle/working/truthlie_audio_original_{sequence_length}_frame_{num_segments}_30.zip"
 
# Create the zip file
shutil.make_archive(output_zip.replace(".zip", ""), 'zip', dataset_dir)
 
print(f"The dataset has been compressed in {output_zip}. You can download it from Kaggle's output.")"""

In [None]:
sequence_length = 64
num_segments = 20

features_audio_input_path = f"/kaggle/input/truthlie-audio-features/truthlie_audio_original_{sequence_length}_frame_{num_segments}_30"

# Reload the feature/label arrays stored in the dataset above
train_features_audio_holdout_inter = np.load(f"{features_audio_input_path}/train_features_audio.npy")
val_features_audio_holdout_inter = np.load(f"{features_audio_input_path}/val_features_audio.npy")
test_features_audio_holdout_inter = np.load(f"{features_audio_input_path}/test_features_audio.npy")
train_labels_audio_holdout_inter = np.load(f"{features_audio_input_path}/train_labels_audio.npy")
val_labels_audio_holdout_inter = np.load(f"{features_audio_input_path}/val_labels_audio.npy")
test_labels_audio_holdout_inter = np.load(f"{features_audio_input_path}/test_labels_audio.npy")

# Shapes of features and labels for each holdout split
print(train_features_audio_holdout_inter.shape, train_labels_audio_holdout_inter.shape)
print(val_features_audio_holdout_inter.shape, val_labels_audio_holdout_inter.shape)
print(test_features_audio_holdout_inter.shape, test_labels_audio_holdout_inter.shape)

# Cross-validation folds: plain and padded variants
data_audio_cross_inter, labels_audio_cross_inter = [], []
data_audio_cross_pad_inter, labels_audio_cross_pad_inter = [], []
for i in range(4):
    # Variant without padding
    data_audio_cross_inter.append(np.load(f"{features_audio_input_path}/fold_{i}_features_audio.npy"))
    labels_audio_cross_inter.append(np.load(f"{features_audio_input_path}/fold_{i}_labels_audio.npy"))
    print(data_audio_cross_inter[i].shape, labels_audio_cross_inter[i].shape)

    # Variant with padding
    data_audio_cross_pad_inter.append(np.load(f"{features_audio_input_path}/fold_{i}_features_audio_pad.npy"))
    labels_audio_cross_pad_inter.append(np.load(f"{features_audio_input_path}/fold_{i}_labels_audio_pad.npy"))
    print(data_audio_cross_pad_inter[i].shape, labels_audio_cross_pad_inter[i].shape)

In [None]:
def get_new_audio_array_inter(features_list,labels_list,audios_list,video_list):
    """
    Reorder audio segment features and labels so that they follow ``video_list``.

    Segments are grouped by audio file name with its last 4 characters removed
    (the extension). For every entry of ``video_list`` the part after the last
    ``/`` is taken, its last 4 characters are removed, and all segments grouped
    under that name are emitted in their original relative order.

    Returns:
        Tuple ``(features, labels)`` of numpy arrays.

    Raises:
        KeyError: if a video of ``video_list`` has no segment in ``audios_list``.
    """
    # name without extension -> (features of its segments, labels of its segments)
    grouped = {}
    for feature, label, audio_name in zip(features_list, labels_list, audios_list):
        segment_features, segment_labels = grouped.setdefault(audio_name[:-4], ([], []))
        segment_features.append(feature)
        segment_labels.append(label)

    ordered_features = []
    ordered_labels = []
    for video_name in video_list:
        base_name = video_name.split('/')[-1][:-4]
        segment_features, segment_labels = grouped[base_name]
        ordered_features.extend(segment_features)
        ordered_labels.extend(segment_labels)

    return np.array(ordered_features), np.array(ordered_labels)

# Reorder the holdout audio segments so that they follow the order of the video lists
train_features_audio_holdout_inter, train_labels_audio_holdout_inter = get_new_audio_array_inter(
    train_features_audio_holdout_inter,
    train_labels_audio_holdout_inter,
    train_audio_names_holdout_inter,
    new_train_list,
)
print(train_features_audio_holdout_inter.shape, train_labels_audio_holdout_inter.shape)

val_features_audio_holdout_inter, val_labels_audio_holdout_inter = get_new_audio_array_inter(
    val_features_audio_holdout_inter,
    val_labels_audio_holdout_inter,
    val_audio_names_holdout_inter,
    new_val_list,
)
print(val_features_audio_holdout_inter.shape, val_labels_audio_holdout_inter.shape)

test_features_audio_holdout_inter, test_labels_audio_holdout_inter = get_new_audio_array_inter(
    test_features_audio_holdout_inter,
    test_labels_audio_holdout_inter,
    test_audio_names_holdout_inter,
    new_test_list,
)
print(test_features_audio_holdout_inter.shape, test_labels_audio_holdout_inter.shape)

# Same reordering for every cross-validation fold, plain and padded variants
for i in range(4):
    fold_features_audio_inter, fold_labels_audio_inter = get_new_audio_array_inter(
        data_audio_cross_inter[i],
        labels_audio_cross_inter[i],
        names_audio_cross_inter[i],
        new_cross_list[i],
    )
    print(fold_features_audio_inter.shape, fold_labels_audio_inter.shape)
    data_audio_cross_inter[i] = fold_features_audio_inter
    labels_audio_cross_inter[i] = fold_labels_audio_inter

    fold_features_audio_inter, fold_labels_audio_inter = get_new_audio_array_inter(
        data_audio_cross_pad_inter[i],
        labels_audio_cross_pad_inter[i],
        names_audio_cross_pad_inter[i],
        new_cross_list_pad[i],
    )
    print(fold_features_audio_inter.shape, fold_labels_audio_inter.shape)
    data_audio_cross_pad_inter[i] = fold_features_audio_inter
    labels_audio_cross_pad_inter[i] = fold_labels_audio_inter

In [None]:
# Sanity check: after the reordering, the audio segment labels should match the video
# sequence labels element by element; each line prints True when they do.
print(all(train_labels_audio_holdout_inter == train_labels_video_holdout))
print(all(val_labels_audio_holdout_inter == val_labels_video_holdout))
print(all(test_labels_audio_holdout_inter == test_labels_video_holdout))

# Same check for every cross-validation fold (plain, then padded variant)
for i in range(4):
    print(all(labels_audio_cross_inter[i] == labels_video_cross[i]))
    print(all(labels_audio_cross_pad_inter[i] == labels_video_cross_pad[i]))

In [None]:
def create_combined_model(video_feature_size, max_seq_len_audio, audio_feature_size):
    """Build and compile the two-input (video + audio) binary classifier.

    The video branch applies a 3D convolution, batch normalisation and 3D max
    pooling and flattens the result. The audio branch is a single LSTM that
    returns only its last output. Both branch outputs are concatenated,
    batch-normalised and passed through a dense layer with dropout to one
    sigmoid unit.

    Args:
        video_feature_size: Per-sample shape of the video input (no batch
            axis). Its element at index 2 is used as the third kernel
            dimension of the convolution and as the second pooling factor.
        max_seq_len_audio: Number of time steps of the audio input.
        audio_feature_size: Number of features per audio time step.

    Returns:
        A Keras ``Model`` taking ``[video, audio]`` and compiled with Adam
        (learning rate 0.001), binary cross-entropy loss and the accuracy
        metric.
    """
    depth = video_feature_size[2]

    # Video branch
    video_input = Input(shape=video_feature_size, name="Video_Input")
    video_branch = Conv3D(
        filters=32,
        kernel_size=(3, 3, depth),
        activation='relu',
        name="Video_Conv3D",
    )(video_input)
    video_branch = BatchNormalization(name="Video_Batchnorm")(video_branch)
    video_branch = MaxPooling3D(pool_size=(2, depth, 1), name="Video_Pooling")(video_branch)
    video_branch = Flatten(name="Video_Flatten")(video_branch)

    # Audio branch
    audio_input = Input(shape=(max_seq_len_audio, audio_feature_size), name="Audio_Input")
    audio_branch = LSTM(128, return_sequences=False, name="Audio_LSTM")(audio_input)

    # Fusion head: concatenate both branches and classify
    fused = Concatenate(name="Concatenate")([video_branch, audio_branch])
    fused = BatchNormalization(name="BatchNormalization")(fused)
    fused = Dense(32, activation="relu", name="Final_Dense")(fused)
    fused = Dropout(0.3, name="Final_Dropout")(fused)
    prediction = Dense(1, activation="sigmoid", name="Output")(fused)

    # Assemble and compile
    model = Model(inputs=[video_input, audio_input], outputs=prediction)
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

In [None]:
# Build the fusion model from the per-sample shapes of the training arrays:
# the full video sample shape, plus the audio sequence length and feature count.
combined_model_holdout = create_combined_model(
    train_features_video_holdout.shape[1:],
    train_features_audio_holdout_inter.shape[1],
    train_features_audio_holdout_inter.shape[2],
)

# Train on the [video, audio] pair and monitor the validation split each epoch.
holdout_train_inputs = [train_features_video_holdout, train_features_audio_holdout_inter]
holdout_val_inputs = [val_features_video_holdout, val_features_audio_holdout_inter]
combined_history_holdout = combined_model_holdout.fit(
    holdout_train_inputs,
    train_labels_video_holdout,
    validation_data=(holdout_val_inputs, val_labels_video_holdout),
    epochs=20,
    batch_size=256,
    shuffle=True,
)
plot_history(combined_history_holdout)

# Model evaluation: predicted probabilities for every test sample, then hard
# 0/1 predictions obtained by thresholding the sigmoid output at 0.5.
test_predictions_holdout_proba_common = combined_model_holdout.predict([test_features_video_holdout, test_features_audio_holdout_inter])
test_predictions_holdout_common = (test_predictions_holdout_proba_common > 0.5).astype(int)

# Metric calculation. average=None returns one value per class, indexed below
# as [0] and [1]; AUC is computed from the probabilities, not the hard labels.
accuracy_holdout_common = accuracy_score(test_labels_video_holdout, test_predictions_holdout_common)
precision_holdout_common = precision_score(test_labels_video_holdout, test_predictions_holdout_common, zero_division=0, average=None)
recall_holdout_common = recall_score(test_labels_video_holdout, test_predictions_holdout_common, zero_division=0, average=None)
# zero_division=0 matches the precision/recall calls above and the video-level
# f1_score calls in this notebook: an undefined F1 is reported as 0 without
# emitting an UndefinedMetricWarning.
f1_holdout_common = f1_score(test_labels_video_holdout, test_predictions_holdout_common, zero_division=0, average=None)
auc_holdout_common = roc_auc_score(test_labels_video_holdout, test_predictions_holdout_proba_common)

# Displaying results
print("***** FRAME LEVEL RESULTS *****\n")
conf_matrix_holdout_common = confusion_matrix(test_labels_video_holdout, test_predictions_holdout_common)
plot_confusion_matrix(conf_matrix_holdout_common)

# Saving metrics in the dictionary
frame_metrics_holdout["Accuracy"].append(accuracy_holdout_common)
frame_metrics_holdout["Precision (0)"].append(precision_holdout_common[0])
frame_metrics_holdout["Precision (1)"].append(precision_holdout_common[1])
frame_metrics_holdout["Recall (0)"].append(recall_holdout_common[0])
frame_metrics_holdout["Recall (1)"].append(recall_holdout_common[1])
frame_metrics_holdout["F1 (0)"].append(f1_holdout_common[0])
frame_metrics_holdout["F1 (1)"].append(f1_holdout_common[1])
frame_metrics_holdout["AUC"].append(auc_holdout_common)

# One row per test sample: owning video, predicted probability and true label.
pred_df = pd.DataFrame({
    'video_id': test_video_names_holdout,
    'prediction': test_predictions_holdout_proba_common.flatten(),
    'label': test_labels_video_holdout,
})

# Group the samples by video. sort=False keeps the videos in order of first
# appearance, so every per-video array below is aligned with video_labels.
samples_by_video = pred_df.groupby('video_id', sort=False)

# Three ways of turning sample-level probabilities into one decision per video:
# average probability thresholded at 0.5, majority voting, threshold rule.
video_predictions_mean = samples_by_video['prediction'].apply(calculate_mean).values
video_predictions_binary_mean = (video_predictions_mean > 0.5).astype(int)
video_predictions_binary_majority = samples_by_video['prediction'].apply(calculate_majority).values
video_predictions_binary_threshold = samples_by_video['prediction'].apply(aggregate_by_threshold).values

# True label of each video, taken from its first sample.
video_labels = samples_by_video['label'].first().values

# For each strategy: (banner, hard predictions, scores used for AUC, target dict).
# Only the mean strategy has a continuous score; the other two feed their hard
# predictions to roc_auc_score.
for strategy_banner, strategy_binary, strategy_scores, strategy_metrics in (
    ("\n***** VIDEO LEVEL RESULTS (MEAN) *****\n",
     video_predictions_binary_mean, video_predictions_mean, video_metrics_holdout_mean),
    ("\n***** VIDEO LEVEL RESULTS (MAJORITY) *****\n",
     video_predictions_binary_majority, video_predictions_binary_majority, video_metrics_holdout_majority),
    ("\n***** VIDEO LEVEL RESULTS (THRESHOLD) *****\n",
     video_predictions_binary_threshold, video_predictions_binary_threshold, video_metrics_holdout_threshold),
):
    # Calculate video-level metrics (average=None -> one value per class)
    video_accuracy = accuracy_score(video_labels, strategy_binary)
    video_f1 = f1_score(video_labels, strategy_binary, zero_division=0, average=None)
    video_precision = precision_score(video_labels, strategy_binary, zero_division=0, average=None)
    video_recall = recall_score(video_labels, strategy_binary, zero_division=0, average=None)
    video_auc = roc_auc_score(video_labels, strategy_scores)

    # Store them in the dictionary of the current strategy
    strategy_metrics["Accuracy"].append(video_accuracy)
    strategy_metrics["Precision (0)"].append(video_precision[0])
    strategy_metrics["Precision (1)"].append(video_precision[1])
    strategy_metrics["Recall (0)"].append(video_recall[0])
    strategy_metrics["Recall (1)"].append(video_recall[1])
    strategy_metrics["F1 (0)"].append(video_f1[0])
    strategy_metrics["F1 (1)"].append(video_f1[1])
    strategy_metrics["AUC"].append(video_auc)

    print(strategy_banner)
    plot_confusion_matrix_2(video_labels, strategy_binary)

# Column header shared by the four result tables of this experiment.
holdout_result_columns = [f"INTER FUSION {sequence_length}-{num_segments}"]

# Each table shows one metric per row. After printing, the dictionary is
# rebound to a fresh one with the same keys and empty lists so that the next
# experiment starts from a clean slate.

# Frame-level table
df = pd.DataFrame.from_dict(frame_metrics_holdout, orient='index', columns=holdout_result_columns)
print("***** FRAME LEVEL RESULTS *****\n")
print(df)
frame_metrics_holdout = {metric: [] for metric in frame_metrics_holdout}

# Video-level table, mean aggregation
df = pd.DataFrame.from_dict(video_metrics_holdout_mean, orient='index', columns=holdout_result_columns)
print("***** VIDEO LEVEL RESULTS (MEAN) *****\n")
print(df)
video_metrics_holdout_mean = {metric: [] for metric in video_metrics_holdout_mean}

# Video-level table, majority voting
df = pd.DataFrame.from_dict(video_metrics_holdout_majority, orient='index', columns=holdout_result_columns)
print("***** VIDEO LEVEL RESULTS (MAJORITY) *****\n")
print(df)
video_metrics_holdout_majority = {metric: [] for metric in video_metrics_holdout_majority}

# Video-level table, threshold aggregation
df = pd.DataFrame.from_dict(video_metrics_holdout_threshold, orient='index', columns=holdout_result_columns)
print("***** VIDEO LEVEL RESULTS (THRESHOLD) *****\n")
print(df)
video_metrics_holdout_threshold = {metric: [] for metric in video_metrics_holdout_threshold}

### Cross-Validation

In [None]:
# Score names produced for every fold. Prefixed with "Mean " / "Std " they are
# also the keys that the metric dictionaries passed to
# cross_validation_video_audio_combined must already contain.
_COMBINED_CV_SCORE_NAMES = (
    "Accuracy",
    "Precision (0)", "Precision (1)",
    "Recall (0)", "Recall (1)",
    "F1 (0)", "F1 (1)",
    "AUC",
)


def _combined_cv_scores(y_true, y_pred, y_score):
    """Return accuracy, per-class precision/recall/F1 and ROC AUC as a dict keyed by score name."""
    # average=None yields one value per class; zero_division=0 reports an
    # undefined ratio as 0 instead of emitting a warning.
    precision = precision_score(y_true, y_pred, zero_division=0, average=None)
    recall = recall_score(y_true, y_pred, zero_division=0, average=None)
    f1 = f1_score(y_true, y_pred, zero_division=0, average=None)
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision (0)": precision[0],
        "Precision (1)": precision[1],
        "Recall (0)": recall[0],
        "Recall (1)": recall[1],
        "F1 (0)": f1[0],
        "F1 (1)": f1[1],
        "AUC": roc_auc_score(y_true, y_score),
    }


def _combined_cv_summarise(metrics_dict, fold_scores, banner):
    """Append the across-fold mean and std of every score to metrics_dict and print the accuracy summary."""
    per_score = {name: [scores[name] for scores in fold_scores]
                 for name in _COMBINED_CV_SCORE_NAMES}
    averages = {name: np.mean(values) for name, values in per_score.items()}
    deviations = {name: np.std(values) for name, values in per_score.items()}

    for name in _COMBINED_CV_SCORE_NAMES:
        metrics_dict["Mean " + name].append(averages[name])
    for name in _COMBINED_CV_SCORE_NAMES:
        metrics_dict["Std " + name].append(deviations[name])

    print(banner)
    print("Mean Accuracy:", averages["Accuracy"])
    print("Accuracy std:", deviations["Accuracy"])


def cross_validation_video_audio_combined(frame_metrics_dict, video_metrics_dict_mean,
                                          video_metrics_dict_majority, video_metrics_dict_threshold,
                                          data_video, data_video_pad, data_audio, data_audio_pad,
                                          labels, labels_pad, names_video, names_video_pad,
                                          num_folds=4):
    """Cross-validate the combined video+audio model and record the results.

    For every fold index ``i`` a fresh model is built with
    ``create_combined_model``, trained for 20 epochs (batch size 256) on all
    folds of ``data_video`` / ``data_audio`` / ``labels`` except fold ``i``,
    and evaluated on ``data_video_pad[i]`` / ``data_audio_pad[i]`` /
    ``labels_pad[i]``. Scores are computed per sample ("frame" level) and per
    video, the latter with three aggregation rules (``calculate_mean``
    thresholded at 0.5, ``calculate_majority``, ``aggregate_by_threshold``).

    Compared with the previous version, the frame-level ``f1_score`` now also
    uses ``zero_division=0`` (as every other per-class metric here), the unused
    concatenation of the training video names was removed, and the repeated
    bookkeeping was moved into two private helpers.

    Args:
        frame_metrics_dict: Dict receiving the sample-level summary. It must
            already map "Mean <score>" and "Std <score>" to lists for every
            score in ``_COMBINED_CV_SCORE_NAMES``; one value is appended to each.
        video_metrics_dict_mean: Same layout, video level, mean aggregation.
        video_metrics_dict_majority: Same layout, video level, majority voting.
        video_metrics_dict_threshold: Same layout, video level, threshold rule.
        data_video: Per-fold video feature arrays used for training.
        data_video_pad: Per-fold video feature arrays used for testing.
        data_audio: Per-fold audio feature arrays used for training.
        data_audio_pad: Per-fold audio feature arrays used for testing.
        labels: Per-fold labels matching ``data_video`` / ``data_audio``.
        labels_pad: Per-fold labels matching the ``*_pad`` arrays.
        names_video: Not used by this function; kept so existing calls keep
            working.
        names_video_pad: Per-fold video identifier of every test sample, used
            to group sample predictions by video.
        num_folds: Number of folds to evaluate.

    Returns:
        None. Results are appended to the four dictionaries and printed.
    """
    frame_fold_scores = []
    video_fold_scores = {"mean": [], "majority": [], "threshold": []}

    for i in range(num_folds):
        print(f'\nFold {i+1}/{num_folds}')

        # Fold i of the *_pad data is the test set; all other folds of the
        # regular data are merged sample by sample into the training set.
        test_video, test_audio = data_video_pad[i], data_audio_pad[i]
        test_labels, test_video_names = labels_pad[i], names_video_pad[i]
        train_video = np.array([item for idx, fold in enumerate(data_video) if idx != i for item in fold])
        train_audio = np.array([item for idx, fold in enumerate(data_audio) if idx != i for item in fold])
        train_labels = np.array([label for idx, fold in enumerate(labels) if idx != i for label in fold])

        print(train_video.shape, train_audio.shape, train_labels.shape)
        print(test_video.shape, test_audio.shape, test_labels.shape)

        # A new model per fold so that no weights leak between folds.
        fold_model = create_combined_model(train_video.shape[1:],
                                           train_audio.shape[1],
                                           train_audio.shape[2])
        fold_model.fit(
            [train_video, train_audio], train_labels,
            epochs=20,
            batch_size=256,
            shuffle=True,
            verbose=0
        )

        # Sample-level evaluation: probabilities thresholded at 0.5.
        predictions_proba = fold_model.predict([test_video, test_audio])
        predictions = (predictions_proba > 0.5).astype(int)
        frame_scores = _combined_cv_scores(test_labels, predictions, predictions_proba)
        frame_fold_scores.append(frame_scores)

        # One row per test sample, then one aggregate per video. All aggregates
        # come from the same grouping, so they are aligned with video_labels.
        pred_df = pd.DataFrame({'video_id': test_video_names,
                                'prediction': predictions_proba.flatten(),
                                'label': test_labels})
        by_video = pred_df.groupby('video_id')
        video_predictions_mean = by_video['prediction'].apply(calculate_mean).values
        video_predictions_binary_mean = (video_predictions_mean > 0.5).astype(int)
        video_predictions_binary_majority = by_video['prediction'].apply(calculate_majority).values
        video_predictions_binary_threshold = by_video['prediction'].apply(aggregate_by_threshold).values
        video_labels = by_video['label'].first().values

        # (hard predictions, scores used for AUC) per strategy; only the mean
        # strategy has a continuous score, the others reuse the hard predictions.
        aggregated = {
            "mean": (video_predictions_binary_mean, video_predictions_mean),
            "majority": (video_predictions_binary_majority, video_predictions_binary_majority),
            "threshold": (video_predictions_binary_threshold, video_predictions_binary_threshold),
        }
        video_scores = {}
        for strategy, (binary_predictions, auc_scores) in aggregated.items():
            video_scores[strategy] = _combined_cv_scores(video_labels, binary_predictions, auc_scores)
            video_fold_scores[strategy].append(video_scores[strategy])

        print(f"Accuracy fold (frame-based): {frame_scores['Accuracy']}")
        print(f"Accuracy fold (video-based-mean): {video_scores['mean']['Accuracy']}")
        print(f"Accuracy fold (video-based-majority): {video_scores['majority']['Accuracy']}")
        print(f"Accuracy fold (video-based-threshold): {video_scores['threshold']['Accuracy']}")

    # Mean and standard deviation over the folds, per level / strategy.
    _combined_cv_summarise(frame_metrics_dict, frame_fold_scores,
                           "\n\nRESULTS on FRAMES:\n")
    _combined_cv_summarise(video_metrics_dict_mean, video_fold_scores["mean"],
                           "\n\nRESULTS on VIDEOS (MEAN):\n")
    _combined_cv_summarise(video_metrics_dict_majority, video_fold_scores["majority"],
                           "\n\nRESULTS on VIDEOS (MAJORITY):\n")
    _combined_cv_summarise(video_metrics_dict_threshold, video_fold_scores["threshold"],
                           "\n\nRESULTS on VIDEOS (THRESHOLD):\n")

In [None]:
# Run the 4-fold cross-validation of the fusion model. Training uses the
# regular fold arrays, testing uses their `_pad` counterparts; the summaries
# are appended to the four *_metrics_cross dictionaries.
cross_validation_video_audio_combined(
    frame_metrics_dict=frame_metrics_cross,
    video_metrics_dict_mean=video_metrics_cross_mean,
    video_metrics_dict_majority=video_metrics_cross_majority,
    video_metrics_dict_threshold=video_metrics_cross_threshold,
    data_video=data_video_cross,
    data_video_pad=data_video_cross_pad,
    data_audio=data_audio_cross_inter,
    data_audio_pad=data_audio_cross_pad_inter,
    labels=labels_video_cross,
    labels_pad=labels_video_cross_pad,
    names_video=names_video_cross,
    names_video_pad=names_video_cross_pad,
    num_folds=4,
)

# Column header shared by the four result tables of this experiment.
cross_result_columns = [f"INTER FUSION {sequence_length}-{num_segments}"]

# After each table is printed its dictionary is rebound to a fresh one with
# the same keys and empty lists, ready for the next experiment.

# Frame-level table
df = pd.DataFrame.from_dict(frame_metrics_cross, orient='index', columns=cross_result_columns)
print("***** FRAME LEVEL RESULTS *****\n")
print(df)
frame_metrics_cross = {metric: [] for metric in frame_metrics_cross}

# Video-level table, mean aggregation
df = pd.DataFrame.from_dict(video_metrics_cross_mean, orient='index', columns=cross_result_columns)
print("***** VIDEO LEVEL RESULTS (MEAN) *****\n")
print(df)
video_metrics_cross_mean = {metric: [] for metric in video_metrics_cross_mean}

# Video-level table, majority voting
df = pd.DataFrame.from_dict(video_metrics_cross_majority, orient='index', columns=cross_result_columns)
print("***** VIDEO LEVEL RESULTS (MAJORITY) *****\n")
print(df)
video_metrics_cross_majority = {metric: [] for metric in video_metrics_cross_majority}

# Video-level table, threshold aggregation
df = pd.DataFrame.from_dict(video_metrics_cross_threshold, orient='index', columns=cross_result_columns)
print("***** VIDEO LEVEL RESULTS (THRESHOLD) *****\n")
print(df)
video_metrics_cross_threshold = {metric: [] for metric in video_metrics_cross_threshold}

## Early Fusion:
We extract sequences of 64 or 20 frames from each video, with 20 (aux) + 7 (emotions) = 27 features per frame. Next, we extract the audio corresponding to the same part of the video and likewise organize it into sequences of shape 64x43 or 20x43. Finally, we combine the two types of features (27 + 43 = 70 features in total) into an array of shape (num_seq,70) and train a single model on it.

### features Video

In [None]:
# Sequence-generation settings for the early-fusion video features.
sequence_length = 20 # or 64
frame_step = 1
# A single column group: aus_cols_aux concatenated with aus_cols_emotions.
cols_groups = [aus_cols_aux+aus_cols_emotions]
padding = 'const'

# For every video id, materialise everything gen_sequence yields for the rows
# of that video. Training sequences are generated with padding=None, while
# validation and test sequences use the `padding` mode set above.
seq_gen_train = [
    list(gen_sequence(train_dataset[train_dataset['input'] == video_id],
                      sequence_length, cols_groups, frame_step,
                      padding=None))
    for video_id in train_list
]
seq_gen_val = [
    list(gen_sequence(val_dataset[val_dataset['input'] == video_id],
                      sequence_length, cols_groups, frame_step,
                      padding))
    for video_id in val_list
]
seq_gen_test = [
    list(gen_sequence(test_dataset[test_dataset['input'] == video_id],
                      sequence_length, cols_groups, frame_step,
                      padding))
    for video_id in test_list
]

# Drop the videos for which no sequence was produced.
seq_gen_train = [per_video for per_video in seq_gen_train if per_video]
seq_gen_val = [per_video for per_video in seq_gen_val if per_video]
seq_gen_test = [per_video for per_video in seq_gen_test if per_video]

# Split every yielded item into its three components: element 0 goes to the
# sequence arrays, element 1 to the label arrays and element 2 to the
# video-name arrays.
seq_array_train = [[item[0] for item in per_video] for per_video in seq_gen_train]
label_array_train = [[item[1] for item in per_video] for per_video in seq_gen_train]
video_array_train = [[item[2] for item in per_video] for per_video in seq_gen_train]

seq_array_val = [[item[0] for item in per_video] for per_video in seq_gen_val]
label_array_val = [[item[1] for item in per_video] for per_video in seq_gen_val]
video_array_val = [[item[2] for item in per_video] for per_video in seq_gen_val]

seq_array_test = [[item[0] for item in per_video] for per_video in seq_gen_test]
label_array_test = [[item[1] for item in per_video] for per_video in seq_gen_test]
video_array_test = [[item[2] for item in per_video] for per_video in seq_gen_test]

# Merge the per-video lists into one array per split: float32 sequences,
# flat float32 labels, and the video name of every sequence.
seq_array_train = np.concatenate(seq_array_train).astype(np.float32)
train_labels_video_holdout_early = np.concatenate(label_array_train).astype(np.float32).reshape(-1)
train_video_names_holdout_early = np.concatenate(video_array_train)

seq_array_val = np.concatenate(seq_array_val).astype(np.float32)
val_labels_video_holdout_early = np.concatenate(label_array_val).astype(np.float32).reshape(-1)
val_video_names_holdout_early = np.concatenate(video_array_val)

seq_array_test = np.concatenate(seq_array_test).astype(np.float32)
test_labels_video_holdout_early = np.concatenate(label_array_test).astype(np.float32).reshape(-1)
test_video_names_holdout_early = np.concatenate(video_array_test)

# Swap the last two axes, then squeeze away the trailing axis (np.squeeze with
# an explicit axis requires it to have length 1), leaving a 3-D array per split.
train_features_video_holdout_early = np.squeeze(np.transpose(seq_array_train, (0, 1, 3, 2)), axis=-1)
print(train_features_video_holdout_early.shape, train_labels_video_holdout_early.shape)

val_features_video_holdout_early = np.squeeze(np.transpose(seq_array_val, (0, 1, 3, 2)), axis=-1)
print(val_features_video_holdout_early.shape, val_labels_video_holdout_early.shape)

test_features_video_holdout_early = np.squeeze(np.transpose(seq_array_test, (0, 1, 3, 2)), axis=-1)
print(test_features_video_holdout_early.shape, test_labels_video_holdout_early.shape)

# Videos that actually produced sequences, in order of first appearance:
# np.unique returns them sorted, so they are re-ordered by first-occurrence index.
new_train_list_early, indices = np.unique(train_video_names_holdout_early, return_index=True)
new_train_list_early = new_train_list_early[np.argsort(indices)].tolist()
new_val_list_early, indices = np.unique(val_video_names_holdout_early, return_index=True)
new_val_list_early = new_val_list_early[np.argsort(indices)].tolist()
new_test_list_early, indices = np.unique(test_video_names_holdout_early, return_index=True)
new_test_list_early = new_test_list_early[np.argsort(indices)].tolist()

In [None]:
# Per-fold containers. The "*_pad_*" lists receive the windows generated with
# the global `padding` setting, the others those generated with padding=None.
data_video_cross_early, labels_video_cross_early, names_video_cross_early = [], [], []
data_video_cross_pad_early, labels_video_cross_pad_early, names_video_cross_pad_early = [], [], []
new_cross_list_early, new_cross_list_pad_early = [], []

for i in range(4):
    fold_dataset = cross_video_datasets[i]
    fold_list = cross_video_lists[i]

    # The same pipeline runs twice per fold - first without padding, then with
    # the global padding setting - and only the destination lists differ.
    fold_variants = (
        (None, data_video_cross_early, labels_video_cross_early,
         names_video_cross_early, new_cross_list_early),
        (padding, data_video_cross_pad_early, labels_video_cross_pad_early,
         names_video_cross_pad_early, new_cross_list_pad_early),
    )
    for fold_padding, data_out, labels_out, names_out, order_out in fold_variants:
        # One list of generated items per video; videos yielding nothing are dropped
        seq_gen_fold = [
            list(gen_sequence(fold_dataset[fold_dataset['input'] == video_id],
                              sequence_length, cols_groups, frame_step,
                              padding=fold_padding))
            for video_id in fold_list
        ]
        seq_gen_fold = [windows for windows in seq_gen_fold if len(windows) > 0]

        # Separate the three positional fields of every generated item
        seq_array_fold = [[item[0] for item in windows] for windows in seq_gen_fold]
        label_array_fold = [[item[1] for item in windows] for windows in seq_gen_fold]
        video_array_fold = [[item[2] for item in windows] for windows in seq_gen_fold]

        # Stack all videos of the fold into single arrays
        fold_sequences = np.concatenate(seq_array_fold).astype(np.float32)
        fold_labels = np.concatenate(label_array_fold).astype(np.float32).reshape(-1)
        fold_video_names = np.concatenate(video_array_fold)

        # Swap the last two axes and drop the resulting singleton axis
        fold_sequences = np.squeeze(np.transpose(fold_sequences, (0, 1, 3, 2)), axis=-1)

        print(fold_sequences.shape, fold_labels.shape)

        data_out.append(fold_sequences)
        labels_out.append(fold_labels)
        names_out.append(fold_video_names)

        # Unique video names in order of first appearance
        fold_video_names, indices = np.unique(fold_video_names, return_index=True)
        fold_video_names = fold_video_names[np.argsort(indices)].tolist()
        order_out.append(fold_video_names)

### Audio features
The commented-out code below was used once to extract the audio features; the resulting arrays were then saved and downloaded so that the extraction does not have to be repeated.

In [None]:
def load_audio_data_segmented(audio_dataset_path, video_dataset, seq_len=sequence_length*frame_step, frame_step=frame_step, padding=None):
    """Cut each audio recording into segments aligned with its video frame windows.

    Every ``.xlsx`` file found under ``audio_dataset_path`` is read; its
    ``label``, ``audio name`` and ``video name`` columns describe one recording
    per row. The audio is loaded from ``<audio_dataset_path>/Audio/<audio name>``
    and the frame count of the matching video is the number of rows of
    ``video_dataset`` whose ``input`` column equals
    ``<audio_dataset_path>/Statements/<video name>``. A window of ``seq_len``
    frames is slid over the recording in hops of ``frame_step`` frames, with
    frames converted to milliseconds through the clip's own frame/duration ratio.

    Args:
        audio_dataset_path: Root directory holding the Excel index file(s) and
            the ``Audio`` sub-directory.
        video_dataset: DataFrame with an ``input`` column of video paths; the
            number of rows matching a video is used as its frame count.
        seq_len: Window length, expressed in video frames.
        frame_step: Hop between consecutive windows, expressed in video frames.
        padding: When truthy, a recording too short to hold a single window
            contributes its whole signal as one segment.
            NOTE(review): a falsy padding value such as 0 disables this -
            confirm this matches how gen_sequence interprets `padding`.

    Returns:
        A tuple ``(audios, labels, names)``: ``audios`` is a list of
        ``(segment, sample_rate)`` tuples, ``labels`` and ``names`` are NumPy
        arrays holding one label / audio file name per segment.
    """
    audios = []
    audio_labels = []
    audio_names_list = []

    for root, _dirs, files in os.walk(audio_dataset_path):
        for file in files:
            if not file.endswith('.xlsx'):
                continue

            # Index file listing label / audio name / video name per recording
            df = pd.read_excel(os.path.join(root, file))
            rows = zip(df['label'].tolist(), df['audio name'].tolist(), df['video name'].tolist())

            for label, audio_name, video_name in rows:
                audio_path = os.path.join(audio_dataset_path, 'Audio', audio_name)
                video_path = os.path.join(audio_dataset_path, 'Statements', video_name)
                total_frames = len(video_dataset[video_dataset['input'] == video_path])

                if not os.path.exists(audio_path):
                    print(f"Audio file not found: {audio_path}")
                    continue

                # sr=None keeps the file's original sampling rate
                signal, sample_rate = librosa.load(audio_path, sr=None)
                total_duration_ms = librosa.get_duration(y=signal, sr=sample_rate) * 1000

                # Without frames (video absent from video_dataset) or without
                # audio samples the frame/time ratio is undefined and the
                # divisions below would fail; there is nothing to align.
                if total_frames == 0 or total_duration_ms == 0:
                    print(f"Skipping {audio_path}: no video frames or empty audio")
                    continue

                frame_rate = total_frames / total_duration_ms  # frames per millisecond
                seq_duration_ms = seq_len / frame_rate
                hop_duration_ms = frame_step / frame_rate

                # Slide the window over the signal; rounding to 3 decimals
                # absorbs the floating-point drift of the accumulated start time.
                segments = []
                start_time_ms = 0
                while np.round(start_time_ms + seq_duration_ms, 3) <= np.round(total_duration_ms, 3):
                    start_sample = int((start_time_ms / 1000) * sample_rate)
                    end_sample = int(((start_time_ms + seq_duration_ms) / 1000) * sample_rate)
                    segments.append(signal[start_sample:end_sample])
                    start_time_ms += hop_duration_ms

                # Recording shorter than one window: keep it whole when padding is on
                if not segments and padding:
                    segments.append(signal)

                for segment in segments:
                    audios.append((segment, sample_rate))
                    audio_labels.append(label)
                    audio_names_list.append(audio_name)

    return audios, np.array(audio_labels), np.array(audio_names_list)

# Load and segment the raw audio of each holdout split. The train call passes
# padding=None while val/test pass the global `padding` setting.
# The returned *_audio_names_* arrays are what get_new_audio_array_early later
# uses to match the cached audio features with the video sequences.
train_audio_holdout_early, train_labels_audio_holdout_early, train_audio_names_holdout_early = load_audio_data_segmented(train_audio_path, train_dataset, sequence_length*frame_step, frame_step, padding=None)
val_audio_holdout_early, val_labels_audio_holdout_early, val_audio_names_holdout_early = load_audio_data_segmented(val_audio_path, val_dataset, sequence_length*frame_step, frame_step, padding)
test_audio_holdout_early, test_labels_audio_holdout_early, test_audio_names_holdout_early = load_audio_data_segmented(test_audio_path, test_dataset, sequence_length*frame_step, frame_step, padding)

# Disabled code kept as a bare string literal (it is never executed): the
# one-off feature extraction for the holdout splits.
"""# Feature extraction for each audio
train_features_audio_holdout_early = audio_features_extract(train_audio_holdout_early, num_segments=20)
val_features_audio_holdout_early = audio_features_extract(val_audio_holdout_early, num_segments=20)
test_features_audio_holdout_early = audio_features_extract(test_audio_holdout_early, num_segments=20)

# Print dimensions
print(train_features_audio_holdout_early.shape, train_labels_audio_holdout_early.shape)
print(val_features_audio_holdout_early.shape, val_labels_audio_holdout_early.shape)
print(test_features_audio_holdout_early.shape, test_labels_audio_holdout_early.shape)"""

# One directory per cross-validation fold: <cross_audio_path>/fold_0 ... fold_3
fold_dirs = [f"{cross_audio_path}/fold_{index}" for index in range(4)]
# With the extraction lines below commented out, only the two names_* lists
# are actually filled by the loop; the data_* / labels_* lists stay empty here.
data_audio_cross_early, labels_audio_cross_early, names_audio_cross_early = [], [], []
data_audio_cross_pad_early, labels_audio_cross_pad_early, names_audio_cross_pad_early = [], [], []
 
# Loading (and, when re-enabled, feature extraction) for each fold
for i in range(4):
    print(f"Processing fold {i}...")

    # No padding (seq_len and frame_step fall back to the function defaults)
    fold_audio, fold_audio_labels, fold_audio_names = load_audio_data_segmented(fold_dirs[i], cross_video_datasets[i], padding=None)
    #fold_audio_features = audio_features_extract(fold_audio, num_segments=20)
    #print(fold_audio_features.shape, fold_audio_labels.shape)
    #data_audio_cross_early.append(fold_audio_features)
    #labels_audio_cross_early.append(fold_audio_labels)
    names_audio_cross_early.append(fold_audio_names)

    # Padding (global `padding` setting)
    fold_audio, fold_audio_labels, fold_audio_names = load_audio_data_segmented(fold_dirs[i], cross_video_datasets[i], padding=padding)
    #fold_audio_features = audio_features_extract(fold_audio, num_segments=20)
    #print(fold_audio_features.shape, fold_audio_labels.shape)
    #data_audio_cross_pad_early.append(fold_audio_features)
    #labels_audio_cross_pad_early.append(fold_audio_labels)
    names_audio_cross_pad_early.append(fold_audio_names)

# Disabled code kept as a bare string literal (it is never executed): saves the
# extracted feature/label arrays as .npy files and zips the folder for download.
# Only features and labels are saved, not the audio name arrays.
"""os.makedirs("/kaggle/working/early_array", exist_ok=True)

# Save the arrays for later use
np.save("early_array/train_features_audio.npy", train_features_audio_holdout_early)
np.save("early_array/val_features_audio.npy", val_features_audio_holdout_early)
np.save("early_array/test_features_audio.npy", test_features_audio_holdout_early)
np.save("early_array/train_labels_audio.npy", train_labels_audio_holdout_early)
np.save("early_array/val_labels_audio.npy", val_labels_audio_holdout_early)
np.save("early_array/test_labels_audio.npy", test_labels_audio_holdout_early)
for i in range(4):
    np.save(f"early_array/fold_{i}_features_audio.npy", data_audio_cross_early[i])
    np.save(f"early_array/fold_{i}_labels_audio.npy", labels_audio_cross_early[i])
    np.save(f"early_array/fold_{i}_features_audio_pad.npy", data_audio_cross_pad_early[i])
    np.save(f"early_array/fold_{i}_labels_audio_pad.npy", labels_audio_cross_pad_early[i])

import shutil

# Path to the dataset folder
dataset_dir = "/kaggle/working/early_array"
 
# Path to save the zip file in the Kaggle output directory
output_zip = "/kaggle/working/early_array.zip"
 
# Create the zip file
shutil.make_archive(output_zip.replace(".zip", ""), 'zip', dataset_dir)
 
print(f"The dataset has been compressed into {output_zip}. You can download it from the Kaggle output.")#"""

In [None]:
num_segments = 20
features_audio_input_path = f"/kaggle/input/truthlie-audio-features/truthlie_audio_original_{sequence_length}_frame_{num_segments}_30"


def _load_saved_array(file_name):
    """Load one cached .npy array from the pre-extracted audio feature folder."""
    return np.load(f"{features_audio_input_path}/{file_name}")


# Holdout splits: cached features first, then the matching labels
train_features_audio_holdout_early = _load_saved_array("train_features_audio.npy")
val_features_audio_holdout_early = _load_saved_array("val_features_audio.npy")
test_features_audio_holdout_early = _load_saved_array("test_features_audio.npy")
train_labels_audio_holdout_early = _load_saved_array("train_labels_audio.npy")
val_labels_audio_holdout_early = _load_saved_array("val_labels_audio.npy")
test_labels_audio_holdout_early = _load_saved_array("test_labels_audio.npy")
for cached_features, cached_labels in (
    (train_features_audio_holdout_early, train_labels_audio_holdout_early),
    (val_features_audio_holdout_early, val_labels_audio_holdout_early),
    (test_features_audio_holdout_early, test_labels_audio_holdout_early),
):
    print(cached_features.shape, cached_labels.shape)

# Cross-validation folds: the plain and the "_pad" variant of every fold
data_audio_cross_early, labels_audio_cross_early = [], []
data_audio_cross_pad_early, labels_audio_cross_pad_early = [], []
for i in range(4):
    for suffix, data_out, labels_out in (
        ("", data_audio_cross_early, labels_audio_cross_early),
        ("_pad", data_audio_cross_pad_early, labels_audio_cross_pad_early),
    ):
        data_out.append(_load_saved_array(f"fold_{i}_features_audio{suffix}.npy"))
        labels_out.append(_load_saved_array(f"fold_{i}_labels_audio{suffix}.npy"))
        print(data_out[i].shape, labels_out[i].shape)

In [None]:
def get_new_audio_array_early(features_list,labels_list,audios_list,video_list):
    """Reorder per-segment audio features so that they follow `video_list`.

    Segments are grouped by the audio file name without its extension, then
    emitted video by video in the order of `video_list`; a video is matched to
    its audio through the base name of the video path without its extension.
    Within a clip the segments keep the order they have in `features_list`.

    Args:
        features_list: One feature entry per audio segment.
        labels_list: One label per audio segment, parallel to `features_list`.
        audios_list: Audio file name of every segment, parallel to `features_list`.
        video_list: Video paths (or names) giving the desired clip order.

    Returns:
        A tuple ``(features, labels)`` of NumPy arrays in `video_list` order.

    Raises:
        KeyError: If a video in `video_list` has no audio segment.
    """
    features_by_clip = {}
    labels_by_clip = {}

    for feature, label, audio_name in zip(features_list, labels_list, audios_list):
        # splitext removes the extension whatever its length (".wav", ".flac", ...)
        clip_key = os.path.splitext(audio_name)[0]
        features_by_clip.setdefault(clip_key, []).append(feature)
        labels_by_clip.setdefault(clip_key, []).append(label)

    ordered_features = []
    ordered_labels = []
    for video_name in video_list:
        clip_key = os.path.splitext(os.path.basename(video_name))[0]
        if clip_key not in features_by_clip:
            raise KeyError(f"No audio segments found for video '{video_name}' (key '{clip_key}')")
        ordered_features.extend(features_by_clip[clip_key])
        ordered_labels.extend(labels_by_clip[clip_key])

    return np.array(ordered_features), np.array(ordered_labels)

# Bring the cached audio features of every holdout split into the order of the
# corresponding video list, replacing the arrays in place.
(train_features_audio_holdout_early,
 train_labels_audio_holdout_early) = get_new_audio_array_early(
    train_features_audio_holdout_early,
    train_labels_audio_holdout_early,
    train_audio_names_holdout_early,
    new_train_list_early,
)
print(train_features_audio_holdout_early.shape, train_labels_audio_holdout_early.shape)

(val_features_audio_holdout_early,
 val_labels_audio_holdout_early) = get_new_audio_array_early(
    val_features_audio_holdout_early,
    val_labels_audio_holdout_early,
    val_audio_names_holdout_early,
    new_val_list_early,
)
print(val_features_audio_holdout_early.shape, val_labels_audio_holdout_early.shape)

(test_features_audio_holdout_early,
 test_labels_audio_holdout_early) = get_new_audio_array_early(
    test_features_audio_holdout_early,
    test_labels_audio_holdout_early,
    test_audio_names_holdout_early,
    new_test_list_early,
)
print(test_features_audio_holdout_early.shape, test_labels_audio_holdout_early.shape)

# Same re-ordering for every fold: plain variant first, padded variant second
for i in range(4):
    for data_store, labels_store, names_store, order_store in (
        (data_audio_cross_early, labels_audio_cross_early,
         names_audio_cross_early, new_cross_list_early),
        (data_audio_cross_pad_early, labels_audio_cross_pad_early,
         names_audio_cross_pad_early, new_cross_list_pad_early),
    ):
        fold_features_audio_early, fold_labels_audio_early = get_new_audio_array_early(
            data_store[i], labels_store[i], names_store[i], order_store[i])
        print(fold_features_audio_early.shape, fold_labels_audio_early.shape)
        data_store[i] = fold_features_audio_early
        labels_store[i] = fold_labels_audio_early

In [None]:
# Sanity check: after re-ordering, the audio labels must match the video labels
# element by element. np.array_equal returns False for arrays of different
# shape, whereas all(a == b) fails with an exception in that case instead of
# reporting the mismatch.
print(np.array_equal(train_labels_audio_holdout_early, train_labels_video_holdout_early))
print(np.array_equal(val_labels_audio_holdout_early, val_labels_video_holdout_early))
print(np.array_equal(test_labels_audio_holdout_early, test_labels_video_holdout_early))

for i in range(4):
    print(np.array_equal(labels_audio_cross_early[i], labels_video_cross_early[i]))
    print(np.array_equal(labels_audio_cross_pad_early[i], labels_video_cross_pad_early[i]))

### Model

In [None]:
def combine_features(features_video, features_audio):
    """Fuse video and audio features by joining them along the last axis.

    Both inputs must agree on every axis except the last one.
    """
    modalities = (features_video, features_audio)
    fused = np.concatenate(modalities, axis=-1)
    return fused

def create_lstm_model(input_shape, hidden_size=128, learning_rate=0.001):
    """Build and compile the single-layer LSTM binary classifier.

    Args:
        input_shape: Shape of one input sample, passed to the input layer.
        hidden_size: Number of LSTM units.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        The compiled Keras ``Sequential`` model (binary cross-entropy loss,
        accuracy metric, one sigmoid output unit).
    """
    layer_stack = [
        InputLayer(shape=input_shape),
        # Masking with mask_value=0.0 sits in front of the LSTM
        Masking(mask_value=0.0),
        # Only the final LSTM output is passed on (return_sequences=False)
        LSTM(hidden_size, return_sequences=False, use_cudnn=False),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)

    optimizer = Adam(learning_rate=learning_rate)
    network.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return network

In [None]:
# Early fusion of the holdout splits: video and audio features of every
# sequence are joined by combine_features.
train_features_combined_holdout = combine_features(
    train_features_video_holdout_early, train_features_audio_holdout_early)
val_features_combined_holdout = combine_features(
    val_features_video_holdout_early, val_features_audio_holdout_early)
test_features_combined_holdout = combine_features(
    test_features_video_holdout_early, test_features_audio_holdout_early)

# Report the fused feature shape next to the label shape of each split
for fused_features, fused_labels in (
    (train_features_combined_holdout, train_labels_video_holdout_early),
    (val_features_combined_holdout, val_labels_video_holdout_early),
    (test_features_combined_holdout, test_labels_video_holdout_early),
):
    print(fused_features.shape, fused_labels.shape)

In [None]:
# Model creation: the input shape is taken from axes 1 and 2 of the fused training array
combined_model_2_holdout = create_lstm_model(
    input_shape=(train_features_combined_holdout.shape[1], train_features_combined_holdout.shape[2]),
    hidden_size=128, learning_rate=0.001)

# Model training
combined_history_2_holdout = combined_model_2_holdout.fit(
    train_features_combined_holdout, train_labels_video_holdout_early,
    validation_data=(val_features_combined_holdout, val_labels_video_holdout_early),
    epochs=20,
    batch_size=32,
    verbose=1
)

plot_history(combined_history_2_holdout)

# Model evaluation: predicted probabilities, binarised at 0.5
test_predictions_holdout_proba_common_2 = combined_model_2_holdout.predict(test_features_combined_holdout)
test_predictions_holdout_common_2 = (test_predictions_holdout_proba_common_2 > 0.5).astype(int)

# Sequence-level metrics (per-class precision / recall / F1 via average=None).
# zero_division=0 is passed to F1 as well, consistently with every other
# metric call in this cell, so an empty class yields 0 without a warning.
accuracy_holdout_common_2 = accuracy_score(test_labels_video_holdout_early, test_predictions_holdout_common_2)
precision_holdout_common_2 = precision_score(test_labels_video_holdout_early, test_predictions_holdout_common_2, zero_division=0, average=None)
recall_holdout_common_2 = recall_score(test_labels_video_holdout_early, test_predictions_holdout_common_2, zero_division=0, average=None)
f1_holdout_common_2 = f1_score(test_labels_video_holdout_early, test_predictions_holdout_common_2, zero_division=0, average=None)
auc_holdout_common_2 = roc_auc_score(test_labels_video_holdout_early, test_predictions_holdout_proba_common_2)

# Displaying results
print("***** SEQUENCE LEVEL RESULTS *****\n")
conf_matrix_holdout_common_2 = confusion_matrix(test_labels_video_holdout_early, test_predictions_holdout_common_2)
plot_confusion_matrix(conf_matrix_holdout_common_2)

# Saving metrics in dictionary
frame_metrics_holdout["Accuracy"].append(accuracy_holdout_common_2)
for class_index in (0, 1):
    frame_metrics_holdout[f"Precision ({class_index})"].append(precision_holdout_common_2[class_index])
    frame_metrics_holdout[f"Recall ({class_index})"].append(recall_holdout_common_2[class_index])
    frame_metrics_holdout[f"F1 ({class_index})"].append(f1_holdout_common_2[class_index])
frame_metrics_holdout["AUC"].append(auc_holdout_common_2)

# Create a dataframe with predictions and video ids
pred_df = pd.DataFrame({'video_id': test_video_names_holdout_early, 'prediction': test_predictions_holdout_proba_common_2.flatten(), 'label': test_labels_video_holdout_early})

# Every aggregation below groups by 'video_id', so the resulting arrays share
# the same (sorted) video order and can be compared position by position.
grouped_predictions = pred_df.groupby('video_id')['prediction']

# Group by video and calculate the average
video_predictions_mean = grouped_predictions.apply(calculate_mean).values
video_predictions_binary_mean = (video_predictions_mean > 0.5).astype(int)

# Group by video and apply the majority voting rule
video_predictions_binary_majority = grouped_predictions.apply(calculate_majority).values

# Group by video and apply the aggregation rule
video_predictions_binary_threshold = grouped_predictions.apply(aggregate_by_threshold).values

# Group by video and get labels
video_labels = pred_df.groupby('video_id')['label'].first().values

# Video-level metrics, one pass per aggregation rule. Each entry holds: title,
# binary decisions, scores used for the AUC, destination dictionary. Only the
# mean rule has continuous scores; the other two rules compute the AUC on
# their binary decisions.
video_level_runs = (
    ("MEAN", video_predictions_binary_mean, video_predictions_mean, video_metrics_holdout_mean),
    ("MAJORITY", video_predictions_binary_majority, video_predictions_binary_majority, video_metrics_holdout_majority),
    ("THRESHOLD", video_predictions_binary_threshold, video_predictions_binary_threshold, video_metrics_holdout_threshold),
)
for rule_name, rule_predictions, rule_scores, rule_metrics in video_level_runs:
    # Calculate metrics for video
    video_accuracy = accuracy_score(video_labels, rule_predictions)
    video_f1 = f1_score(video_labels, rule_predictions, zero_division=0, average=None)
    video_precision = precision_score(video_labels, rule_predictions, zero_division=0, average=None)
    video_recall = recall_score(video_labels, rule_predictions, zero_division=0, average=None)
    video_auc = roc_auc_score(video_labels, rule_scores)

    rule_metrics["Accuracy"].append(video_accuracy)
    for class_index in (0, 1):
        rule_metrics[f"Precision ({class_index})"].append(video_precision[class_index])
        rule_metrics[f"Recall ({class_index})"].append(video_recall[class_index])
        rule_metrics[f"F1 ({class_index})"].append(video_f1[class_index])
    rule_metrics["AUC"].append(video_auc)

    print(f"\n***** VIDEO LEVEL RESULTS ({rule_name}) *****\n")
    plot_confusion_matrix_2(video_labels, rule_predictions)

# Summary tables. Each dictionary is printed as a one-column dataframe and then
# re-bound to a fresh dictionary with empty lists.
results_column = f"EARLY FUSION {sequence_length}-{num_segments}"

# Creation of dataframe
df = pd.DataFrame.from_dict(frame_metrics_holdout, orient='index', columns=[results_column])
print("***** FRAME LEVEL RESULTS *****\n")
print(df)
frame_metrics_holdout = {key: [] for key in frame_metrics_holdout}

# Creation of dataframe
df = pd.DataFrame.from_dict(video_metrics_holdout_mean, orient='index', columns=[results_column])
print("***** VIDEO LEVEL RESULTS (MEAN) *****\n")
print(df)
video_metrics_holdout_mean = {key: [] for key in video_metrics_holdout_mean}

# Creation of dataframe
df = pd.DataFrame.from_dict(video_metrics_holdout_majority, orient='index', columns=[results_column])
print("***** VIDEO LEVEL RESULTS (MAJORITY) *****\n")
print(df)
video_metrics_holdout_majority = {key: [] for key in video_metrics_holdout_majority}

# Creation of dataframe
df = pd.DataFrame.from_dict(video_metrics_holdout_threshold, orient='index', columns=[results_column])
print("***** VIDEO LEVEL RESULTS (THRESHOLD) *****\n")
print(df)
video_metrics_holdout_threshold = {key: [] for key in video_metrics_holdout_threshold}

### Cross-Validation

In [None]:
def cross_validation_video_audio_combined_2(frame_metrics_dict, video_metrics_dict_mean,
                                            video_metrics_dict_majority, video_metrics_dict_threshold,
                                            data_video, data_video_pad, data_audio, data_audio_pad,
                                            labels, labels_pad, names_video, names_video_pad,
                                            num_folds=4):
    # Performance List Videos
    frame_accuracy_score = [] 
    frame_f1_score_0 = [] 
    frame_precision_score_0 = [] 
    frame_recall_score_0 = []
    frame_f1_score_1 = [] 
    frame_precision_score_1 = [] 
    frame_recall_score_1 = []
    frame_auc_score = []
    
    # Performance List Videos
    video_accuracy_score_mean = [] 
    video_f1_score_mean_0 = [] 
    video_precision_score_mean_0 = [] 
    video_recall_score_mean_0 = []
    video_f1_score_mean_1 = [] 
    video_precision_score_mean_1 = [] 
    video_recall_score_mean_1 = []
    video_auc_score_mean = []

    # Performance List Videos
    video_accuracy_score_majority = [] 
    video_f1_score_majority_0 = [] 
    video_precision_score_majority_0 = [] 
    video_recall_score_majority_0 = []
    video_f1_score_majority_1 = [] 
    video_precision_score_majority_1 = [] 
    video_recall_score_majority_1 = []
    video_auc_score_majority = []

    # Performance List Videos
    video_accuracy_score_threshold = [] 
    video_f1_score_threshold_0 = [] 
    video_precision_score_threshold_0 = [] 
    video_recall_score_threshold_0 = []
    video_f1_score_threshold_1 = [] 
    video_precision_score_threshold_1 = [] 
    video_recall_score_threshold_1 = []
    video_auc_score_threshold = []
    
    for i in range(num_folds):
        print(f'\nFold {i+1}/{num_folds}')
        
        # Fold splitting in train and test
        test_video, test_audio, test_labels, test_video_names = data_video_pad[i], data_audio_pad[i], labels_pad[i], names_video_pad[i]
        train_video = np.array([item for idx, fold in enumerate(data_video) if idx != i for item in fold])
        train_audio = np.array([item for idx, fold in enumerate(data_audio) if idx != i for item in fold])
        train_labels = np.array([label for idx, fold in enumerate(labels) if idx != i for label in fold])
        train_video_names = np.array([item for idx, fold in enumerate(names_video) if idx != i for item in fold])
    
        # Features combination
        train_combined = combine_features(train_video, train_audio)
        test_combined = combine_features(test_video, test_audio)

        print(train_combined.shape, train_labels.shape)
        print(test_combined.shape, test_labels.shape)
        
        # Model creation
        model = create_lstm_model(
            input_shape=(train_combined.shape[1], train_combined.shape[2]),
                         hidden_size=128, learning_rate=0.001)
        
        # Model training
        history = model.fit(
            train_combined, train_labels,
            epochs=20,
            batch_size=32,
            verbose=0
        )
        
        # Fold evaluation
        predictions_proba = model.predict(test_combined)
        predictions = (predictions_proba > 0.5).astype(int)
        accuracy = accuracy_score(test_labels, predictions)
        precision = precision_score(test_labels, predictions, zero_division=0, average=None)
        recall = recall_score(test_labels, predictions, zero_division=0, average=None)
        f1 = f1_score(test_labels, predictions, average=None)
        auc = roc_auc_score(test_labels, predictions_proba)
        
        # Saving metrics
        frame_accuracy_score.append(accuracy)
        frame_precision_score_0.append(precision[0])
        frame_recall_score_0.append(recall[0])
        frame_f1_score_0.append(f1[0])
        frame_precision_score_1.append(precision[1])
        frame_recall_score_1.append(recall[1])
        frame_f1_score_1.append(f1[1])
        frame_auc_score.append(auc)

        # Create a dataframe with predictions and video ids
        pred_df = pd.DataFrame({'video_id': test_video_names, 'prediction': predictions_proba.flatten(), 'label': test_labels})

        # Group by video and calculate the average
        video_predictions_mean = pred_df.groupby('video_id')['prediction'].apply(calculate_mean).values
        video_predictions_binary_mean = (video_predictions_mean > 0.5).astype(int)
        
        # Group by video and apply the majority voting rule
        video_predictions_binary_majority = pred_df.groupby('video_id')['prediction'].apply(calculate_majority).values
    
        # Group by video and apply the aggregation rule
        video_predictions_binary_threshold = pred_df.groupby('video_id')['prediction'].apply(aggregate_by_threshold).values    
        
        # Group by video and get labels
        video_labels = pred_df.groupby('video_id')['label'].first().values

        # Calculate metrics for videos
        mean_video_accuracy = accuracy_score(video_labels, video_predictions_binary_mean)
        mean_video_f1 = f1_score(video_labels, video_predictions_binary_mean, zero_division=0, average=None)
        mean_video_precision = precision_score(video_labels, video_predictions_binary_mean, zero_division=0, average=None)
        mean_video_recall = recall_score(video_labels, video_predictions_binary_mean, zero_division=0, average=None)
        mean_video_auc = roc_auc_score(video_labels, video_predictions_mean)

        majority_video_accuracy = accuracy_score(video_labels, video_predictions_binary_majority)
        majority_video_f1 = f1_score(video_labels, video_predictions_binary_majority, zero_division=0, average=None)
        majority_video_precision = precision_score(video_labels, video_predictions_binary_majority, zero_division=0, average=None)
        majority_video_recall = recall_score(video_labels, video_predictions_binary_majority, zero_division=0, average=None)
        majority_video_auc = roc_auc_score(video_labels, video_predictions_binary_majority)

        threshold_video_accuracy = accuracy_score(video_labels, video_predictions_binary_threshold)
        threshold_video_f1 = f1_score(video_labels, video_predictions_binary_threshold, zero_division=0, average=None)
        threshold_video_precision = precision_score(video_labels, video_predictions_binary_threshold, zero_division=0, average=None)
        threshold_video_recall = recall_score(video_labels, video_predictions_binary_threshold, zero_division=0, average=None)
        threshold_video_auc = roc_auc_score(video_labels, video_predictions_binary_threshold)

        # Saving metrics
        video_accuracy_score_mean.append(mean_video_accuracy)
        video_f1_score_mean_0.append(mean_video_f1[0])
        video_precision_score_mean_0.append(mean_video_precision[0])
        video_recall_score_mean_0.append(mean_video_recall[0])
        video_f1_score_mean_1.append(mean_video_f1[1])
        video_precision_score_mean_1.append(mean_video_precision[1])
        video_recall_score_mean_1.append(mean_video_recall[1])
        video_auc_score_mean.append(mean_video_auc)

        video_accuracy_score_majority.append(majority_video_accuracy)
        video_f1_score_majority_0.append(majority_video_f1[0])
        video_precision_score_majority_0.append(majority_video_precision[0])
        video_recall_score_majority_0.append(majority_video_recall[0])
        video_f1_score_majority_1.append(majority_video_f1[1])
        video_precision_score_majority_1.append(majority_video_precision[1])
        video_recall_score_majority_1.append(majority_video_recall[1])
        video_auc_score_majority.append(majority_video_auc)

        video_accuracy_score_threshold.append(threshold_video_accuracy)
        video_f1_score_threshold_0.append(threshold_video_f1[0])
        video_precision_score_threshold_0.append(threshold_video_precision[0])
        video_recall_score_threshold_0.append(threshold_video_recall[0])
        video_f1_score_threshold_1.append(threshold_video_f1[1])
        video_precision_score_threshold_1.append(threshold_video_precision[1])
        video_recall_score_threshold_1.append(threshold_video_recall[1])
        video_auc_score_threshold.append(threshold_video_auc)

        print(f"Accuracy fold (frame-based): {accuracy}")
        print(f"Accuracy fold (video-based-mean): {mean_video_accuracy}")
        print(f"Accuracy fold (video-based-majority): {majority_video_accuracy}")
        print(f"Accuracy fold (video-based-threshold): {threshold_video_accuracy}")
    
    # Metrics average on all frames
    frame_avg_accuracy = np.mean(frame_accuracy_score)
    frame_avg_precision_0, frame_avg_recall_0, frame_avg_f1_0 = np.mean(frame_precision_score_0), np.mean(frame_recall_score_0), np.mean(frame_f1_score_0)
    frame_avg_precision_1, frame_avg_recall_1, frame_avg_f1_1 = np.mean(frame_precision_score_1), np.mean(frame_recall_score_1), np.mean(frame_f1_score_1)
    frame_avg_auc = np.mean(frame_auc_score)

    frame_metrics_dict["Mean Accuracy"].append(frame_avg_accuracy)
    frame_metrics_dict["Mean Precision (0)"].append(frame_avg_precision_0)
    frame_metrics_dict["Mean Precision (1)"].append(frame_avg_precision_1)
    frame_metrics_dict["Mean Recall (0)"].append(frame_avg_recall_0)
    frame_metrics_dict["Mean Recall (1)"].append(frame_avg_recall_1)
    frame_metrics_dict["Mean F1 (0)"].append(frame_avg_f1_0)
    frame_metrics_dict["Mean F1 (1)"].append(frame_avg_f1_1)
    frame_metrics_dict["Mean AUC"].append(frame_avg_auc)

    # Metrics standard deviation on all frames
    frame_std_accuracy = np.std(frame_accuracy_score)
    frame_std_precision_0, frame_std_recall_0, frame_std_f1_0 = np.std(frame_precision_score_0), np.std(frame_recall_score_0), np.std(frame_f1_score_0)
    frame_std_precision_1, frame_std_recall_1, frame_std_f1_1 = np.std(frame_precision_score_1), np.std(frame_recall_score_1), np.std(frame_f1_score_1)
    frame_std_auc = np.std(frame_auc_score)

    frame_metrics_dict["Std Accuracy"].append(frame_std_accuracy)
    frame_metrics_dict["Std Precision (0)"].append(frame_std_precision_0)
    frame_metrics_dict["Std Precision (1)"].append(frame_std_precision_1)
    frame_metrics_dict["Std Recall (0)"].append(frame_std_recall_0)
    frame_metrics_dict["Std Recall (1)"].append(frame_std_recall_1)
    frame_metrics_dict["Std F1 (0)"].append(frame_std_f1_0)
    frame_metrics_dict["Std F1 (1)"].append(frame_std_f1_1)
    frame_metrics_dict["Std AUC"].append(frame_std_auc)
        
    print("\n\nRESULTS on FRAMES:\n")

    print("Mean Accuracy:", frame_avg_accuracy)
    print("Accuracy std:", frame_std_accuracy)

    # Metrics average on all videos
    video_avg_accuracy = np.mean(video_accuracy_score_mean)
    video_avg_precision_0, video_avg_recall_0, video_avg_f1_0 = np.mean(video_precision_score_mean_0), np.mean(video_recall_score_mean_0), np.mean(video_f1_score_mean_0)
    video_avg_precision_1, video_avg_recall_1, video_avg_f1_1 = np.mean(video_precision_score_mean_1), np.mean(video_recall_score_mean_1), np.mean(video_f1_score_mean_1)
    video_avg_auc = np.mean(video_auc_score_mean)

    video_metrics_dict_mean["Mean Accuracy"].append(video_avg_accuracy)
    video_metrics_dict_mean["Mean Precision (0)"].append(video_avg_precision_0)
    video_metrics_dict_mean["Mean Precision (1)"].append(video_avg_precision_1)
    video_metrics_dict_mean["Mean Recall (0)"].append(video_avg_recall_0)
    video_metrics_dict_mean["Mean Recall (1)"].append(video_avg_recall_1)
    video_metrics_dict_mean["Mean F1 (0)"].append(video_avg_f1_0)
    video_metrics_dict_mean["Mean F1 (1)"].append(video_avg_f1_1)
    video_metrics_dict_mean["Mean AUC"].append(video_avg_auc)

    # Metrics standard deviation on all videos
    video_std_accuracy = np.std(video_accuracy_score_mean)
    video_std_precision_0, video_std_recall_0, video_std_f1_0 = np.std(video_precision_score_mean_0), np.std(video_recall_score_mean_0), np.std(video_f1_score_mean_0)
    video_std_precision_1, video_std_recall_1, video_std_f1_1 = np.std(video_precision_score_mean_1), np.std(video_recall_score_mean_1), np.std(video_f1_score_mean_1)
    video_std_auc = np.std(video_auc_score_mean)

    video_metrics_dict_mean["Std Accuracy"].append(video_std_accuracy)
    video_metrics_dict_mean["Std Precision (0)"].append(video_std_precision_0)
    video_metrics_dict_mean["Std Precision (1)"].append(video_std_precision_1)
    video_metrics_dict_mean["Std Recall (0)"].append(video_std_recall_0)
    video_metrics_dict_mean["Std Recall (1)"].append(video_std_recall_1)
    video_metrics_dict_mean["Std F1 (0)"].append(video_std_f1_0)
    video_metrics_dict_mean["Std F1 (1)"].append(video_std_f1_1)
    video_metrics_dict_mean["Std AUC"].append(video_std_auc)
    
    print("\n\nRESULTS on VIDEOS (MEAN):\n")
    
    print("Mean Accuracy:", video_avg_accuracy)
    print("Accuracy std:", video_std_accuracy)

    # Metrics average on all videos
    video_avg_accuracy = np.mean(video_accuracy_score_majority)
    video_avg_precision_0, video_avg_recall_0, video_avg_f1_0 = np.mean(video_precision_score_majority_0), np.mean(video_recall_score_majority_0), np.mean(video_f1_score_majority_0)
    video_avg_precision_1, video_avg_recall_1, video_avg_f1_1 = np.mean(video_precision_score_majority_1), np.mean(video_recall_score_majority_1), np.mean(video_f1_score_majority_1)
    video_avg_auc = np.mean(video_auc_score_majority)

    video_metrics_dict_majority["Mean Accuracy"].append(video_avg_accuracy)
    video_metrics_dict_majority["Mean Precision (0)"].append(video_avg_precision_0)
    video_metrics_dict_majority["Mean Precision (1)"].append(video_avg_precision_1)
    video_metrics_dict_majority["Mean Recall (0)"].append(video_avg_recall_0)
    video_metrics_dict_majority["Mean Recall (1)"].append(video_avg_recall_1)
    video_metrics_dict_majority["Mean F1 (0)"].append(video_avg_f1_0)
    video_metrics_dict_majority["Mean F1 (1)"].append(video_avg_f1_1)
    video_metrics_dict_majority["Mean AUC"].append(video_avg_auc)

    # Metrics standard deviation on all videos
    video_std_accuracy = np.std(video_accuracy_score_majority)
    video_std_precision_0, video_std_recall_0, video_std_f1_0 = np.std(video_precision_score_majority_0), np.std(video_recall_score_majority_0), np.std(video_f1_score_majority_0)
    video_std_precision_1, video_std_recall_1, video_std_f1_1 = np.std(video_precision_score_majority_1), np.std(video_recall_score_majority_1), np.std(video_f1_score_majority_1)
    video_std_auc = np.std(video_auc_score_majority)

    video_metrics_dict_majority["Std Accuracy"].append(video_std_accuracy)
    video_metrics_dict_majority["Std Precision (0)"].append(video_std_precision_0)
    video_metrics_dict_majority["Std Precision (1)"].append(video_std_precision_1)
    video_metrics_dict_majority["Std Recall (0)"].append(video_std_recall_0)
    video_metrics_dict_majority["Std Recall (1)"].append(video_std_recall_1)
    video_metrics_dict_majority["Std F1 (0)"].append(video_std_f1_0)
    video_metrics_dict_majority["Std F1 (1)"].append(video_std_f1_1)
    video_metrics_dict_majority["Std AUC"].append(video_std_auc)
    
    print("\n\nRESULTS on VIDEOS (MAJORITY):\n")
    
    print("Mean Accuracy:", video_avg_accuracy)
    print("Accuracy std:", video_std_accuracy)

    # Metrics average on all videos
    video_avg_accuracy = np.mean(video_accuracy_score_threshold)
    video_avg_precision_0, video_avg_recall_0, video_avg_f1_0 = np.mean(video_precision_score_threshold_0), np.mean(video_recall_score_threshold_0), np.mean(video_f1_score_threshold_0)
    video_avg_precision_1, video_avg_recall_1, video_avg_f1_1 = np.mean(video_precision_score_threshold_1), np.mean(video_recall_score_threshold_1), np.mean(video_f1_score_threshold_1)
    video_avg_auc = np.mean(video_auc_score_threshold)

    video_metrics_dict_threshold["Mean Accuracy"].append(video_avg_accuracy)
    video_metrics_dict_threshold["Mean Precision (0)"].append(video_avg_precision_0)
    video_metrics_dict_threshold["Mean Precision (1)"].append(video_avg_precision_1)
    video_metrics_dict_threshold["Mean Recall (0)"].append(video_avg_recall_0)
    video_metrics_dict_threshold["Mean Recall (1)"].append(video_avg_recall_1)
    video_metrics_dict_threshold["Mean F1 (0)"].append(video_avg_f1_0)
    video_metrics_dict_threshold["Mean F1 (1)"].append(video_avg_f1_1)
    video_metrics_dict_threshold["Mean AUC"].append(video_avg_auc)

    # Metrics standard deviation on all videos
    video_std_accuracy = np.std(video_accuracy_score_threshold)
    video_std_precision_0, video_std_recall_0, video_std_f1_0 = np.std(video_precision_score_threshold_0), np.std(video_recall_score_threshold_0), np.std(video_f1_score_threshold_0)
    video_std_precision_1, video_std_recall_1, video_std_f1_1 = np.std(video_precision_score_threshold_1), np.std(video_recall_score_threshold_1), np.std(video_f1_score_threshold_1)
    video_std_auc = np.std(video_auc_score_threshold)

    video_metrics_dict_threshold["Std Accuracy"].append(video_std_accuracy)
    video_metrics_dict_threshold["Std Precision (0)"].append(video_std_precision_0)
    video_metrics_dict_threshold["Std Precision (1)"].append(video_std_precision_1)
    video_metrics_dict_threshold["Std Recall (0)"].append(video_std_recall_0)
    video_metrics_dict_threshold["Std Recall (1)"].append(video_std_recall_1)
    video_metrics_dict_threshold["Std F1 (0)"].append(video_std_f1_0)
    video_metrics_dict_threshold["Std F1 (1)"].append(video_std_f1_1)
    video_metrics_dict_threshold["Std AUC"].append(video_std_auc)
    
    print("\n\nRESULTS on VIDEOS (THRESHOLD):\n")
    
    print("Mean Accuracy:", video_avg_accuracy)
    print("Accuracy std:", video_std_accuracy)

In [None]:
# Run the 4-fold cross-validation on the early-fusion video/audio inputs.
# The first four arguments are the metric accumulators that are printed below
# (presumably filled in by the function -- confirm against its definition).
cross_validation_video_audio_combined_2(
    frame_metrics_cross,
    video_metrics_cross_mean,
    video_metrics_cross_majority,
    video_metrics_cross_threshold,
    data_video_cross_early,
    data_video_cross_pad_early,
    data_audio_cross_early,
    data_audio_cross_pad_early,
    labels_video_cross_early,
    labels_video_cross_pad_early,
    names_video_cross_early,
    names_video_cross_pad_early,
    num_folds=4,
)

# Single column header shared by every results table of this run.
_fusion_label = f"EARLY FUSION {sequence_length}-{num_segments}"

# (banner, accumulator) pairs, in the order the tables are printed.
_cv_reports = (
    ("***** FRAME LEVEL RESULTS *****\n", frame_metrics_cross),
    ("***** VIDEO LEVEL RESULTS (MEAN) *****\n", video_metrics_cross_mean),
    ("***** VIDEO LEVEL RESULTS (MAJORITY) *****\n", video_metrics_cross_majority),
    ("***** VIDEO LEVEL RESULTS (THRESHOLD) *****\n", video_metrics_cross_threshold),
)

# One row per metric key, one column for this configuration.
# `df` stays bound to the last table (threshold) once the loop finishes.
for _banner, _collected in _cv_reports:
    df = pd.DataFrame.from_dict(_collected, orient='index', columns=[_fusion_label])
    print(_banner)
    print(df)

# Rebind every accumulator to a fresh dict with the same keys and new empty
# lists, so a later run starts from empty accumulators.
frame_metrics_cross = {metric_name: [] for metric_name in frame_metrics_cross}
video_metrics_cross_mean = {metric_name: [] for metric_name in video_metrics_cross_mean}
video_metrics_cross_majority = {metric_name: [] for metric_name in video_metrics_cross_majority}
video_metrics_cross_threshold = {metric_name: [] for metric_name in video_metrics_cross_threshold}