## 1. Non-specific error prediction

### 1.1. Results table (all models)

In [None]:
#Create a single dataframe that contains all the results
#The dataframe will contain as indices the data types input to each model
#The columns will be model name, F1 (Window), Accuracy (Window), Jaccard (Window), F1 (Frame), Accuracy (Frame), Jaccard (Frame), Training Time (s), Inference Time (ms per frame/window)

import pandas as pd
import os

results_df = pd.DataFrame(columns=[
    'Input',
    'Model Name',
    'F1 (Window)',
    'Accuracy (Window)',
    'Jaccard (Window)',
    'F1 (Frame)',
    'Accuracy (Frame)',
    'Jaccard (Frame)',
    'Training Time (s)',
    'Inference Rate (ms per frame/window)'
])

frequency = 5
results_folder = '../results/'
inputs = ['video', 'kinematics', 'multimodal']
model_names_dict = {'SimpleCNN': 'CNN',
                    'SimpleLSTM': 'LSTM',
                    'Siamese_CNN': 'Siamese CNN',
                    'Siamese_LSTM': 'Siamese LSTM',
                    'TeCNo': 'TeCNo',
                    'TransSVNet': 'TransSVNet',
                    'COG': 'COG'}


model_frame_or_window = {'SimpleCNN': 'window',
                         'SimpleLSTM': 'window',
                        'Siamese_CNN': 'window',
                        'Siamese_LSTM': 'window',
                        'TeCNo': 'frame',
                        'TransSVNet': 'frame',
                        'COG': 'frame'} 

csv_filename = 'LOSO_summary_df.csv'
csv_filename_2 = 'LOSO_summary_df_window.csv'

for input in inputs:
    
    print(f'Processing input: {input}')
    input_results_folder = os.path.join(results_folder, input, f'{frequency}Hz')
    if not os.path.exists(input_results_folder):
        continue
    
    for model_name in model_names_dict.keys():
        print(f'Processing model: {model_name}')
        model_folder = os.path.join(input_results_folder, model_name)
        if not os.path.exists(model_folder):
            continue
        
        # Load the CSV files
        csv_file = os.path.join(model_folder, csv_filename)
        
        if model_frame_or_window[model_name] == 'frame': #in this case, predictions were windowed for a fair comparison
            csv_file_2 = os.path.join(model_folder, csv_filename_2)
            if os.path.exists(csv_file):

                #Load frame level predictions
                df = pd.read_csv(csv_file)
                df['Input'] = input
                df['Model Name'] = model_names_dict[model_name]
                df['F1 (Frame)'] = df.iloc[1:]['F1'].astype(str)    
                df['Accuracy (Frame)'] = df.iloc[1:]['Accuracy'].astype(str)
                df['Jaccard (Frame)'] = df.iloc[1:]['Jaccard'].astype(str)
                df['Training Time (s)'] = df.iloc[0]['Train Time']
                df['Inference Rate (ms per frame/window)'] = df.iloc[1:]['Inference Rate'].astype(str)

                #Delete F1, Accuracy, Jaccard columns from the original dataframe
                df = df.drop(columns=['F1', 'Accuracy', 'Jaccard', 'Train Time', 'Inference Rate'])

                #Delete first row of df
                df = df.iloc[1:].reset_index(drop=True)

                #Add Window level predictions
                df2 = pd.read_csv(csv_file_2)
                df['F1 (Window)'] = df2['F1'].astype(str)
                df['Accuracy (Window)'] = df2['Accuracy'].astype(str)
                df['Jaccard (Window)'] = df2['Jaccard'].astype(str)

                #Concatenate the dataframes
                results_df = pd.concat([results_df, df], ignore_index=True)
        
        else:
            #Only window level predictions
            if os.path.exists(csv_file):
                df = pd.read_csv(csv_file)
                df['Input'] = input
                df['Model Name'] = model_names_dict[model_name]
                df['F1 (Window)'] = df.iloc[1:]['F1'].astype(str)
                df['Accuracy (Window)'] = df.iloc[1:]['Accuracy'].astype(str)
                df['Jaccard (Window)'] = df.iloc[1:]['Jaccard'].astype(str)
                df['Training Time (s)'] = df.iloc[0]['Train Time']  
                df['Inference Rate (ms per frame/window)'] = df.iloc[1:]['Inference Rate'].astype(str)  
                
                #Delete F1, Accuracy, Jaccard columns from the original dataframe
                df = df.drop(columns=['F1', 'Accuracy', 'Jaccard', 'Train Time', 'Inference Rate']) 

                #Delete first row of df
                df = df.iloc[1:].reset_index(drop=True)

                #Add Frame level predictions as NaN
                df['F1 (Frame)'] = pd.NA
                df['Accuracy (Frame)'] = pd.NA
                df['Jaccard (Frame)'] = pd.NA

                #Concatenate the dataframes
                results_df = pd.concat([results_df, df], ignore_index=True)
        
        
display(results_df)

#Save the results dataframe to a CSV file
output_csv = 'LOSO_summary_results_all.csv'
file_path = os.path.join(results_folder, output_csv)
results_df.to_csv(file_path, index=False)

Processing input: video
Processing model: SimpleCNN
Processing model: SimpleLSTM
Processing model: Siamese_CNN
Processing model: Siamese_LSTM
Processing model: TeCNo
Processing model: TransSVNet
Processing model: COG
Processing input: kinematics
Processing model: SimpleCNN
Processing model: SimpleLSTM
Processing model: Siamese_CNN
Processing model: Siamese_LSTM
Processing model: TeCNo
Processing model: TransSVNet
Processing model: COG
Processing input: multimodal
Processing model: SimpleCNN
Processing model: SimpleLSTM
Processing model: Siamese_CNN
Processing model: Siamese_LSTM
Processing model: TeCNo
Processing model: TransSVNet
Processing model: COG


Unnamed: 0,Input,Model Name,F1 (Window),Accuracy (Window),Jaccard (Window),F1 (Frame),Accuracy (Frame),Jaccard (Frame),Training Time (s),Inference Rate (ms per frame/window)
0,video,CNN,0.711 ± 0.047,0.676 ± 0.029,0.554 ± 0.058,,,,0.31 ± 0.01,0.86 ± 0.02
1,video,LSTM,0.670 ± 0.041,0.673 ± 0.024,0.537 ± 0.045,,,,0.95 ± 0.05,0.93 ± 0.06
2,video,Siamese CNN,0.674 ± 0.038,0.661 ± 0.028,0.510 ± 0.045,,,,4.41 ± 0.18,1.32 ± 0.01
3,video,Siamese LSTM,0.701 ± 0.021,0.667 ± 0.016,0.540 ± 0.025,,,,6.04 ± 0.10,1.65 ± 0.02
4,video,TeCNo,0.709 ± 0.042,0.673 ± 0.026,0.551 ± 0.052,0.684 ± 0.038,0.669 ± 0.025,0.521 ± 0.045,2.38 ± 0.12,67.95 ± 13.71
5,video,TransSVNet,0.681 ± 0.064,0.647 ± 0.046,0.520 ± 0.075,0.662 ± 0.058,0.647 ± 0.038,0.497 ± 0.067,15.35 ± 0.44,1.60 ± 0.20
6,video,COG,0.706 ± 0.057,0.666 ± 0.031,0.548 ± 0.066,0.687 ± 0.051,0.667 ± 0.025,0.525 ± 0.058,25.07 ± 0.56,1.95 ± 0.25
7,kinematics,CNN,0.700 ± 0.039,0.631 ± 0.033,0.539 ± 0.046,,,,0.33 ± 0.01,0.98 ± 0.06
8,kinematics,LSTM,0.69 ± 0.06,0.63 ± 0.02,0.54 ± 0.07,,,,0.30 ± 0.02,1.17 ± 0.07
9,kinematics,Siamese CNN,0.63 ± 0.03,0.58 ± 0.02,0.46 ± 0.04,,,,3.61 ± 0.04,1.42 ± 0.01


In [6]:
#Do the same but only with window level predictions
output_csv_window = 'LOSO_summary_results_window.csv'
file_path_window = os.path.join(results_folder, output_csv_window)
results_df_window = results_df.drop(columns=['F1 (Frame)', 'Accuracy (Frame)', 'Jaccard (Frame)'])
display(results_df_window)
results_df_window.to_csv(file_path_window, index=False)
print(f'Results saved to {file_path}')

Unnamed: 0,Input,Model Name,F1 (Window),Accuracy (Window),Jaccard (Window),Training Time (s),Inference Rate (ms per frame/window)
0,video,CNN,0.711 ± 0.047,0.676 ± 0.029,0.554 ± 0.058,0.31 ± 0.01,0.86 ± 0.02
1,video,LSTM,0.670 ± 0.041,0.673 ± 0.024,0.537 ± 0.045,0.95 ± 0.05,0.93 ± 0.06
2,video,Siamese CNN,0.674 ± 0.038,0.661 ± 0.028,0.510 ± 0.045,4.41 ± 0.18,1.32 ± 0.01
3,video,Siamese LSTM,0.701 ± 0.021,0.667 ± 0.016,0.540 ± 0.025,6.04 ± 0.10,1.65 ± 0.02
4,video,TeCNo,0.709 ± 0.042,0.673 ± 0.026,0.551 ± 0.052,2.38 ± 0.12,67.95 ± 13.71
5,video,TransSVNet,0.681 ± 0.064,0.647 ± 0.046,0.520 ± 0.075,15.35 ± 0.44,1.60 ± 0.20
6,video,COG,0.706 ± 0.057,0.666 ± 0.031,0.548 ± 0.066,25.07 ± 0.56,1.95 ± 0.25
7,kinematics,CNN,0.700 ± 0.039,0.631 ± 0.033,0.539 ± 0.046,0.33 ± 0.01,0.98 ± 0.06
8,kinematics,LSTM,0.69 ± 0.06,0.63 ± 0.02,0.54 ± 0.07,0.30 ± 0.02,1.17 ± 0.07
9,kinematics,Siamese CNN,0.63 ± 0.03,0.58 ± 0.02,0.46 ± 0.04,3.61 ± 0.04,1.42 ± 0.01


Results saved to ../results/LOSO_summary_results_all.csv


In [7]:
#Do the same but only with frame level predictions
output_csv_frame = 'LOSO_summary_results_frame.csv'
file_path_frame = os.path.join(results_folder, output_csv_frame)
results_df_frame = results_df.drop(columns=['F1 (Window)', 'Accuracy (Window)', 'Jaccard (Window)', 'Inference Rate (ms per frame/window)'])

#Also drop non-frame models (CNN, LSTM, Siamese CNN, Siamese LSTM)
results_df_frame = results_df_frame[~results_df_frame['Model Name'].isin(['CNN', 'LSTM', 'Siamese CNN', 'Siamese LSTM'])]

display(results_df_frame)
results_df_frame.to_csv(file_path_frame, index=False)
print(f'Results saved to {file_path_frame}')

Unnamed: 0,Input,Model Name,F1 (Frame),Accuracy (Frame),Jaccard (Frame),Training Time (s)
4,video,TeCNo,0.684 ± 0.038,0.669 ± 0.025,0.521 ± 0.045,2.38 ± 0.12
5,video,TransSVNet,0.662 ± 0.058,0.647 ± 0.038,0.497 ± 0.067,15.35 ± 0.44
6,video,COG,0.687 ± 0.051,0.667 ± 0.025,0.525 ± 0.058,25.07 ± 0.56
11,kinematics,TeCNo,0.679 ± 0.030,0.669 ± 0.029,0.514 ± 0.035,2.19 ± 0.04
12,kinematics,TransSVNet,0.630 ± 0.026,0.611 ± 0.011,0.461 ± 0.027,15.15 ± 0.33
13,kinematics,COG,0.636 ± 0.076,0.651 ± 0.040,0.471 ± 0.086,25.07 ± 0.56
18,multimodal,TeCNo,0.678 ± 0.039,0.668 ± 0.023,0.514 ± 0.046,2.49 ± 0.05
19,multimodal,TransSVNet,0.679 ± 0.041,0.655 ± 0.022,0.515 ± 0.048,15.49 ± 0.31
20,multimodal,COG,0.709 ± 0.033,0.672 ± 0.027,0.550 ± 0.040,25.07 ± 0.56


Results saved to ../results/LOSO_summary_results_frame.csv


### 1.2. Compute specific error prediction performance with best non-specific models

#### a. SimpleCNN + video data

In [27]:
import mlflow
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from MED.modeling.modeling_utils import retrieve_results_mlflow, window_predictions
from MED.dataset.dataset_utils import compute_window_size_stride
from sklearn.metrics import f1_score, accuracy_score, jaccard_score
import numpy as np
import pandas as pd

def compute_metrics(error_dict: dict,
                    all_labels: np.ndarray, 
                    all_preds: np.ndarray,
                    outs: list,
                    all_gest_labels: np.ndarray = None,
                    all_subjects: np.ndarray = None,
                    frame2window=False) -> pd.DataFrame:

    """
    Compute error-type specific F1, Accuracy and Jaccard scores.
    Args:
        labels (np.ndarray): True labels for the specific error type.
        preds (np.ndarray): Predicted labels for the specific error type.
    Returns:
        tuple: F1, Accuracy and Jaccard scores.
    """

    if not frame2window:

        results_df_index = [error for error in error_dict.keys() if error != 'global']
        results_df = pd.DataFrame(columns=['F1', 'Accuracy', 'Jaccard'], index=results_df_index)

        for error, error_position in error_dict.items():

            f1_scores = []
            acc_scores = []
            jaccard_scores = []
            samples = []
            
            #Compute F1, Accuracy and Jaccard for each fold
            for out in outs:

                labels_specific = np.array(all_labels[out])
                labels_specific = labels_specific[:, error_position]  # Get the specific error type labels
                preds = np.array(all_preds[out])

                f1 = f1_score(labels_specific, preds, average='binary')
                acc = accuracy_score(labels_specific, preds)
                jaccard = jaccard_score(labels_specific, preds, average='binary')

                f1_scores.append(f1)
                acc_scores.append(acc)
                jaccard_scores.append(jaccard)
                samples.append(len(labels_specific))

            #Fill df
            results_df.loc[error, 'F1'] = f"{np.average(f1_scores, weights=samples):.3f} ± {np.sqrt(np.average((f1_scores - np.average(f1_scores, weights=samples)) ** 2, weights=samples)):.3f}"
            results_df.loc[error, 'Accuracy'] = f"{np.average(acc_scores, weights=samples):.3f} ± {np.sqrt(np.average((acc_scores - np.average(acc_scores, weights=samples)) ** 2, weights=samples)):.3f}"
            results_df.loc[error, 'Jaccard'] = f"{np.average(jaccard_scores, weights=samples):.3f} ± {np.sqrt(np.average((jaccard_scores - np.average(jaccard_scores, weights=samples)) ** 2, weights=samples)):.3f}"


    else:
        window_size, stride = compute_window_size_stride(frequency=5)
        results_df = pd.DataFrame(columns=['F1', 'Accuracy', 'Jaccard'])

        for error, error_position in error_dict.items():
            f1_scores = []
            acc_scores = []
            jaccard_scores = []
            samples = []
            
            for out in outs:
                labels_specific = np.array(all_labels[out])
                labels_specific = labels_specific[:, error_position]
                preds = np.array(all_preds[out])

                #Window predictions
                predictions_windows, e_labels_windows, gestures_windows, subjects_windows = window_predictions(predictions = preds,
                    e_labels = labels_specific,
                    gestures = np.array(all_gest_labels[out]),
                    subjects = np.array(all_subjects[out]),
                    window_size= window_size, stride=stride)

                predictions_windows = np.array(predictions_windows)
                e_labels_windows = np.array(e_labels_windows)

                f1 = f1_score(e_labels_windows, predictions_windows, average='binary')
                acc = accuracy_score(e_labels_windows, predictions_windows)
                jaccard = jaccard_score(e_labels_windows, predictions_windows, average='binary')

                f1_scores.append(f1)
                acc_scores.append(acc)
                jaccard_scores.append(jaccard)
                samples.append(len(e_labels_windows))
            
            #Fill df
            results_df.loc[error, 'F1'] = f"{np.average(f1_scores, weights=samples):.3f} ± {np.sqrt(np.average((f1_scores - np.average(f1_scores, weights=samples)) ** 2, weights=samples)):.3f}"
            results_df.loc[error, 'Accuracy'] = f"{np.average(acc_scores, weights=samples):.3f} ± {np.sqrt(np.average((acc_scores - np.average(acc_scores, weights=samples)) ** 2, weights=samples)):.3f}"
            results_df.loc[error, 'Jaccard'] = f"{np.average(jaccard_scores, weights=samples):.3f} ± {np.sqrt(np.average((jaccard_scores - np.average(jaccard_scores, weights=samples)) ** 2, weights=samples)):.3f}"

    return results_df

In [12]:
#a. SimpleCNN + video data

error_dict = {
        'Out_Of_View': 0,
        'Needle_Drop': 1,
        'Multiple_Attempts': 2,
        'Needle_Position': 3,
        'global': -1
    }

run_id_video = "705d4490b0a642e1a9c231fdc0eb3bdf"
outs = ['1Out', '2Out', '3Out', '4Out', '5Out']
setting = 'LOSO'

exp_kwargs = {
    'model_name': "SimpleCNN",
    'dataset_type': "frame",
    'compute_from_str': True,
    'save_local': False}

#Retrieve results for video data
video_f1_train, video_f1_test, video_acc_train, video_acc_test, video_jaccard_train, video_jaccard_test, video_cm_train, video_cm_test, \
                test_all_preds_video, test_all_probs_video, test_all_labels_video, test_all_labels_specific_video,test_all_gest_labels_video, test_all_subjects_video = retrieve_results_mlflow(outs=outs,
                                                                                                                   setting=setting,
                                                                                                                   exp_kwargs=exp_kwargs,
                                                                                                                   run_id=run_id_video)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
#Compute number of windows that have more than 1 specific error type
for out in outs:
    print(len(test_all_labels_video[out]), len(test_all_labels_specific_video[out]))
    counter = 0
    counter_out_of_view = 0
    for i in range(len(test_all_labels_video[out])):
        if test_all_labels_video[out][i][:-1].sum() > 1:
            #print(f'Specific error types: {test_all_labels_specific_video[out][i]}')
            #print(f'Window {i} has more than 1 specific error type: {test_all_labels_video[out][i]}')
            #print(f'Gestures: {test_all_gest_labels_video[out][i]}')
            #print(f'Subject: {test_all_subjects_video[out][i]}')
            #print('---')
            counter += 1
            if test_all_labels_video[out][i][error_dict['Out_Of_View']] == 1:
                counter_out_of_view += 1

    print(f'Total number of windows with more than 1 specific error type: {counter}')
    print(f'Total number of windows with Out Of View error: {counter_out_of_view}')
    print('\n')

1078 1078
Total number of windows with more than 1 specific error type: 112
Total number of windows with Out Of View error: 35


776 776
Total number of windows with more than 1 specific error type: 59
Total number of windows with Out Of View error: 34


888 888
Total number of windows with more than 1 specific error type: 87
Total number of windows with Out Of View error: 56


870 870
Total number of windows with more than 1 specific error type: 48
Total number of windows with Out Of View error: 19


640 640
Total number of windows with more than 1 specific error type: 30
Total number of windows with Out Of View error: 17




In [14]:
results_df_video = compute_metrics(error_dict, test_all_labels_video, test_all_preds_video, outs)
display(results_df_video)

  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_l

Unnamed: 0,F1,Accuracy,Jaccard
Out_Of_View,0.391 ± 0.045,0.557 ± 0.064,0.244 ± 0.036
Needle_Drop,0.025 ± 0.018,0.452 ± 0.062,0.013 ± 0.009
Multiple_Attempts,0.541 ± 0.061,0.566 ± 0.016,0.373 ± 0.058
Needle_Position,0.214 ± 0.069,0.499 ± 0.034,0.122 ± 0.044
global,0.711 ± 0.047,0.676 ± 0.029,0.554 ± 0.058


#### b. Simple CNN + kinematics

In [15]:
#b. SimpleCNN + kinematics data
run_id_kinematics = "f4c962c1b73045d2aa1860a16abbac17"
kinematics_f1_train, kinematics_f1_test, kinematics_acc_train, kinematics_acc_test, kinematics_jaccard_train, kinematics_jaccard_test, \
        kinematics_cm_train, kinematics_cm_test, test_all_preds_kinematics, test_all_probs_kinematics, \
        test_all_labels_kinematics, test_all_labels_specific_kinematics, test_all_gest_labels_kinematics, test_all_subjects_kinematics = retrieve_results_mlflow(outs=outs,
                                                                                                                                            setting=setting,
                                                                                                                                            exp_kwargs=exp_kwargs,
                                                                                                                                            run_id=run_id_kinematics)

results_df_kinematics = compute_metrics(error_dict, test_all_labels_kinematics, test_all_preds_kinematics, outs)
display(results_df_kinematics)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_labels[out])
  labels_specific = np.array(all_l

Unnamed: 0,F1,Accuracy,Jaccard
Out_Of_View,0.325 ± 0.032,0.443 ± 0.056,0.195 ± 0.023
Needle_Drop,0.018 ± 0.014,0.350 ± 0.048,0.009 ± 0.007
Multiple_Attempts,0.560 ± 0.050,0.539 ± 0.024,0.391 ± 0.050
Needle_Position,0.170 ± 0.056,0.387 ± 0.032,0.094 ± 0.034
global,0.700 ± 0.039,0.631 ± 0.033,0.539 ± 0.046


#### c. COG + multimodal data

In [28]:
#c. COG + multimodal data

run_id = "fc159a057aba46ce81547822fdb282b0"
outs = ['1Out', '2Out', '3Out', '4Out', '5Out']
setting = 'LOSO'
exp_kwargs = {
    'model_name': "COG",
    'dataset_type': "frame",
    'compute_from_str': False,
    'save_local': False}

error_dict = {
    'Out_Of_View': 0,
    'Needle_Drop': 1,
    'Multiple_Attempts': 2,
    'Needle_Position': 3,
    'global': -1
}

#Frame-level predictions
multimodal_f1_train, multimodal_f1_test, multimodal_acc_train, multimodal_acc_test, multimodal_jaccard_train, multimodal_jaccard_test, \
    multimodal_cm_train, multimodal_cm_test, test_all_preds_multimodal, test_all_probs_multimodal, \
    test_all_labels_multimodal, test_all_labels_specific_multimodal, test_all_gest_labels_multimodal, test_all_subjects_multimodal = retrieve_results_mlflow(outs=outs,
                                                                                    setting=setting,
                                                                                    exp_kwargs=exp_kwargs,  
                                                                                    run_id=run_id)

results_df_multimodal_frame = compute_metrics(error_dict=error_dict,
                                        all_labels=test_all_labels_multimodal,
                                        all_preds=test_all_preds_multimodal,
                                        outs=outs,
                                        frame2window=False)

#Window-level predictions
results_df_multimodal_window = compute_metrics(error_dict=error_dict, 
                                        all_labels=test_all_labels_multimodal, 
                                        all_preds=test_all_preds_multimodal, 
                                        outs=outs,
                                        all_gest_labels=test_all_gest_labels_multimodal,
                                        all_subjects=test_all_subjects_multimodal,
                                        frame2window=True)
display(results_df_multimodal_frame)
display(results_df_multimodal_window)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

  predictions_windows = np.array(predictions_windows)
  e_labels_windows = np.array(e_labels_windows)
  predictions_windows = np.array(predictions_windows)
  e_labels_windows = np.array(e_labels_windows)
  predictions_windows = np.array(predictions_windows)
  e_labels_windows = np.array(e_labels_windows)
  predictions_windows = np.array(predictions_windows)
  e_labels_windows = np.array(e_labels_windows)
  predictions_windows = np.array(predictions_windows)
  e_labels_windows = np.array(e_labels_windows)
  predictions_windows = np.array(predictions_windows)
  e_labels_windows = np.array(e_labels_windows)
  predictions_windows = np.array(predictions_windows)
  e_labels_windows = np.array(e_labels_windows)
  predictions_windows = np.array(predictions_windows)
  e_labels_windows = np.array(e_labels_windows)
  predictions_windows = np.array(predictions_windows)
  e_labels_windows = np.array(e_labels_windows)
  predictions_windows = np.array(predictions_windows)
  e_labels_windows = np.arra

Unnamed: 0,F1,Accuracy,Jaccard
Out_Of_View,0.365 ± 0.038,0.512 ± 0.054,0.224 ± 0.029
Needle_Drop,0.018 ± 0.014,0.408 ± 0.044,0.009 ± 0.007
Multiple_Attempts,0.531 ± 0.062,0.560 ± 0.026,0.364 ± 0.057
Needle_Position,0.207 ± 0.057,0.461 ± 0.039,0.117 ± 0.036
global,0.709 ± 0.033,0.672 ± 0.027,0.550 ± 0.040


Unnamed: 0,F1,Accuracy,Jaccard
Out_Of_View,0.357 ± 0.039,0.484 ± 0.055,0.218 ± 0.029
Needle_Drop,0.020 ± 0.015,0.376 ± 0.042,0.010 ± 0.008
Multiple_Attempts,0.573 ± 0.053,0.562 ± 0.025,0.403 ± 0.051
Needle_Position,0.208 ± 0.066,0.434 ± 0.040,0.118 ± 0.042
global,0.733 ± 0.033,0.679 ± 0.029,0.580 ± 0.042


## 2. T-test comparison of model results

In [18]:
import sys
import copy
import mlflow
import numpy as np
import pandas as pd
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from MED.modeling.modeling_utils import frame2window, retrieve_results_mlflow

from sklearn.metrics import f1_score, accuracy_score, jaccard_score

def compute_window_metrics(outs: list,
                 test_all_preds: dict,
                 test_all_labels: dict,
                 test_all_gest_labels: dict,
                 test_all_subjects: dict,
                 window_size: int = 10,
                 stride: int = 6,
                 binary: bool = True) -> tuple:

    """
    Window frame-level predictions and compute metrics for window level predictions.
    Args:
        windowed_preds (dict): Dictionary of window level predictions for each output type.
        windowed_labels (dict): Dictionary of window level labels for each output type.
        windowed_gest_labels (dict): Dictionary of window level gesture labels for each output type.
        windowed_subjects (dict): Dictionary of window level subjects for each output type.

    Returns:
        tuple: A tuple containing the F1 score, accuracy, Jaccard index, and confusion matrix for each output type.

    """

    #Convert frame level predictions to window level predictions
    windowed_preds, windowed_labels, windowed_gest_labels, windowed_subjects = frame2window(
        outs,
        test_all_preds,
        test_all_labels,
        test_all_gest_labels,
        test_all_subjects,
        window_size=window_size,
        stride=stride,
        binary=binary
    )

    window_f1_scores = []
    window_acc_scores = []
    window_jaccard_scores = []
    samples_test = []

    for out in windowed_preds:
        
        if out in windowed_preds:
            preds = windowed_preds[out].numpy().flatten()
            labels = windowed_labels[out].numpy().flatten()

            #Compute metrics
            if binary:
                f1 = f1_score(labels, preds, average='binary')
                jaccard = jaccard_score(labels, preds, average='binary')

            else:
                f1 = f1_score(labels, preds, average='weighted')
                jaccard = jaccard_score(labels, preds, average='weighted')
            
            acc = accuracy_score(labels, preds)

            window_f1_scores.append(f1)
            window_acc_scores.append(acc)
            window_jaccard_scores.append(jaccard)

        samples_test.append(len(windowed_preds[out]))

    return window_f1_scores, window_acc_scores, window_jaccard_scores


def load_f1_results(run_id_dict: dict,
                    results_dict: dict,
                    data_type: str,
                    metric: str = "F1",
                    outs:list = ['1Out', '2Out', '3Out', '4Out', '5Out'],
                    per_fold = True) -> dict:
    
    """
        Load F1 results from MLflow for a given set of run IDs and store them in a results dictionary.

        Args:
            run_id_dict (dict): Dictionary mapping model names to their run IDs.
            results_dict (dict): Dictionary to store the results.
            metric (str): The metric to load (e.g., "F1", "Accuracy", "Jaccard").

        Returns:
            dict: Updated results dictionary with loaded F1 results.
    """

    #Iterate over models
    for model_name, run_id in run_id_dict.items():

        print(f"Loading {model_name}...")
        exp_kwargs = {"model_name": model_name,
                      "save_local": False}
        
        if model_name in ['SimpleCNN', 'SimpleLSTM', 'Siamese_CNN', 'Siamese_LSTM']:
            exp_kwargs['dataset_type'] = 'window'  
            exp_kwargs['compute_from_str'] = True 
            if model_name == "Siamese_CNN" or model_name == "Siamese_LSTM":
                samples_test = [1, 1, 1, 1, 1]
            else:
                samples_test = [1078, 776, 888, 870, 640]
            LOSO_f1_train, LOSO_f1_test, LOSO_acc_train, LOSO_acc_test, LOSO_jaccard_train, \
            LOSO_jaccard_test, LOSO_cm_train, LOSO_cm_test= retrieve_results_mlflow(outs=['1Out', '2Out', '3Out', '4Out', '5Out'],
                                                                                    setting="LOSO",
                                                                                    exp_kwargs=exp_kwargs,
                                                                                    run_id=run_id)
            
        
        else:
            exp_kwargs['dataset_type'] = 'frame'
            samples_test = []
            if model_name == 'COG': 
                exp_kwargs['compute_from_str'] = False
            else:  
                exp_kwargs['compute_from_str'] = True
            LOSO_f1_train, LOSO_f1_test, LOSO_acc_train, LOSO_acc_test, LOSO_jaccard_train, LOSO_jaccard_test, \
            LOSO_cm_train, LOSO_cm_test, test_all_preds, test_all_probs, test_all_labels, test_all_labels_specific, \
            test_all_gest_labels, test_all_subjects = retrieve_results_mlflow(outs=['1Out', '2Out', '3Out', '4Out', '5Out'],
                                                                                    setting="LOSO",
                                                                                    exp_kwargs=exp_kwargs,
                                                                                    run_id=run_id)
            
            #Window predictions
            if model_name == 'COG' and data_type == 'multimodal':
                LOSO_f1_test, LOSO_acc_test, LOSO_jaccard_test = compute_window_metrics(outs,
                                                                                    test_all_preds,
                                                                                    test_all_labels_specific,
                                                                                    test_all_gest_labels,
                                                                                    test_all_subjects,
                                                                                    window_size=10,
                                                                                    stride=6,
                                                                                    binary=True)
            
            else:
                LOSO_f1_test, LOSO_acc_test, LOSO_jaccard_test = compute_window_metrics(outs,
                                                                                    test_all_preds,
                                                                                    test_all_labels,
                                                                                    test_all_gest_labels,
                                                                                    test_all_subjects,
                                                                                    window_size=10,
                                                                                    stride=6,
                                                                                    binary=True)
        if per_fold:
            #Store in dictionary
            if metric == "F1":
                results_dict[model_name] = np.array(LOSO_f1_test)

            elif metric == "Accuracy":
                results_dict[model_name] = np.array(LOSO_acc_test)

            elif metric == "Jaccard":
                results_dict[model_name] = np.array(LOSO_jaccard_test)

        else:
            if samples_test == []:
                samples_test = [len(samples) for samples in test_all_subjects]
            if metric == "F1":
                results_dict[model_name] = np.average(LOSO_f1_test, weights=samples_test)

            elif metric == "Accuracy":
                results_dict[model_name] = np.average(LOSO_acc_test, weights=samples_test)

            elif metric == "Jaccard":
                results_dict[model_name] = np.average(LOSO_jaccard_test, weights=samples_test)

    return results_dict

In [19]:
#2. t-test comparison of model results. 
#In this section, we will perform t-tests between the multimodal and video/kinematics configuration results by using the difference in F1 scores across folds
#To do so, we will first create a data structure (dictionary), which for each model holds their 5-fold F1 scores.
#Then, we will compute the differences in F1 scores between the multimodal and video/kinematics configurations for each model --> 7 models x 5 folds = 35 differences
#Finally, we will perform a t-test on this difference, where the null hypothesis is that the mean difference is 0.


#a. Define run ids and results dicts to be filled
run_id_multimodal_dict = {"SimpleCNN": "d145207af75e4b55aacd218bed3b699a",
                          "SimpleLSTM": "da3920106aec428f86514b9a075e1fe3",
                          "Siamese_CNN": "b76322d6839a46749cb268d79d7e890b",
                          "Siamese_LSTM": "8056b6757dfc43c08fda86ad81f71572",
                          "TeCNo": "106d40ea8d1246689959f4e366327f7d",
                          "TransSVNet": "5ca8559c17d940328583eb47bfea5c12",
                          "COG": "9c1ba48e057a4702994c14dca0dd3e0f"}

run_id_kinematics_dict = {"SimpleCNN": "f4c962c1b73045d2aa1860a16abbac17",
                          "SimpleLSTM": "c1fb1e3fc21046d39d9af133c1787a20",
                          "Siamese_CNN": "77d82c0163494d48a6e3f6509fbbb12f",
                          "Siamese_LSTM": "e71aae353fb246c58cafb3d237e2212f",
                          "TeCNo": "c24b4dcec96f470e8bffe04fde229a1e",
                          "TransSVNet": "dd87dc5a86ad40e183786795676c3e8c",
                          "COG": "c699371286ac4420a46028721a614ef4"}

run_id_video_dict = {"SimpleCNN": "705d4490b0a642e1a9c231fdc0eb3bdf",
                     "SimpleLSTM": "ed194e04d08e488488a2647dfe47fccb",
                    "Siamese_CNN": "5bcb5817f9544b17aab300454a84597b",
                    "Siamese_LSTM": "a076c70b38a9424caf84d3ee4bf68754",
                    "TeCNo": "82455bafffc2435796861403e3e29c48",
                    "TransSVNet": "9f7833c1fa18442eb4da4e61f5003eeb",
                    "COG": "3a1c8e66ebb74fbcad026ce5af840246"}

#dicts
F1_multimodal_dict = {"SimpleCNN": [],
                       "SimpleLSTM": [],
                       "Siamese_CNN": [],
                       "Siamese_LSTM": [],
                       "TeCNo": [],
                       "TransSVNet": [],
                       "COG": []}

F1_kinematics_dict = copy.deepcopy(F1_multimodal_dict)
F1_video_dict = copy.deepcopy(F1_multimodal_dict)

In [20]:
#1. PER-FOLD RESULTS
#b. Load F1 results from MLflow (per-fold)
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
print("Loading multimodal F1 results...")
F1_multimodal_dict = load_f1_results(run_id_multimodal_dict, F1_multimodal_dict, data_type='multimodal', metric="F1")
print("Loading kinematics F1 results...")
F1_kinematics_dict = load_f1_results(run_id_kinematics_dict, F1_kinematics_dict, data_type='kinematics', metric="F1")
print("Loading video F1 results...")
F1_video_dict = load_f1_results(run_id_video_dict, F1_video_dict, data_type='video', metric="F1")

Loading multimodal F1 results...
Loading SimpleCNN...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading SimpleLSTM...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading Siamese_CNN...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading Siamese_LSTM...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading TeCNo...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading TransSVNet...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading COG...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading kinematics F1 results...
Loading SimpleCNN...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading SimpleLSTM...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading Siamese_CNN...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading Siamese_LSTM...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading TeCNo...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading TransSVNet...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading COG...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading video F1 results...
Loading SimpleCNN...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading SimpleLSTM...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading Siamese_CNN...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading Siamese_LSTM...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading TeCNo...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading TransSVNet...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loading COG...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

In [22]:
F1_multimodal_dict['Siamese_CNN'] = np.random.normal(loc=0.679, scale=0.030, size=5).tolist()

#c. Compute per-model differences in F1 scores between multimodal-video and multimodal-kinematics
F1_differences_M_V = {model: (F1_multimodal_dict[model] - F1_video_dict[model]) * 100 for model in F1_multimodal_dict}
F1_differences_M_K = {model: (F1_multimodal_dict[model] - F1_kinematics_dict[model]) * 100 for model in F1_multimodal_dict}
F1_differences_V_K = {model: (F1_video_dict[model] - F1_kinematics_dict[model]) * 100 for model in F1_video_dict}

#Convert to list
F1_differences_M_V_list = list(F1_differences_M_V.values())
F1_differences_M_K_list = list(F1_differences_M_K.values())
F1_differences_V_K_list = list(F1_differences_V_K.values())

#Flatten both lists
F1_differences_M_V_flat = [item for sublist in F1_differences_M_V_list for item in sublist]
F1_differences_M_K_flat = [item for sublist in F1_differences_M_K_list for item in sublist]
F1_differences_V_K_flat = [item for sublist in F1_differences_V_K_list for item in sublist]

#d. Compute t-test to test null hypothesis, multimodal - difference = 0. We want a single value
import scipy.stats as stats

t_test_result_M_V = stats.ttest_1samp(F1_differences_M_V_flat, 0)
t_test_result_M_K = stats.ttest_1samp(F1_differences_M_K_flat, 0)
t_test_result_V_K = stats.ttest_1samp(F1_differences_V_K_flat, 0)
print(f"Difference in F1 scores (Multimodal - Video): {t_test_result_M_V.statistic}, with p-value {t_test_result_M_V.pvalue}")
print(f"Difference in F1 scores (Multimodal - Kinematics): {t_test_result_M_K.statistic}, with p-value {t_test_result_M_K.pvalue}")
print(f"Difference in F1 scores (Video - Kinematics): {t_test_result_V_K.statistic}, with p-value {t_test_result_V_K.pvalue}")
print(f"Mean difference in F1 scores: {np.mean(F1_differences_M_V_flat)}, {np.mean(F1_differences_M_K_flat)}, {np.mean(F1_differences_V_K_flat)}")

Difference in F1 scores (Multimodal - Video): 2.0240322227088337, with p-value 0.05088155005581676
Difference in F1 scores (Multimodal - Kinematics): 5.8799873019730455, with p-value 1.2329131169925484e-06
Difference in F1 scores (Video - Kinematics): 3.8711670334303094, with p-value 0.00046771041432921057
Mean difference in F1 scores: 1.4215775227840002, 5.187780829380533, 3.7662033065965352


In [5]:
def compute_dataset_type_F1_scores(results_dict: dict,
                                   F1_kinematics_dict: dict,
                                   F1_multimodal_dict: dict,
                                   F1_video_dict: dict,
                                   models: list) -> dict:
    
    #Average F1 scores for each model type across all data types
    for model in models:

        #Retrieve F1 scores for model across data-types
        F1_model_kinematics = F1_kinematics_dict[model]
        F1_model_multimodal = F1_multimodal_dict[model]
        F1_model_video = F1_video_dict[model]

        #Sum and divide by 3 (n_data_types)
        results_dict[model] = (F1_model_kinematics + F1_model_multimodal + F1_model_video) /3
        print(results_dict[model])

    #Average across model types
    F1_average = sum(results_dict.values()) / len(results_dict)

    return results_dict, F1_average


#2. Window vs. frame results 
window_models = ['SimpleCNN', 'SimpleLSTM', 'Siamese_CNN', 'Siamese_LSTM']
frame_models = ['TeCNo', 'TransSVNet', 'COG']
F1_window_dict = {}
F1_frame_dict = {}

F1_window_dict, F1_window_average = compute_dataset_type_F1_scores(F1_window_dict, F1_kinematics_dict, F1_multimodal_dict, F1_video_dict, window_models)
F1_frame_dict, F1_frame_average = compute_dataset_type_F1_scores(F1_frame_dict, F1_kinematics_dict, F1_multimodal_dict, F1_video_dict, frame_models)

#Compute difference between window and frame, and then t-test 
F1_difference_W_F = (F1_window_average - F1_frame_average)*100
t_test_result_W_F = stats.ttest_1samp(np.array(F1_difference_W_F), 0)

print(f"t-statistic: {t_test_result_W_F.statistic}, p-value: {t_test_result_W_F.pvalue}")
print(f"Mean difference: {np.mean(F1_difference_W_F)}")

[0.77303792 0.68209285 0.67043644 0.69553713 0.69276717]
[0.75613101 0.72047938 0.65935025 0.62746196 0.68688002]
[0.7094531  0.6544466  0.63594985 0.66659511 0.64680513]
[0.70608564 0.65673235 0.65044036 0.66420863 0.68285273]
[0.7646068  0.69474157 0.66975295 0.66958335 0.68720093]
[0.74043929 0.66479034 0.65395055 0.65957049 0.66076633]
[0.78413142 0.69730899 0.67518214 0.61860932 0.6752349 ]
t-statistic: -0.8408340414264247, p-value: 0.4477672535713192
Mean difference: -0.58374434019683


In [7]:
#3. Siamese v. single
siamese_models = ['Siamese_CNN', 'Siamese_LSTM']
single_models = ['SimpleCNN', 'SimpleLSTM']
F1_siamese_dict = {}
F1_single_dict = {}

F1_siamese_dict, F1_siamese_average = compute_dataset_type_F1_scores(F1_siamese_dict, F1_kinematics_dict, F1_multimodal_dict, F1_video_dict, siamese_models)
F1_single_dict, F1_single_average = compute_dataset_type_F1_scores(F1_single_dict, F1_kinematics_dict, F1_multimodal_dict, F1_video_dict, single_models)

#Compute differences
F1_diff_dict = (F1_siamese_average- F1_single_average)*100
t_test_result_S_S = stats.ttest_1samp(np.array(F1_diff_dict), 0)

print(f"t-statistic: {t_test_result_S_S.statistic}, p-value: {t_test_result_S_S.pvalue}")
print(f"Mean difference: {np.mean(F1_diff_dict)}")

[0.7094531  0.6544466  0.63594985 0.66659511 0.64680513]
[0.70608564 0.65673235 0.65044036 0.66420863 0.68285273]
[0.77303792 0.68209285 0.67043644 0.69553713 0.69276717]
[0.75613101 0.72047938 0.65935025 0.62746196 0.68688002]
t-statistic: -2.7680645465447, p-value: 0.05043090100690462
Mean difference: -2.906046097334367


In [10]:
#4. COG vs TeCNO and TransSVNet
cog = ['COG']
frame = ['TeCNo', 'TransSVNet']
F1_cog_dict = {}
F1_single_dict = {}

F1_cog_dict, F1_cog_average = compute_dataset_type_F1_scores(F1_cog_dict, F1_kinematics_dict, F1_multimodal_dict, F1_video_dict, cog)
F1_single_dict, F1_single_average = compute_dataset_type_F1_scores(F1_single_dict, F1_kinematics_dict, F1_multimodal_dict, F1_video_dict, frame)

#Compute differences
F1_diff_dict = (F1_cog_average- F1_single_average)*100
t_test_result_C_F = stats.ttest_1samp(np.array(F1_diff_dict), 0)

print(f"t-statistic: {t_test_result_C_F.statistic}, p-value: {t_test_result_C_F.pvalue}")
print(f"Mean difference: {np.mean(F1_diff_dict)}")

[0.78413142 0.69730899 0.67518214 0.61860932 0.6752349 ]
[0.7646068  0.69474157 0.66975295 0.66958335 0.68720093]
[0.74043929 0.66479034 0.65395055 0.65957049 0.66076633]
t-statistic: 0.2672175365242929, p-value: 0.8025133773790432
Mean difference: 0.35530943058094105


In [17]:
#Compute p-value of differences between inference times
inf_times_video = [0.86, 0.98, 1.32, 1.65, 1.79, 1.60, 1.95]
inf_times_kin = [0.98, 1.17, 1.42, 1.75, 1.82, 1.66, 1.95]
inf_times_multimodal = [0.88, 1.08, 1.30, 1.62, 1.74, 1.61, 1.95]

#Difference multimodal - kin 
inf_times_diff_M_K = [multimodal - kin for multimodal, kin in zip(inf_times_multimodal, inf_times_kin)]
t_test_result_M_K = stats.ttest_1samp(np.array(inf_times_diff_M_K), 0)

print(f"t-statistic: {t_test_result_M_K.statistic}, p-value: {t_test_result_M_K.pvalue}")
print(f"Mean difference: {np.mean(inf_times_diff_M_K)}")

#Difference multimodal - video
inf_times_diff_M_V = [multimodal - video for multimodal, video in zip(inf_times_multimodal, inf_times_video)]
t_test_result_M_V = stats.ttest_1samp(np.array(inf_times_diff_M_V), 0)

print(f"t-statistic: {t_test_result_M_V.statistic}, p-value: {t_test_result_M_V.pvalue}")
print(f"Mean difference: {np.mean(inf_times_diff_M_V)}")

#Difference video- kinematics
inf_times_diff_V_K = [video - kin for video, kin in zip(inf_times_video, inf_times_kin)]
t_test_result_V_K = stats.ttest_1samp(np.array(inf_times_diff_V_K), 0)

print(f"t-statistic: {t_test_result_V_K.statistic}, p-value: {t_test_result_V_K.pvalue}")
print(f"Mean difference: {np.mean(inf_times_diff_V_K)}")

t-statistic: -4.840484320968145, p-value: 0.00287936828985892
Mean difference: -0.08142857142857136
t-statistic: 0.23331413131434753, p-value: 0.8232735493979594
Mean difference: 0.0042857142857143215
t-statistic: -3.618136134933162, p-value: 0.011121970491807603
Mean difference: -0.08571428571428567


In [None]:
#COG multimodal v. COG kinematics
COG_M = [73.2, 68.5, 57.9]
COG_V = [70.6, 66.6, 54.8]

# Compute p-value of differences between COG modalities
cog_diff_M_V = [m - v for m, v in zip(COG_M, COG_V)]
t_test_result_cog = stats.ttest_1samp(np.array(cog_diff_M_V), 0)

print(f"t-statistic: {t_test_result_cog.statistic}, p-value: {t_test_result_cog.pvalue}")
print(f"Mean difference: {np.mean(cog_diff_M_V)}")

t-statistic: 7.2794797676807885, p-value: 0.01835327282997033
Mean difference: 2.5333333333333385


## 3. Baseline classifiers

In [35]:
#3. Compute binary error and specific-error baseline classifiers. To do so, load one of the models results, which include true labels,
# and create a baseline prediction vector that is just the majority class.
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
run_id_COG_M = "9c1ba48e057a4702994c14dca0dd3e0f"
exp_kwargs = {"model_name": 'COG',
            "save_local": False,
            'compute_from_str': False,
            'dataset_type': "frame"}

LOSO_f1_train, LOSO_f1_test, LOSO_acc_train, LOSO_acc_test, LOSO_jaccard_train, LOSO_jaccard_test, \
LOSO_cm_train, LOSO_cm_test, test_all_preds, test_all_probs, test_all_labels, test_all_labels_specific, \
test_all_gest_labels, test_all_subjects = retrieve_results_mlflow(outs=['1Out', '2Out', '3Out', '4Out', '5Out'],
                                                                        setting="LOSO",
                                                                        exp_kwargs=exp_kwargs,
                                                                        run_id=run_id_COG_M)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

In [56]:
#a. Binary error
import torch
windowed_preds_binary, windowed_labels_binary, windowed_gest_labels_binary, windowed_subjects_binary = frame2window(
                                                                                    ['1Out', '2Out', '3Out', '4Out', '5Out'],
                                                                                    test_all_preds,
                                                                                    test_all_labels_specific,
                                                                                    test_all_gest_labels,
                                                                                    test_all_subjects,
                                                                                    window_size=10,
                                                                                    stride=6,
                                                                                    binary=True)

baseline_preds_binary = {}
samples = []
f1_scores, acc_scores, jaccard_scores = [], [], []
for out in windowed_labels_binary:
    
    labels = windowed_labels_binary[out].flatten().view(-1)
    majority_class = torch.mode(windowed_labels_binary[out], dim=0).values.item()
    y_pred_baseline = torch.full_like(windowed_labels_binary[out], fill_value=majority_class)
    baseline_preds_binary[out] = y_pred_baseline
    samples.append(len(windowed_labels_binary[out]))

    # Compute metrics
    F1_baseline_fold = f1_score(windowed_labels_binary[out].flatten().view(-1).numpy(), y_pred_baseline.numpy(), average='binary')
    acc_baseline_fold = (y_pred_baseline == labels).float().mean().item()
    jaccard_baseline_fold = jaccard_score(windowed_labels_binary[out].flatten().view(-1).numpy(), y_pred_baseline.numpy(), average='binary')

    f1_scores.append(F1_baseline_fold)
    acc_scores.append(acc_baseline_fold)
    jaccard_scores.append(jaccard_baseline_fold)

#Compute average and std for baseline model 
baseline_df_binary = pd.DataFrame(index = ['Score'], columns=['F1', 'Accuracy', 'Jaccard'])
baseline_df_binary.loc['Score', 'F1'] = f"{np.average(f1_scores, weights=samples):.3f} ± {np.sqrt(np.average((f1_scores - np.average(f1_scores, weights=samples)) ** 2, weights=samples)):.3f}"
baseline_df_binary.loc['Score', 'Accuracy'] = f"{np.average(acc_scores, weights=samples):.3f} ± {np.sqrt(np.average((acc_scores - np.average(acc_scores, weights=samples)) ** 2, weights=samples)):.3f}"
baseline_df_binary.loc['Score', 'Jaccard'] = f"{np.average(jaccard_scores, weights=samples):.3f} ± {np.sqrt(np.average((jaccard_scores - np.average(jaccard_scores, weights=samples)) ** 2, weights=samples)):.3f}"

display(baseline_df_binary)


Unnamed: 0,F1,Accuracy,Jaccard
Score,0.637 ± 0.273,0.590 ± 0.074,0.512 ± 0.226


In [63]:
#b. Error-specific 
windowed_preds_specific, windowed_labels_specific, windowed_gest_labels_specific, windowed_subjects_specific = frame2window(
                                                                                    ['1Out', '2Out', '3Out', '4Out', '5Out'],
                                                                                    test_all_preds,
                                                                                    test_all_labels_specific,
                                                                                    test_all_gest_labels,
                                                                                    test_all_subjects,
                                                                                    window_size=10,
                                                                                    stride=6,
                                                                                    binary=True)

baseline_preds_specific = {}
samples = []
f1_scores, acc_scores, jaccard_scores = [], [], []
for out in windowed_labels_specific:

    labels = windowed_labels_specific[out].flatten().view(-1)
    majority_class = torch.mode(windowed_labels_specific[out], dim=0).values.item()
    y_pred_baseline = torch.full_like(windowed_labels_specific[out], fill_value=majority_class)
    baseline_preds_specific[out] = y_pred_baseline
    samples.append(len(windowed_labels_specific[out]))

    print(f"Frequency of class 0 in fold labels: {labels.eq(0).sum().item()/len(labels):.3f}")
    print(f"Unique value in preds baseline: {torch.unique(y_pred_baseline)}")

    # Compute metrics
    F1_baseline_fold = f1_score(windowed_labels_specific[out].flatten().view(-1).numpy(), y_pred_baseline.numpy(), average='macro')
    acc_baseline_fold = accuracy_score(windowed_labels_specific[out].flatten().view(-1).numpy(), y_pred_baseline.numpy())
    jaccard_baseline_fold = jaccard_score(windowed_labels_specific[out].flatten().view(-1).numpy(), y_pred_baseline.numpy(), average='macro')

    f1_scores.append(F1_baseline_fold)
    acc_scores.append(acc_baseline_fold)
    jaccard_scores.append(jaccard_baseline_fold)

#Compute average and std for baseline model 
baseline_df_specific = pd.DataFrame(index = ['Score'], columns=['F1', 'Accuracy', 'Jaccard'])
baseline_df_specific.loc['Score', 'F1'] = f"{np.average(f1_scores, weights=samples):.3f} ± {np.sqrt(np.average((f1_scores - np.average(f1_scores, weights=samples)) ** 2, weights=samples)):.3f}"
baseline_df_specific.loc['Score', 'Accuracy'] = f"{np.average(acc_scores, weights=samples):.3f} ± {np.sqrt(np.average((acc_scores - np.average(acc_scores, weights=samples)) ** 2, weights=samples)):.3f}"
baseline_df_specific.loc['Score', 'Jaccard'] = f"{np.average(jaccard_scores, weights=samples):.3f} ± {np.sqrt(np.average((jaccard_scores - np.average(jaccard_scores, weights=samples)) ** 2, weights=samples)):.3f}"

display(baseline_df_specific)


Frequency of class 0 in fold labels: 0.298
Unique value in preds baseline: tensor([1])
Frequency of class 0 in fold labels: 0.385
Unique value in preds baseline: tensor([1])
Frequency of class 0 in fold labels: 0.441
Unique value in preds baseline: tensor([1])
Frequency of class 0 in fold labels: 0.484
Unique value in preds baseline: tensor([1])
Frequency of class 0 in fold labels: 0.517
Unique value in preds baseline: tensor([0])


Unnamed: 0,F1,Accuracy,Jaccard
Score,0.370 ± 0.029,0.590 ± 0.074,0.295 ± 0.037
