### Import packages

In [1]:
import pickle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import glob 
import os
import json
from matplotlib.colors import LogNorm

In [2]:
#root_folder_path = "N:\\durable\\sound-and-ecg\\2024-10-Maja-restructured\\Data"
root_folder_path = "C:\\Users\\MajaE\\src\\repos\\master_ML\\Data"

### Extract data

In [3]:
def extract_test_parameters(test_file):
    true,predictions,predictions_proba,test_recording_ids,segment_parameters =[],[],[],[],[]
    with open(test_file, 'rb') as handle:
        true,predictions,predictions_proba,test_recording_ids, segment_parameters = pickle.load(handle) 
    return true,predictions,predictions_proba,test_recording_ids,segment_parameters

### Analyze metrics

In [4]:
def calculate_metrics(true,predictions):
    accuracy = accuracy_score(true, predictions) # ((TN + TP) / (TN + FN + TP + FP))
    precision = precision_score(true, predictions,zero_division=np.nan) # Positive predictive rate -> TP/(TP+FP)
    recall = recall_score(true, predictions,zero_division=np.nan) # Sensitivity -> TP/(TP+FN)
    f1 = f1_score(true,predictions,zero_division=np.nan)
    specificity = 0
    return accuracy,recall,specificity,precision,f1

### Plot ROC curve

In [5]:
def find_avg_performance(folder_path):
    test_files = glob.glob(os.path.join(folder_path,'*.pickle'))
    true, predictions, predictions_proba, ids, segment_parameters = [], [], [], [], []
    for test_file in test_files:
        true_subject, predictions_subject,predictions_proba_subject,test_recording_ids,segment_parameters_subject = extract_test_parameters(test_file)
        true.extend(true_subject)
        predictions.extend(predictions_subject)
        predictions_proba.extend(predictions_proba_subject)
        ids.extend(test_recording_ids)
        segment_parameters.extend([segment_parameters_subject])
    accuracy_vector, specificity_vector, recall_vector, precision_vector, f1_vector = [], [], [], [], []
    for subject_idx in range(len(true)):
        accuracy,specificity,recall,precision,f1 = calculate_metrics(true[subject_idx],predictions[subject_idx])
        accuracy_vector.append(accuracy)
        specificity_vector.append(specificity)
        recall_vector.append(recall)
        precision_vector.append(precision)
        f1_vector.append(f1)
    accuracy = np.mean(accuracy_vector)
    specificity = np.mean(specificity_vector)
    recall = np.mean(recall_vector)
    precision = np.mean(precision_vector)
    f1 = np.mean(f1_vector)
    return accuracy, f1

In [6]:
def feature_selection_performance(root_folder_path):
    accuracy = []
    f1 = []
    data_folder_path = os.path.join(root_folder_path,"data")
    for iteration in os.listdir(data_folder_path):
        iteration_path = os.path.join(data_folder_path,iteration)
        i_accuracy, i_f1 = find_avg_performance(iteration_path)
        accuracy.append(i_accuracy)
        f1.append(i_f1)
    parameters_path = f"{root_folder_path}\\parameters.json"
    test_values = []
    with open(parameters_path, 'r') as file:
        meta_data = json.load(file)
        test_values = meta_data["test_values"]
    return accuracy, test_values


In [7]:
def feature_selection_curve(selection_algorithms):
    plt.figure()
    for selection_algorithm in selection_algorithms:
        performance, n_features = feature_selection_performance(selection_algorithm)
        plt.plot(n_features,performance,label=selection_algorithm)
    plt.xlabel('Number of selected features')
    plt.ylabel('Performance')
    plt.title('Performance of LDA')
    plt.legend()
    plt.grid(True)
    plt.show()

In [10]:
classes = ['No meal','Meal']
#selection_algorithms = ["mutual_information_LDA","RFE_LDA","sfs_LDA"]
selection_algorithms = ["mutual_information_HMM"]
feature_selection_curve(selection_algorithms)

ValueError: Found input variables with inconsistent numbers of samples: [3359, 3289]

<Figure size 640x480 with 0 Axes>