### Converting Original Dataset

In [26]:
import pandas as pd
import numpy as np
import h5py
from scipy.io import loadmat
from enum import Enum

In [27]:
def decode_utf16_array(array):
    """Decode an array of UTF-16 code units (MATLAB char data) to a string.

    Args:
        array: Array-like of integer UTF-16 code units of any rank. Code
            units equal to zero are dropped before decoding.

    Returns:
        The decoded text, reading the code units in flattened (row-major)
        order.
    """
    # ravel() accepts any rank (0-d, 1-d, N-d), not only the 2-D case.
    code_units = np.asarray(array).ravel()
    code_units = code_units[code_units != 0].astype('<u2')
    # Decode as real UTF-16 so a surrogate pair becomes one character rather
    # than two lone surrogates; 'surrogatepass' keeps an unpaired surrogate
    # instead of raising.
    return code_units.tobytes().decode('utf-16-le', errors='surrogatepass')


def handle_cell_string_dataset(dataset, file_handle):
    """Dereference the first column of a reference dataset into a list.

    For each row, the reference at ``dataset[row, 0]`` is looked up in
    ``file_handle`` and read in full. ``bytes`` values are decoded as UTF-8,
    uint16 arrays are decoded with ``decode_utf16_array``, and anything else
    is returned unchanged.
    """
    def _resolve(reference):
        payload = file_handle[reference][()]
        if isinstance(payload, bytes):
            return payload.decode('utf-8')
        is_utf16 = isinstance(payload, np.ndarray) and payload.dtype == np.uint16
        return decode_utf16_array(payload) if is_utf16 else payload

    return [_resolve(dataset[row, 0]) for row in range(dataset.shape[0])]

def iterate_group(data: h5py.Group, final_data: dict):
    """
    Recursively copy the contents of an h5py Group into a nested dict.

    Sub-groups become nested dicts keyed by member name; every other member
    is read fully into memory and stored under its name.

    Args:
        data: Group (or open file) to walk.
        final_data: Dict that is filled in place.

    Returns:
        The same ``final_data`` object, for convenience.
    """
    for key, item in data.items():
        if isinstance(item, h5py.Group):
            final_data[key] = {}
            iterate_group(item, final_data[key])
        else:
            # `[()]` reads the whole dataset whatever its shape, whereas
            # `[:]` is rejected by h5py for scalar datasets.
            final_data[key] = item[()]
    return final_data

def load_data_matfile(path: str, name: list[str]=None):
    """Load variables from an HDF5-based .mat file.

    Args:
        path: Path of the file to open read-only with h5py.
        name: Top-level keys to load. When omitted (``None``) or empty, the
            whole file is loaded recursively.

    Returns:
        A dict keyed by variable name. Groups are returned as nested dicts,
        ``'FILE_ID'`` as a list resolved through
        ``handle_cell_string_dataset``, and every other dataset as an array
        with its axes reversed.

    Raises:
        KeyError: If a requested key is not present in the file.
    """
    with h5py.File(path, 'r') as f:
        # `not name` covers both the None default and an empty list; the
        # previous `len(name)` check raised TypeError for the default.
        if not name:
            return iterate_group(f, {})

        result = {}
        for key in name:
            item = f[key]
            if isinstance(item, h5py.Group):
                result[key] = iterate_group(item, {})
            elif key == 'FILE_ID':
                result[key] = handle_cell_string_dataset(item, f)
            else:
                # Transpose to match MATLAB's column-major order.
                result[key] = item[:].T
        return result

In [28]:
class SourceDataKeys(Enum):
    """Names of the variables looked up in the original .mat file."""

    # Loaded as a list of strings; searched for the "diagnosis" entry.
    FILE_ID = "FILE_ID"
    ANALYSIS_ID = "analysis_ID"
    # Score table from which one column is taken as the label.
    ANALYSIS_SCORE = "analysis_SCORE"
    # Per-subject FNC matrices.
    SFNC = "sFNC"

In [29]:
def convert_fnc_to_features(fnc_path):
    """Flatten each subject's FNC matrix into a feature row and append its label.

    Args:
        fnc_path: Path of the .mat file that holds the ``sFNC``, ``FILE_ID``
            and ``analysis_SCORE`` variables.

    Returns:
        A 2-D array with one row per subject: the strictly-lower-triangle
        entries of that subject's FNC matrix (row-major order, rounded to
        float32) followed by one label column.

    Raises:
        ValueError: If no ``FILE_ID`` entry contains "diagnosis".
    """
    # Load from the path the caller passed; it was previously ignored in
    # favour of a hard-coded file.
    original_data = load_data_matfile(
        fnc_path,
        name=[
            SourceDataKeys.SFNC.value,
            SourceDataKeys.FILE_ID.value,
            SourceDataKeys.ANALYSIS_SCORE.value,
        ],
    )

    # NOTE(review): the score column is located by searching FILE_ID, as the
    # original code did; confirm the column names are not in analysis_ID.
    file_ids = original_data[SourceDataKeys.FILE_ID.value]
    label_index = next(
        (
            index
            for index, entry in enumerate(file_ids)
            if isinstance(entry, str) and "diagnosis" in entry.lower()
        ),
        None,
    )
    if label_index is None:
        # A -1 sentinel would silently select the last score column.
        raise ValueError(
            f"no 'diagnosis' entry found in {SourceDataKeys.FILE_ID.value} of {fnc_path!r}"
        )

    scores = original_data[SourceDataKeys.ANALYSIS_SCORE.value]
    labels = scores[:, label_index].reshape(-1, 1)
    fnc_matrices = original_data[SourceDataKeys.SFNC.value]

    # Select the strictly-lower triangle by position, not by `!= 0`, so that
    # genuine zero-valued entries are kept and every row has the same length.
    # assumes fnc_matrices is (subjects, n, n) — TODO confirm
    rows, cols = np.tril_indices(fnc_matrices.shape[1], k=-1)
    source_data = fnc_matrices[:, rows, cols].astype(np.float32)

    return np.hstack((source_data, labels))

# Build the feature/label matrix, passing the COBRE .mat path to the converter.
# NOTE(review): confirm convert_fnc_to_features actually reads the path given here.
fnc_dataset = convert_fnc_to_features("../original_dataset/COBRE.mat")