In [1]:
import h5py
from matplotlib.font_manager import findSystemFonts
from scipy.io import loadmat

def iterate_group(data: h5py.Group, final_data: dict):
    """
    Recursively iterate through a h5py Group and print its structure.
    """
    for key in data.keys():
        item = data[key]
        if isinstance(item, h5py.Group):
            final_data[key] = {}
            iterate_group(item, final_data[key])
        else:
            final_data[key] = item[:]
    return final_data

def load_data_matfile(path: str, name: list[str]=[]):
    with h5py.File(path, 'r') as f:
        # List all groups in the file
        # print("Keys in the file:", list(f.keys()))
        # Access the dataset
        
        if len(name) == 0:
            data = {}
            iterate_group(f, data)
            return data
        else:
            result = {}
            for key in name:
                if isinstance(f[key], h5py.Group):
                    # print("key is a group, iterating through it")
                    result[key] = iterate_group(f[key], {})
                else:
                    # print("key is a dataset, returning data")
                    data = f[key][:]
                    data = data.T  # Transpose to match MATLAB's column-major order
                    result[key] = data 
            return result

def load_result_matfile(path: str):
    data = loadmat(path)
    return data

In [3]:
import os

def handle_count_dataset(name: str, data: h5py.Dataset):
    print(f"====== Count: {name} =====")
    
    # Shape of the opbject and type of it : (311, 4) and ndarray
    """
        1st column: subject Ids
        2nd total no.of times it sampled
        3rd total no.of times it is non-noised
        4th % of non-noise labelled
    """
    print(f"Total Subjects: {data.shape[0]}")

def get_stats(site: str):
    print('Dataset: ', site)
    
    PATH="/data/users4/rgirijala1/msproject/result/round_{number}"
    
    typical_sz = {} 
    typical_hc = {}
    rounds=2
    for i in range(rounds):
        path = PATH.format(number=i)

        count_path = os.path.join(path, f'{site}_Count.mat')
        typ_path = os.path.join(path, f'{site}_Typ.mat')
        score_path = os.path.join(path, f'{site}_Score.mat')

        count_mat = load_result_matfile(count_path)['count'][:]
        score_mat = load_result_matfile(score_path)
        typ_mat = load_result_matfile(typ_path)
        
        typ_sz = typ_mat['TypIDG1'].T[0]
        typ_hc = typ_mat['TypIDG2'].T[0]
        
        print(f"Round {i}: ")
        print("total typical subjects: ", len(typ_sz)+len(typ_hc))
        print("total typical SZ subjects: ", len(typ_sz))
        print("total typical HC subjects: ", len(typ_hc))
        
        for i in typ_hc:
            id = int(i)
            if id in typical_hc:
                typical_hc[id]+=1
            else:
                typical_hc[id]=1

        for i in typ_sz:
            id=int(i)
            if id in typical_sz:
                typical_sz[id]+=1
            else:
                typical_sz[id]=1

    print("SZ: Typical Subjects present in all the rounds: ", len([key for key, value in typical_sz.items() if value == rounds]))
    print("SZ: Inconsistent: ", len([key for key, value in typical_sz.items() if value != rounds]))
    
    print("HC: Typical Subjects present in all the rounds: ", len([key for key, value in typical_hc.items() if value == rounds]))
    print("HC: Inconsistent: ", len([key for key, value in typical_hc.items() if value != rounds]))


names=['FBIRN', 'COBRE']

for site in names:
    get_stats(site)
    print()


Dataset:  FBIRN
Round 0: 
total typical subjects:  191
total typical SZ subjects:  99
total typical HC subjects:  92
Round 1: 
total typical subjects:  191
total typical SZ subjects:  99
total typical HC subjects:  92
SZ: Typical Subjects present in all the rounds:  99
SZ: Inconsistent:  0
HC: Typical Subjects present in all the rounds:  92
HC: Inconsistent:  0

Dataset:  COBRE
Round 0: 
total typical subjects:  103
total typical SZ subjects:  55
total typical HC subjects:  48
Round 1: 
total typical subjects:  103
total typical SZ subjects:  55
total typical HC subjects:  48
SZ: Typical Subjects present in all the rounds:  55
SZ: Inconsistent:  0
HC: Typical Subjects present in all the rounds:  48
HC: Inconsistent:  0



In [26]:
import numpy as np


DataName = ['FBIRN', 'COBRE']

for i in DataName:
    content = load_data_matfile(f'data/{i}.mat', [i])
    data = content[i]
    
    print('Shape of the dataset: ', i) # no.of subjects with features columns
    print(data.shape)
    
    print('last column have the labels for each subject') # 1-SZ , 2-HC
    last_column = data[:,-1]
    print(last_column[:10])
    
    total_subjects = data.shape[0]
    sz_count = np.count_nonzero(last_column == 1)
    hc_count = total_subjects - sz_count
    print('count of SZ: ', sz_count)
    print('count of Hc: ', hc_count)
    
    print('\n\n')

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'data/FBIRN.mat', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

### non-noise rate from CRF

In [8]:
import h5py

def handle_count_dataset(name: str, data: h5py.Dataset):
    print(f"====== Count: {name} =====")
    
    # Shape of the opbject and type of it : (311, 4) and ndarray
    """
        1st column: subject Ids
        2nd total no.of times it sampled
        3rd total no.of times it is non-noised
        4th % of non-noise labelled
    """
    print(f"Shape: {data.shape} and type of result: {type(data)}")
    
    print("first 5 rows of the count matrix")
    print(data[:5])
    
def handle_nlctlabels_dataset(name: str, data: h5py.Dataset):
    print(f"====== NLCTLabels: {name} =====")
    
    # Shape and type: 1 row and 101 columns ( total no.of iterations) , ndarray
    print(f"Shape: {data.shape} and type of result: {type(data)}")
    
    # each column is again a sampled no of rows x 2*ntrees columns
    print("Each Column data shape: ", data[0].shape) # (216, 402)
    
    # print 5 tree output for 5 sampled subjects in 1st iteration.
    # gives each tree decision label, 1->noise, 0 - non-noise
    print("first iteration, first 5 subjects label decision by 5 trees")
    print(data[0][:][:5])

def handle_nonNoise_dataset(name: str, data: h5py.Dataset):
    print(f"====== NonNoise Ids/row numbers of Dataset: {name} =====")
    
    # shape and type of dataset, 1 row and 101 columns ( total no.of iterations), ndarray
    print(data.shape, type(data))
    
    # each column is: an array of subjectId, which are non-noise
    print("first iteration subject ID's resulting non-noise")
    print(data[0][0].T)
    
DataName = ['COBRE']
for i in DataName:
    path = f'result/{i}_Count.mat'
    data = load_result_matfile(path)
    handle_count_dataset(i, data['count'][:])
    print()
    handle_nlctlabels_dataset(i, data['NLTCLabelS'][:])
    print()
    handle_nonNoise_dataset(i, data['nonNoiseDataInd'][:])
    
    print("\n\n")


Shape: (157, 4) and type of result: <class 'numpy.ndarray'>
first 5 rows of the count matrix
[[ 1.         81.         80.          0.98765432]
 [ 2.         86.         86.          1.        ]
 [ 3.         76.         75.          0.98684211]
 [ 4.         54.         54.          1.        ]
 [ 5.         62.         62.          1.        ]]

Shape: (1, 101) and type of result: <class 'numpy.ndarray'>
Each Column data shape:  (101,)
first iteration, first 5 subjects label decision by 5 trees
[array([[1, 0, 1, ..., 1, 0, 0],
        [0, 1, 0, ..., 1, 1, 0],
        [0, 1, 1, ..., 1, 1, 1],
        ...,
        [1, 0, 0, ..., 1, 0, 0],
        [0, 1, 1, ..., 0, 1, 1],
        [1, 1, 1, ..., 0, 0, 1]], shape=(108, 402), dtype=uint8)
 array([[0, 1, 1, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 1, 1],
        [1, 1, 0, ..., 0, 0, 0],
        ...,
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0]], shape=(108, 402), dtype=uint8)
 array([[0, 

### Typical Subjects ID's

In [None]:
DataName = ['FBIRN', 'COBRE']

def handle_typical_groups(name: str, group: str, data: h5py.Dataset):
    print(f"Dataset: {i}", f"Group: {group}")
    
    # single row, consisting of subject ID's 
    # representing typical subjects after all the iteration 
    print(f"shape: {data.shape} and type: {type(data)}")
    print("Total no.of Typical Subjects: ", data.shape[0])
    
    data = data.T[0]
    
    print(data)

for i in DataName:
    path= f'../result/round_0/{i}_Typ.mat'
    data = load_result_matfile(path)
    
    handle_typical_groups(i, "SZ", data['TypIDG1'])
    handle_typical_groups(i, "HC", data['TypIDG2'])
    
    print('\n\n')

Dataset: FBIRN Group: SZ
shape: (99, 1) and type: <class 'numpy.ndarray'>
Total no.of Typical Subjects:  99
[  1   2   4   6   8  19  21  24  25  28  40  42  44  45  47  49  52  57
  62  63  64  69  70  73  74  78  79  80  81  87  93  94  95  99 100 102
 108 114 120 121 122 126 128 131 133 139 140 143 148 154 155 160 162 167
 168 170 171 182 186 189 194 195 198 202 203 204 205 206 210 221 225 229
 230 232 234 235 241 242 244 247 251 253 256 260 261 268 273 276 280 282
 284 287 290 291 298 300 303 304 306]
Dataset: FBIRN Group: HC
shape: (92, 1) and type: <class 'numpy.ndarray'>
Total no.of Typical Subjects:  92
[  7  10  11  12  15  16  20  23  26  27  29  34  37  43  46  48  50  53
  56  58  59  72  76  82  85  86  88  96  98 101 106 107 111 119 123 124
 127 136 141 142 145 149 151 157 161 163 164 165 172 173 180 184 188 191
 192 207 209 212 213 217 219 220 223 226 233 236 237 238 239 240 243 246
 250 258 262 263 264 266 274 275 277 278 281 285 286 288 294 295 296 305
 310 311]



Dat

### Score Analysis

- Positive ⇒ subject looks more like COBRE’s SZ-typicals
- Negative ⇒ subject looks more like COBRE’s HC-typicals
- Zero ⇒ equidistant

In [None]:
import numpy as np

def handle_Indep_Score(name: str, data: h5py.Dataset):
    print(f"======= Independent Score: {name} ========")
    
    # total subjects of current dataset name, then its classification comparing other dataset 
    print(f"Shape: {data.shape} and type: {type(data)}")
    print("top 5 subjects and there score comparing other dataset")
    print(data[:5][:])
    
    total_subjects = data.shape[0]
    avg_column = data[:, -1]
    sz_subjects_count = np.count_nonzero(avg_column >= 0)
    hc_subjects_count = np.count_nonzero(avg_column < 0)
    print("count of subjects having SZ: ", sz_subjects_count)
    print("count of subjects who are healthy: ", hc_subjects_count)
    

DataName = ['FBIRN', 'COBRE']
for i in DataName:
    path = f'result/{i}_Score.mat'
    data = load_result_matfile(path)
    handle_Indep_Score(i, data['IndepScore'][:])
    # print('\n\n')

Shape: (311, 5) and type: <class 'numpy.ndarray'>
top 5 subjects and there score comparing other dataset
[[ 1.         -0.34937957  0.          0.         -0.34937957]
 [ 1.         -0.46927604  0.          0.         -0.46927604]
 [ 1.         -0.02209151  0.          0.         -0.02209151]
 [ 1.         -0.32279348  0.          0.         -0.32279348]
 [ 2.         -0.41399117  0.          0.         -0.41399117]]
count of subjects having SZ:  173
count of subjects who are healthy:  138
Shape: (157, 5) and type: <class 'numpy.ndarray'>
top 5 subjects and there score comparing other dataset
[[-0.20494235  1.          0.          0.         -0.20494235]
 [-0.44445948  1.          0.          0.         -0.44445948]
 [-0.22080278  1.          0.          0.         -0.22080278]
 [ 0.06022404  2.          0.          0.          0.06022404]
 [ 0.53488137  2.          0.          0.          0.53488137]]
count of subjects having SZ:  42
count of subjects who are healthy:  115
