In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import h5py

In [2]:
#Machine-specific absolute paths: adjust both before running on another computer.
#mat_dir: folder with the per-subject result files (resultsROI_Subject..._Condition....mat)
mat_dir = r"C:/Users/leosc/OneDrive/Desktop/WiSe 21-22/Innolab/Test/SBC_01"

#ex_dir: parent folder of mat_dir; the Excel sheet read further down is stored here
ex_dir = r"C:\Users\leosc\OneDrive\Desktop\WiSe 21-22\Innolab\Test"
#NOTE: this changes the working directory of the whole kernel, so every relative
#file name used below (e.g. 'data.h5') is resolved against mat_dir
os.chdir(mat_dir)




In [3]:
def example_av_keys(file_name='resultsROI_Subject006_Condition001.mat'):
    """
    demonstrates what the raw matrix looks like
    displays all available keys or objects that can be imported

    Parameters
    ----------
    file_name : str, optional
        path of the HDF5-based .mat file to inspect; a relative path is
        resolved against the current working directory. Defaults to the
        example file of subject 6.

    Returns
    -------
    tuple
        (content of the "Z" entry as np.ndarray, list of all top-level keys).
        If the file has no "Z" entry, ``f.get`` yields None and the first
        element is a 0-d object array wrapping None.
    """
    # opened read-only; the context manager closes the file again
    with h5py.File(file_name, 'r') as f:
        return np.array(f.get("Z")), list(f.keys())

#the file name used inside the function is relative, so this call relies on the
#os.chdir(mat_dir) executed in the configuration cell
example_av_keys()

(array([[        nan,  0.90993451,  0.22249696, ..., -0.01168092,
          0.33869942,  0.13040516],
        [ 0.90993451,         nan, -0.07541672, ..., -0.02669248,
          0.27368348,  0.12974607],
        [ 0.22249696, -0.07541672,         nan, ..., -0.0457409 ,
          0.0292011 ,  0.10908172],
        ...,
        [-0.01168092, -0.02669248, -0.0457409 , ...,         nan,
          0.15965731, -0.03555815],
        [ 0.33869942,  0.27368348,  0.0292011 , ...,  0.15965731,
                 nan,  0.70761469],
        [ 0.13040516,  0.12974607,  0.10908172, ..., -0.03555815,
          0.70761469,         nan]]),
 ['#refs#', 'DOF', 'SE', 'Z', 'names', 'names2', 'regressors', 'xyz'])

In [4]:
def col_names_matrix(n):
    """
    builds the column labels for the flattened connectivity matrix

    every pair of 1-based indices (row, col) with row < col gets the label
    "row_col"; pairs are listed row by row, i.e. in the order of the
    upper triangle of an n x n matrix (diagonal excluded)
    """
    labels = []
    for row in range(1, n + 1):
        # only columns to the right of the diagonal
        for col in range(row + 1, n + 1):
            labels.append(f"{row}_{col}")
    return labels

#quick check with n = 5: each pair i < j shows up exactly once (10 labels)
col_names_matrix(5)

['1_2', '1_3', '1_4', '1_5', '2_3', '2_4', '2_5', '3_4', '3_5', '4_5']

In [9]:
def flatten_conn_matrix(matrix):
    """
    turns the connectivity matrix into a 1d array

    Only the entries strictly above the main diagonal are kept, read row by
    row (the same order in which col_names_matrix builds its labels).

    Parameters
    ----------
    matrix : np.ndarray
        array whose first two dimensions are equal (n x n)

    Returns
    -------
    np.ndarray
        the n * (n - 1) / 2 upper-triangle values; empty for a 1 x 1 input

    Raises
    ------
    TypeError
        if `matrix` is not a numpy array
    ValueError
        if `matrix` has fewer than two dimensions or is not square
    """
    # Fail loudly: a returned marker string would only surface later as a
    # confusing error (or a string array) when the results are stacked.
    if not isinstance(matrix, np.ndarray):
        raise TypeError(
            f"expected a numpy.ndarray, got {type(matrix).__name__}"
        )
    # A non-square input would otherwise be truncated silently or fail with
    # an unrelated IndexError.
    if matrix.ndim < 2 or matrix.shape[0] != matrix.shape[1]:
        raise ValueError(f"expected a square matrix, got shape {matrix.shape}")

    n_rows = matrix.shape[0]
    # k=1 skips the diagonal itself
    return matrix[np.triu_indices(n_rows, k=1)]
    
    
#NOTE(review): `res` is only assigned in a later cell (res = load_matlab_files(...)),
#so that cell has to be executed before this demo call
flatten_conn_matrix(res[0][1])

array([ 0.70011522,  0.81550539,  0.48091639, ..., -0.11764372,
       -0.09283724,  0.62196728])

In [6]:
def load_matlab_files(directory):
    """
    imports the "Z" matrix of all matlab (.mat) files from specified directory

    Parameters
    ----------
    directory : str
        folder that contains the HDF5-based .mat files

    Returns
    -------
    tuple or None
        (conn_matrices, worked): the list of loaded "Z" arrays and the list of
        file names they came from, both in ascending file-name order.
        None if `directory` does not exist or is not a directory
        ("invalid path" is printed in that case).

    Notes
    -----
    The working directory of the process is changed to `directory` and is
    not restored afterwards.
    """

    try:
        os.chdir(directory)
    except (FileNotFoundError, NotADirectoryError):
        print("invalid path")
        return None

    # os.listdir gives no ordering guarantee; sorting keeps the row order
    # reproducible. Entries that are not .mat files (for example an HDF5
    # export saved into the same folder) are ignored.
    mat_files_names = sorted(
        name for name in os.listdir()
        if name.lower().endswith(".mat") and os.path.isfile(name)
    )
    conn_matrices = []
    worked = []

    for file_name in mat_files_names:
        with h5py.File(file_name, 'r') as f:
            # without this check a missing entry would be stored as array(None)
            if "Z" not in f:
                print("no 'Z' matrix in " + file_name + ", skipped")
                continue
            conn_matrices.append(np.array(f["Z"]))
            worked.append(file_name)

    return conn_matrices, worked


In [8]:
#get subject-IDs
def get_subject_ids(file_names):
    """
    extracts the subject ID from every file name

    expects names in the format resultsROI_Subject006_Condition001.mat:
    the three characters right after the first "Subject" are read as an
    integer, so the example maps to subject ID 6

    returns the IDs as a numpy array, in the order of `file_names`
    """

    subject_ids = []
    for file_name in file_names:
        # everything behind the first occurrence of "Subject"
        after_marker = file_name.split("Subject", 1)[1]
        subject_ids.append(int(after_marker[:3]))
    return np.array(subject_ids)


def find_missing_subject(subj_ids, max_subj_id):
    """
    returns the subject IDs from 1 to max_subj_id (inclusive)
    that do not occur in subj_ids, as a sorted numpy array
    """

    expected_ids = np.array(list(range(1, max_subj_id + 1)))
    missing_ids = np.setdiff1d(expected_ids, subj_ids)
    return missing_ids

#print(get_subject_ids.__doc__)

array([160])

In [11]:
%%time

def stack_matrices(matrices):
    """
    flattens every connectivity matrix and stacks the results

    Parameters
    ----------
    matrices : iterable
        connectivity matrices, each passed through flatten_conn_matrix

    Returns
    -------
    np.ndarray
        the flattened matrices stacked along axis 0 (one entry per input
        matrix, in input order)

    Raises
    ------
    ValueError
        if `matrices` is empty, if one matrix could not be flattened into an
        array, or if the flattened results do not all have the same shape
    """

    flattened = []
    for position, matrix in enumerate(matrices):
        row = flatten_conn_matrix(matrix)
        # anything other than an array cannot be stacked meaningfully;
        # report which input caused it instead of failing inside np.stack
        if not isinstance(row, np.ndarray):
            raise ValueError(
                f"matrix at position {position} could not be flattened: {row!r}"
            )
        if flattened and row.shape != flattened[0].shape:
            raise ValueError(
                f"matrix at position {position} flattens to shape {row.shape}, "
                f"expected {flattened[0].shape}"
            )
        flattened.append(row)

    if not flattened:
        raise ValueError("need at least one matrix to stack")

    return np.stack(flattened, axis=0)
    
#NOTE(review): `res` is only assigned in the following cell (res = load_matlab_files(...)),
#so that cell has to be executed before this one
stacked_conn_matrices = stack_matrices(res[0])

Wall time: 870 ms


In [7]:
%%time
#load all matrices from the data folder configured at the top of the notebook
#res[0] holds the matrices, res[1] the corresponding file names
res = load_matlab_files(mat_dir)

Wall time: 6.19 s


In [12]:
#put the subject IDs in front of the stacked matrices as the first column
ids = get_subject_ids(res[1])
ids_added = np.column_stack((ids, stacked_conn_matrices))

In [13]:
#load excel
#Note: a few rows at the end contain an Apoe score but no other data
#the sheet is stored in ex_dir (defined in the configuration cell)
excel_path = os.path.join(ex_dir, "DELCODE_dataset.xlsx")
delcode_excel = pd.read_excel(excel_path)

In [14]:
#column labels: the ID column first, then one label per pair of regions
#246 is presumably the number of rows/columns of each connectivity matrix -- TODO confirm
colnames = ["IDs", *col_names_matrix(246)]


In [24]:
%%time
#creating final df
#The connectivity values get their own frame and are joined column-wise, so the
#Excel columns keep their dtypes (pushing everything through one NumPy array
#turns every column into `object` because the sheet contains text columns).
#NOTE(review): rows are matched purely by position -- presumably the Excel rows
#are ordered like the loaded subject files; verify against the "IDs" column.
final_columns = list(delcode_excel.columns) + colnames
if len(delcode_excel) != len(ids_added):
    raise ValueError(
        f"row count mismatch: {len(delcode_excel)} Excel rows "
        f"vs {len(ids_added)} connectivity rows"
    )
conn_df = pd.DataFrame(ids_added, columns=colnames)
final_df = pd.concat([delcode_excel.reset_index(drop=True), conn_df], axis=1)


Wall time: 5.35 s


In [None]:
#getting memory usage ("deep" also counts the content of object columns)
final_df.info(memory_usage="deep")

In [16]:
#first look at DF
#only the last expression of a cell is displayed, so the shape is printed explicitly
print(final_df.shape)
final_df.head()

Unnamed: 0,ConnID,Repseudonym,siteid,age,visdat,sex,prmdiag,edyears,MEM_score,Apoe,...,242_243,242_244,242_245,242_246,243_244,243_245,243_246,244_245,244_246,245_246
0,1,0a8d02f2b,11,66,17.08.2016,0,2,17,0.054016,0,...,0.655094,0.547549,-0.256346,0.0799051,0.504077,0.0314353,0.180466,0.0953076,-0.062235,0.210861
1,2,0a71a953d,17,72,30.03.2015,0,1,20,-0.468749,1,...,0.795011,0.624784,-0.115689,0.0555967,0.684795,0.212303,0.193802,-0.117644,-0.0928372,0.621967
2,3,0a61339db,11,72,13.05.2015,1,1,16,-0.093521,0,...,0.209853,0.166607,0.0561474,0.159351,0.721864,0.107519,0.329301,0.0552936,0.39387,0.269579
3,4,0b28aed58,17,76,18.01.2016,0,1,20,0.466027,0,...,0.547318,0.724798,0.205874,0.193587,0.855264,0.0857812,0.333715,0.231567,0.28501,0.780887
4,5,0c1c5ae77,8,64,12.03.2015,1,1,13,1.4521,0,...,0.901057,0.547402,-0.0375842,0.0894263,0.509276,0.122451,0.308097,0.330322,0.360479,0.404907


In [20]:
%%time
#saving to hdf file
#relative file name: the file is written to the current working directory
final_df.to_hdf('data.h5', key='df', mode='w')

Wall time: 7.42 s


In [21]:
%%time
#checking that it loads correctly
read_df = pd.read_hdf('data.h5', 'df')
#only the last expression of a cell is displayed, so the shape is printed explicitly
print(read_df.shape)
read_df.head()

Wall time: 9.6 s


Unnamed: 0,ConnID,Repseudonym,siteid,age,visdat,sex,prmdiag,edyears,MEM_score,Apoe,...,242_243,242_244,242_245,242_246,243_244,243_245,243_246,244_245,244_246,245_246
0,1,0a8d02f2b,11,66,17.08.2016,0,2,17,0.054016,0,...,0.655094,0.547549,-0.256346,0.0799051,0.504077,0.0314353,0.180466,0.0953076,-0.062235,0.210861
1,2,0a71a953d,17,72,30.03.2015,0,1,20,-0.468749,1,...,0.795011,0.624784,-0.115689,0.0555967,0.684795,0.212303,0.193802,-0.117644,-0.0928372,0.621967
2,3,0a61339db,11,72,13.05.2015,1,1,16,-0.093521,0,...,0.209853,0.166607,0.0561474,0.159351,0.721864,0.107519,0.329301,0.0552936,0.39387,0.269579
3,4,0b28aed58,17,76,18.01.2016,0,1,20,0.466027,0,...,0.547318,0.724798,0.205874,0.193587,0.855264,0.0857812,0.333715,0.231567,0.28501,0.780887
4,5,0c1c5ae77,8,64,12.03.2015,1,1,13,1.4521,0,...,0.901057,0.547402,-0.0375842,0.0894263,0.509276,0.122451,0.308097,0.330322,0.360479,0.404907
