In [2]:
from load_data import data_loader

# Load the three matrices ('fa', 'func', 'gm') returned by the project's data_loader helper.
fa, func, gm = data_loader()

Extracting all the files now...
Done!


### Creating a reference brain network from the data gathered in control group.
In order to detect alterations in patients' brain networks, we need a reference healthy brain network. Therefore, we will create such a reference from data obtained from the control group (N=18).

The matrices introduced in the data frames ('fa', 'func', 'gm') contain 0 values (see below). A zero value in a matrix means that there is no connection between two regions (matrices: 'func' and 'gm') or that there is no white matter tract ('fa'). However, due to preprocessing steps, the matrices of particular subjects may contain artefacts, e.g. showing a connection (a value greater than 0) where there actually isn't one. To diminish the influence of these artefacts, we assumed that there is a connection between two brain regions only if the value is >0 in at least 60% (N=11) of the control subjects. This was calculated in the following steps:

<html>Step 1: Create a dataframe containing data from the control group only.<br/>
Step 2: Create a list of columns that contain at least 11 non-zero values.<br/>
Step 3: Create new, filtered dataframes containing data of patients and controls using the list from Step 2.</html>

#### Step 1: Create a dataframe containing data from the control group only.

In [50]:
# Import information about patients and control group from "subject_clinical_data.xlsx"
from pathlib import Path

import pandas as pd

# Build the path with pathlib instead of a backslash-separated string literal:
# a backslash followed by "s" is an invalid escape sequence in a normal string,
# and a backslash separator only resolves to a directory on Windows.
clinical_data_path = Path("data") / "subject_clinical_data.xlsx"

# Only the subject id and the group flag are read here.
# The frame keeps its default integer index, so no index_col / reset_index
# round trip is needed to get 'id' back as a regular column.
clinical_data = pd.read_excel(clinical_data_path, usecols=["id", "controls_ms"])
print(clinical_data.shape)
clinical_data.head()

(165, 2)


Unnamed: 0,id,controls_ms
0,002MSVIS,1
1,003MSVIS,1
2,004MSVIS,1
3,005MSVIS,1
4,010MSVIS,1


In [51]:
# Join the group information with the fa, func and gm matrices and keep the controls only.
list_of_matrices = [fa, func, gm]
new_names = ["fa_c", "func_c", "gm_c"]  # fa_c - fa controls etc.
new_dfs = {}
for name, m in zip(new_names, list_of_matrices):
    # The merge matches each matrix's index against the 'id' column of clinical_data.
    merged = m.merge(clinical_data, left_index=True, right_on='id')
    # Rows with controls_ms == 0 are kept as the control group.
    new_dfs[name] = merged[merged.controls_ms == 0]
    print(name, new_dfs[name].shape)

# Bind the control-group frames to explicit names. Writing into locals() only
# happens to work at module scope and hides from readers and tools where
# fa_c / func_c / gm_c are defined.
fa_c, func_c, gm_c = (new_dfs[name] for name in new_names)


fa_c (18, 2852)
func_c (18, 2852)
gm_c (18, 2852)


#### Step 2: Create a list of columns that contain at least 11 non-zero values.

In [57]:
# Count, for every column of the fa, func and gm control frames, how many values equal 0.
import numpy as np

list_of_matrices = [fa_c, func_c, gm_c]
list_of_names = ["fa_z", "func_z", "gm_z"]
list_of_columns = list(fa_c.columns)  # the column list of fa_c is reused for all three frames

# Dict_Value maps each name to a list of zero counts, ordered like list_of_columns.
Dict_Value = {}
for key, frame in zip(list_of_names, list_of_matrices):
    Dict_Value[key] = [
        np.count_nonzero(frame[column] == 0) for column in list_of_columns
    ]

In [53]:
# Creating lists of dataframe columns with values >0 in at least 11 subjects.

# 18 control subjects, at least 11 of them must be non-zero -> at most 7 zeros per column.
MAX_ZERO_SUBJECTS = 7


def connections_from_zero_counts(zero_counts, columns, max_zeros=MAX_ZERO_SUBJECTS):
    """Build the zero-count table for one matrix and pick the columns kept as connections.

    Parameters
    ----------
    zero_counts : list of int
        Number of zero values per column, ordered like ``columns``.
    columns : list
        Column labels the counts belong to.
    max_zeros : int, optional
        Largest number of zero values a column may have and still be kept.

    Returns
    -------
    (pandas.DataFrame, list)
        The table of counts (single column 'Nof0', indexed by ``columns``) and
        the list of kept column labels, without the 'id' column.
    """
    counts = pd.DataFrame(data=zero_counts, index=columns, columns=['Nof0'])
    kept = counts[counts.Nof0 <= max_zeros].index.values.tolist()
    # 'id' comes from the merge with the clinical data and is not a connection;
    # filtering (instead of list.remove) also tolerates inputs that have no 'id' column.
    kept = [label for label in kept if label != "id"]
    return counts, kept


fa_z, fa_con = connections_from_zero_counts(Dict_Value['fa_z'], list_of_columns)
print("FA - number of connections:", len(fa_con))
func_z, func_con = connections_from_zero_counts(Dict_Value['func_z'], list_of_columns)
print("FUNC - number of connections:", len(func_con))
gm_z, gm_con = connections_from_zero_counts(Dict_Value['gm_z'], list_of_columns)
print("GM - number of connections:", len(gm_con))

FA - number of connections: 1798
FUNC - number of connections: 2850
GM - number of connections: 2321


In [54]:
# Restrict the initial matrices (patients and controls: fa, func, gm) to the selected columns.
fa_network = fa.loc[:, fa_con]
func_network = func.loc[:, func_con]  # this dataframe remains unchanged
gm_network = gm.loc[:, gm_con]

# Report the resulting shapes in the order fa, func, gm.
for network in (fa_network, func_network, gm_network):
    print(network.shape)

(165, 1798)
(165, 2850)
(165, 2321)


# This is for further analysis

In [61]:
# Preview the first rows of the clinical table.
# NOTE(review): the loading cell reads only 'id' and 'controls_ms' — confirm that
# clinical_data was reloaded with all columns before this section is run.
clinical_data.head()

Unnamed: 0,id,redcap_event_name,controls_ms,dob,msonset,nrldate,age,sex,dd,mstype,...,oGMSSS,sdmt,sdmtz,pasat,zpasat,zverbalmemory,zvisualmemory,zattention,zfluency,zglobal
0,002MSVIS,year3_arm_1,1,1963-09-04,1993-01-10,2015-03-16,51.53,1,22.18,1,...,8.103,,,55.0,0.2857,,,,,
1,003MSVIS,year5_arm_1,1,1959-01-18,2007-07-15,2017-02-08,58.06,0,9.57,2,...,7.385,38.0,-0.777,25.0,-2.66,,,,,
2,004MSVIS,year5_arm_1,1,1956-09-16,2010-09-15,2017-06-29,60.78,1,6.79,0,...,4.966,61.0,1.625,58.0,0.7142,,,,,
3,005MSVIS,year5_arm_1,1,1978-02-01,2007-08-01,2016-01-13,37.95,0,8.45,0,...,1.922,37.0,-0.5384,,,,,,,
4,010MSVIS,year5_arm_1,1,1964-02-13,2007-09-15,2016-10-04,52.64,0,9.05,0,...,2.649,,,40.0,-0.5,,,,,


In [62]:
# Show the dtype of every column in the clinical table.
clinical_data.dtypes

id                           object
redcap_event_name            object
controls_ms                   int64
dob                  datetime64[ns]
msonset              datetime64[ns]
nrldate              datetime64[ns]
age                         float64
sex                           int64
dd                          float64
mstype                        int64
dmd                         float64
edss                        float64
uGMSSS                      float64
oGMSSS                      float64
sdmt                        float64
sdmtz                       float64
pasat                       float64
zpasat                      float64
zverbalmemory               float64
zvisualmemory               float64
zattention                  float64
zfluency                    float64
zglobal                     float64
dtype: object

In [63]:
# Summary statistics (count, mean, std, min, quartiles, max) of the clinical table.
clinical_data.describe()

Unnamed: 0,controls_ms,age,sex,dd,mstype,dmd,edss,uGMSSS,oGMSSS,sdmt,sdmtz,pasat,zpasat,zverbalmemory,zvisualmemory,zattention,zfluency,zglobal
count,165.0,165.0,165.0,165.0,165.0,146.0,165.0,147.0,147.0,122.0,122.0,136.0,125.0,64.0,64.0,62.0,64.0,62.0
mean,0.890909,46.161529,0.278788,14.110788,0.060606,2.527397,2.369697,3.031156,2.501483,47.97541,0.079029,44.617647,-0.289255,-0.815819,-0.495052,-0.285719,-0.657595,-0.554303
std,0.312702,10.602227,0.449768,10.051942,0.591577,3.167062,1.802689,2.008814,1.960155,13.351239,1.064729,12.49596,1.333481,1.296972,1.058694,1.038537,1.113514,0.828612
min,0.0,22.66,0.0,-1.0,-1.0,0.0,0.0,0.49,0.234,13.0,-4.1667,8.0,-5.1429,-3.8056,-3.125,-3.2143,-3.5,-3.1612
25%,1.0,38.21,0.0,7.75,0.0,0.0,1.5,1.415,0.9395,38.0,-0.538475,38.75,-0.8571,-1.68945,-1.4,-0.868,-1.5,-0.975425
50%,1.0,46.33,0.0,12.4,0.0,1.0,2.0,2.53,1.922,49.0,0.198059,48.5,0.0,-0.6642,-0.6375,-0.11045,-0.4643,-0.4244
75%,1.0,53.84,1.0,19.6,0.0,4.75,3.0,4.395,3.488,56.0,0.638425,54.0,0.7,0.09025,0.3125,0.41405,0.14885,-0.057925
max,1.0,72.22,1.0,46.13,2.0,14.0,7.5,9.29,9.13,92.0,3.0,60.0,1.75,1.9103,1.55,1.9872,1.8333,0.9208


In [58]:
# Count the missing values in each column of the clinical table.
clinical_data.isna().sum()

id                     0
redcap_event_name     18
controls_ms            0
dob                    8
msonset               18
nrldate                8
age                    0
sex                    0
dd                     0
mstype                 0
dmd                   19
edss                   0
uGMSSS                18
oGMSSS                18
sdmt                  43
sdmtz                 43
pasat                 29
zpasat                40
zverbalmemory        101
zvisualmemory        101
zattention           103
zfluency             101
zglobal              103
dtype: int64

In [None]:
# Select a subset of the clinical columns.
# Column names follow the clinical_data.dtypes listing: the group flag is
# 'controls_ms' and the score column is 'oGMSSS'; the previous spellings
# 'controls' and 'oGMSS' do not exist in the table and raise a KeyError.
# NOTE(review): this needs clinical_data to hold the full clinical table,
# not only 'id' and 'controls_ms' — confirm how it is loaded before this cell.
clinical = clinical_data[['id', 'redcap_event_name', 'controls_ms', 'age', 'msonset', 'mstype', 'sex', 'dmd', 'edss', 'uGMSSS', 'oGMSSS', 'sdmt', 'sdmtz']]

In [52]:
# Attach the clinical columns to the fa matrix by matching its index to the 'id' column.
fa_n_clinical = pd.merge(fa, clinical_data, left_index=True, right_on='id')

In [54]:
# Compare the shape before and after the merge, then preview the merged frame.
print(fa.shape, fa_n_clinical.shape, sep="\n")
fa_n_clinical.head()

(165, 2850)
(165, 2873)


Unnamed: 0,ctx-lh-caudalanteriorcingulate/ctx-lh-caudalmiddlefrontal,ctx-lh-caudalanteriorcingulate/ctx-lh-cuneus,ctx-lh-caudalanteriorcingulate/ctx-lh-entorhinal,ctx-lh-caudalanteriorcingulate/ctx-lh-fusiform,ctx-lh-caudalanteriorcingulate/ctx-lh-inferiorparietal,ctx-lh-caudalanteriorcingulate/ctx-lh-inferiortemporal,ctx-lh-caudalanteriorcingulate/ctx-lh-isthmuscingulate,ctx-lh-caudalanteriorcingulate/ctx-lh-lateraloccipital,ctx-lh-caudalanteriorcingulate/ctx-lh-lateralorbitofrontal,ctx-lh-caudalanteriorcingulate/ctx-lh-lingual,...,oGMSSS,sdmt,sdmtz,pasat,zpasat,zverbalmemory,zvisualmemory,zattention,zfluency,zglobal
0,0.395567,0.0,0.0,0.0,0.382439,0.392177,0.490902,0.0,0.356278,0.0,...,8.103,,,55.0,0.2857,,,,,
1,0.330618,0.0,0.0,0.0,0.357768,0.346572,0.45775,0.329869,0.354733,0.0,...,7.385,38.0,-0.777,25.0,-2.66,,,,,
2,0.418653,0.0,0.0,0.0,0.408343,0.0,0.492659,0.419281,0.387314,0.0,...,4.966,61.0,1.625,58.0,0.7142,,,,,
3,0.378885,0.0,0.0,0.0,0.394806,0.420584,0.478323,0.368988,0.306003,0.0,...,1.922,37.0,-0.5384,,,,,,,
4,0.354844,0.0,0.0,0.0,0.390623,0.404013,0.4526,0.0,0.366199,0.0,...,2.649,,,40.0,-0.5,,,,,


0    002MSVIS
1    003MSVIS
2    004MSVIS
3    005MSVIS
4    010MSVIS
5    011MSVIS
Name: id, dtype: object