In [35]:
import mat73
import pandas as pd
import numpy as np
import scipy.io
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

### Data loading and preprocessing

In [2]:
import os

DATA_DIR = ''
if 'google.colab' not in str(get_ipython()) and "anuja" in os.environ.get('USER'):
    DATA_DIR = 'data/'
    

In [3]:
foof = pd.read_csv(DATA_DIR+"foof2features.csv")
foof = foof.rename(columns={"C1": "IDs" ,"C2": "Intercept", "C3": "Slope"})
foof

Unnamed: 0,IDs,Intercept,Slope
0,NDARAA075AMK,0.986272,1.825774
1,NDARAA112DMH,1.486650,1.888544
2,NDARAA117NEJ,1.593155,2.095749
3,NDARAA947ZG5,0.703331,1.724831
4,NDARAA948VFH,0.918020,1.749441
...,...,...,...
2037,NDARZN277NR6,1.351549,1.996940
2038,NDARZN578YDP,1.380795,2.036327
2039,NDARZN610GTY,0.339229,1.050644
2040,NDARZN677EYE,0.781225,1.470061


In [33]:
data = scipy.io.loadmat(DATA_DIR+'x.mat')  
df = pd.DataFrame(data['x'].reshape((data['x'].shape[0], -1)))

# sparsing
df = np.array(df).reshape(data['x'].shape)
df_sparsed = np.concatenate([np.expand_dims(df[:,:,i:i+2].mean(axis = 2), axis = 2) for i in range(0, data['x'].shape[2]-2, 2)], axis = 2)
df = pd.DataFrame(df_sparsed.reshape((df_sparsed.shape[0], -1)))

columns = np.asarray([['Electrode %d - %d/2 Hz'%(i+1, j+1)] for i in range(df_sparsed.shape[1]) for j in range(df_sparsed.shape[2])])
df.columns = columns
df['IDs'] = foof['IDs']
df

Unnamed: 0,"(Electrode 1 - 1/2 Hz,)","(Electrode 1 - 2/2 Hz,)","(Electrode 1 - 3/2 Hz,)","(Electrode 1 - 4/2 Hz,)","(Electrode 1 - 5/2 Hz,)","(Electrode 1 - 6/2 Hz,)","(Electrode 1 - 7/2 Hz,)","(Electrode 1 - 8/2 Hz,)","(Electrode 1 - 9/2 Hz,)","(Electrode 1 - 10/2 Hz,)",...,"(Electrode 105 - 31/2 Hz,)","(Electrode 105 - 32/2 Hz,)","(Electrode 105 - 33/2 Hz,)","(Electrode 105 - 34/2 Hz,)","(Electrode 105 - 35/2 Hz,)","(Electrode 105 - 36/2 Hz,)","(Electrode 105 - 37/2 Hz,)","(Electrode 105 - 38/2 Hz,)","(Electrode 105 - 39/2 Hz,)",IDs
0,7.946821e-11,1.850668e-08,0.000004,0.000712,0.027596,0.163782,0.309340,0.670886,1.008531,0.762531,...,0.000005,0.000001,2.964619e-07,6.217296e-08,1.195363e-08,2.106983e-09,3.404723e-10,5.043821e-11,6.849965e-12,NDARAA075AMK
1,1.009652e-01,2.497600e-01,0.045617,0.144905,0.146948,0.183639,0.164381,0.189891,0.084780,0.008961,...,0.022845,0.007058,1.609008e-03,2.734379e-04,3.568399e-05,3.846004e-06,3.925487e-07,4.327284e-08,5.101469e-09,NDARAA112DMH
2,2.255039e-05,2.662278e-03,0.057796,0.245632,0.236164,0.187468,0.302816,0.477905,0.895206,0.971402,...,0.000224,0.000059,1.399566e-05,2.947257e-06,5.540105e-07,9.295798e-08,1.392249e-08,1.861238e-09,2.220925e-10,NDARAA117NEJ
3,1.856998e-01,2.340013e-01,0.232717,0.200216,0.205096,0.066722,0.021296,0.006475,0.006776,0.116951,...,0.114594,0.112846,1.080848e-01,1.006935e-01,9.124210e-02,8.041673e-02,6.893738e-02,5.748047e-02,4.661686e-02,NDARAA947ZG5
4,8.178513e-05,7.337333e-03,0.102981,0.246312,0.128874,0.118656,0.271443,0.413808,0.396238,0.238277,...,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,NDARAA948VFH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2037,1.510286e-04,1.856862e-02,0.119712,0.054523,0.015271,0.087643,0.250952,0.430391,0.708924,0.872950,...,0.067446,0.044021,2.744773e-02,1.634931e-02,9.303291e-03,5.057293e-03,2.626296e-03,1.302906e-03,6.174846e-04,NDARZN277NR6
2038,9.085073e-03,1.375604e-02,0.020259,0.029020,0.040476,0.056668,0.104675,0.318760,0.756034,0.882062,...,0.012155,0.007197,4.020544e-03,2.116391e-03,1.049036e-03,4.894123e-04,2.148476e-04,8.873293e-05,3.447416e-05,NDARZN578YDP
2039,7.144546e-03,2.428089e-02,0.085359,0.193528,0.252609,0.223340,0.267940,0.491500,0.702337,0.653069,...,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,NDARZN610GTY
2040,4.204130e-03,1.740749e-02,0.058676,0.161022,0.359780,0.654564,0.969781,1.170376,1.151998,0.930774,...,0.042771,0.019556,7.997873e-03,2.925627e-03,9.571861e-04,2.800902e-04,7.330213e-05,1.715719e-05,3.591535e-06,NDARZN677EYE


In [44]:
data = mat73.loadmat(DATA_DIR+'x_source.mat')  
df2 = pd.DataFrame(data['x'].reshape((data['x'].shape[0], -1)))

# sparsing
df2 = np.array(df2).reshape(data['x'].shape) 
df2_sparsed = np.concatenate([np.expand_dims(df2[:,:,i:i+10].mean(axis = 2), axis = 2) for i in range(0, data['x'].shape[2]-10, 10)], axis = 2)
df2 = pd.DataFrame(df2_sparsed.reshape((df2_sparsed.shape[0], -1)))

df2['IDs'] = foof['IDs']
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2643,2644,2645,2646,2647,2648,2649,2650,2651,IDs
0,5.478741e-01,5.149596e-01,3.400973e-01,4.454973e-01,0.349062,0.697324,0.691272,1.045694,1.204504,1.074292,...,0.467164,0.481904,0.481258,0.463432,0.429230,0.385794,0.521068,0.271498,0.209002,NDARAA075AMK
1,1.169312e+00,1.016107e+00,5.849121e-01,8.296416e-01,0.531368,0.432419,0.424141,0.333437,0.335283,0.328013,...,0.596011,0.324816,0.354518,0.166953,0.613057,0.565442,0.370370,0.390634,0.522705,NDARAA112DMH
2,3.453342e-01,4.000215e-01,4.507422e-01,4.940988e-01,0.527302,0.551810,0.586962,0.671988,0.990649,0.753007,...,0.559853,0.372586,0.556390,0.432241,0.215436,0.206292,0.415330,0.750851,0.635401,NDARAA117NEJ
3,9.941514e-02,1.352644e-01,1.910004e-01,9.016700e-01,0.557829,0.504297,0.400133,0.453346,0.531803,0.668781,...,0.149746,0.054519,0.037378,0.387176,0.117666,0.010380,0.006441,0.288770,0.395463,NDARAA947ZG5
4,4.320588e-01,2.533101e-01,5.588155e-01,5.558113e-01,0.352180,0.392321,0.425225,0.448403,0.460220,0.460162,...,0.227232,0.177278,0.135861,0.101274,0.073429,0.051784,0.035522,0.023700,0.015381,NDARAA948VFH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2036,5.317443e-01,5.369550e-01,6.276645e-01,4.888022e-01,0.404472,0.522272,0.805762,0.687398,0.537357,0.409353,...,0.372143,0.359792,0.360504,0.371701,0.389350,0.408550,0.424263,0.432074,0.428843,NDARZN148PMN
2037,1.680173e-01,2.296686e-01,3.020971e-01,3.823951e-01,0.465840,0.546239,0.616664,0.670505,0.702613,0.710307,...,0.625804,0.263094,0.138545,0.106510,0.306391,0.446338,0.321081,0.543881,0.009794,NDARZN277NR6
2038,4.388883e-01,4.310267e-01,4.137126e-01,3.893806e-01,0.361110,0.332213,0.305792,0.463370,0.775685,0.588367,...,0.535796,0.596240,0.650161,0.692566,0.719370,0.727835,0.716863,0.687083,0.640713,NDARZN578YDP
2039,5.107026e-15,3.788081e-14,2.666312e-13,1.770273e-12,0.000010,0.180625,1.032138,1.204632,0.937709,0.456522,...,0.296184,0.252212,0.208898,0.168293,0.131875,0.100513,0.074515,0.053732,0.037686,NDARZN610GTY


In [45]:
beh = pd.read_csv(DATA_DIR+"behaviorals.csv")
print('Before:'+str(beh.shape))

most_common_disorders = ['Attention-Deficit/Hyperactivity Disorder', 'Anxiety Disorders', 'Specific Learning Disorder',
                         'Autism Spectrum Disorder', 'Disruptive', 'No Diagnosis Given', 'Communication Disorder',
                         'Depressive Disorders']

# most_common_disorders = ['Other Neurodevelopmental Disorders', 'ADHD-Inattentive Type', 'ADHD-Combined Type', 'Anxiety Disorders', 'No Diagnosis Given', 'Depressive Disorders']

category_columns = ['DX_' + str(i).zfill(2) + '_Cat' for i in range(1, 11)] +\
                   ['DX_' + str(i).zfill(2) + '_Sub' for i in range(1, 11)]

# find users that have no diagnosis within these top diseases
# filtering should cahnge anything as this should also happen at a later stage
mask = None
for col in category_columns:
    mask_col = beh[col].isin(most_common_disorders)
    if mask is None:
        mask = mask_col
    else:
        mask = mask | mask_col

initial_size = beh.shape[0]
beh = beh[mask]
beh = beh.reset_index(drop=True)
new_size = beh.shape[0]
print('After:'+str(beh.shape))
print('Removing', initial_size - new_size,
      'patients as their diagnoses were very uncommon.')

Before:(3076, 177)
After:(2813, 177)
Removing 263 patients as their diagnoses were very uncommon.


In [46]:
no_diagnosis_given = 'No Diagnosis Given'

if no_diagnosis_given in most_common_disorders:
    no_diag_index = most_common_disorders.index(no_diagnosis_given)
    most_common_disorders = most_common_disorders[:no_diag_index] + \
        most_common_disorders[no_diag_index + 1:]

diagnoses_to_ids = {disorder: i for i, disorder in enumerate(most_common_disorders)}
diagnoses_to_ids

{'Attention-Deficit/Hyperactivity Disorder': 0,
 'Anxiety Disorders': 1,
 'Specific Learning Disorder': 2,
 'Autism Spectrum Disorder': 3,
 'Disruptive': 4,
 'Communication Disorder': 5,
 'Depressive Disorders': 6}

In [47]:
def get_disorder(data, row, index):
    disorder = data.iloc[row][category_columns[index]]

    if disorder == 'Neurodevelopmental Disorders':
        disorder = data.iloc[row][category_columns[index + 10]]

    return disorder

order_of_disorders = []
for k in range(beh.shape[0]):
    i = 0
    disorder = get_disorder(beh, k, i)
    disorders_patient = []
    while disorder != no_diagnosis_given and not pd.isnull(disorder):
        if disorder in diagnoses_to_ids:
            if diagnoses_to_ids[disorder] not in disorders_patient:
                disorders_patient.append(diagnoses_to_ids[disorder])
        i += 1
        if i == len(category_columns):
            break
        disorder = get_disorder(beh, k, i)

    order_of_disorders.append(disorders_patient)


In [48]:
max_len_order = np.max([len(x) for x in order_of_disorders])

# pad with a new token denoting the pad token
pad_token = len(most_common_disorders)
bod_token = len(most_common_disorders) + 1
eod_token = len(most_common_disorders) + 2

order_of_disorders = [[bod_token] + x + [eod_token] + [pad_token] * (max_len_order - len(x)) for x in order_of_disorders]

order_of_disorders = np.array(order_of_disorders)

classes = np.zeros((len(most_common_disorders),
                    beh.shape[0]), dtype=np.int32)

df_disorders = beh[category_columns]

for i, disorder in enumerate(most_common_disorders):
    mask = df_disorders.select_dtypes(include=[object]). \
        applymap(lambda x: disorder in x if pd.notnull(x) else False)

    disorder_df = df_disorders[mask.any(axis=1)]

    np.add.at(classes[i], disorder_df.index.values, 1)

behaviour_data_columns = beh.columns.values.astype(np.str)

columns_to_drop = behaviour_data_columns[
    np.flatnonzero(np.core.defchararray.find(behaviour_data_columns, 'DX') != -1)]

behaviour_data = beh.drop(columns=columns_to_drop)

for disorder, classification in zip(most_common_disorders, classes):
    behaviour_data[disorder] = classification

behaviour_data['order_diagnoses'] = list(order_of_disorders)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  behaviour_data_columns = beh.columns.values.astype(np.str)


In [49]:
labels=behaviour_data[["IDs"]+list(most_common_disorders)]
labels

Unnamed: 0,IDs,Attention-Deficit/Hyperactivity Disorder,Anxiety Disorders,Specific Learning Disorder,Autism Spectrum Disorder,Disruptive,Communication Disorder,Depressive Disorders
0,NDARAA075AMK,0,0,0,0,0,0,0
1,NDARAA112DMH,1,0,0,0,1,0,0
2,NDARAA117NEJ,1,0,0,0,1,0,0
3,NDARAA306NT2,1,1,1,0,0,1,0
4,NDARAA504CRN,1,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...
2808,NDARZZ007YMP,0,0,0,1,0,0,0
2809,NDARZZ740MLM,1,0,0,0,0,0,0
2810,NDARZZ810LVF,0,0,0,1,0,1,0
2811,NDARZZ830JM7,0,0,0,1,0,0,0
