In [15]:
import os, sys
from glob import glob
from os.path import join

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=False)
import json
from tqdm.notebook import tqdm
from ukbb2020_dataloader import UKBB2020

sys.path.insert(0, "../../helper/")
from plotGraphs import *

In [16]:
# Instantiate the UKBB2020 dataloader; on construction it prints the total
# number of subjects held in its dataframe.
dataset = UKBB2020()

total subjects in df: 40682


In [17]:
%%html
<!-- Pin rendered tables to the left edge of the output area (left margin forced to 0). -->
<style>
  table {margin-left: 0 !important;}
</style>

In [24]:
# Load the raw UK Biobank fields for the 'holdout' split. Field IDs are kept
# as column names (rename_cols=False). Each field is requested exactly once:
# '54-2.0' used to appear twice in this list.
df = dataset.get_metadata(predefined=[], cols = ["31-0.0", #sex
                                                 '54-2.0', #assessment centre
                                                 '21003-2.0', #age at assessment
                                                 '26521-2.0', #estimated total intracranial volume (used as total brain volume)
                                                 "20021-2.0",#SRT estimate right ear
                                                 "1558-2.0",#Alc int freq
                                                 "25061-2.0", #Mean FA in fornix on FA skeleton
                                                ],split='holdout', rename_cols=False)

loaded following columns:
 {'31-0.0': 'Sex (s0.0)', '54-2.0': 'UK Biobank assessment centre', '21003-2.0': 'Age when attended assessment centre', '26521-2.0': 'Volume of EstimatedTotalIntraCranial (whole brain)', '20021-2.0': 'Speech-reception-threshold (SRT) estimate (right)', '1558-2.0': 'Alcohol intake frequency.', '25061-2.0': 'Mean FA in fornix on FA skeleton'}
skipping 30593/40682 subjects not belonging to holdout split


In [25]:
# Count subjects per assessment-centre code ('54-2.0') in the loaded holdout subset.
df['54-2.0'].value_counts()

11027.0    10089
Name: 54-2.0, dtype: int64

In [33]:
# Derive a boolean mood-disorder flag from the ICD metadata: a subject is
# flagged when any of their ICD columns, read as text, contains 'F3'.
# NOTE(review): this is a plain substring match over every column returned for
# predefined=['icd'] - confirm that no unrelated value can contain 'F3'.
dficd = dataset.get_metadata(predefined=['icd'], cols=[], print_cols=False, split='all', rename_cols=False)
icd_as_text = dficd.astype(str)
dfmooddis = icd_as_text.apply(lambda codes: codes.str.contains('F3')).any(axis=1)
# The assignment aligns on the index, so each row of df picks up the flag of
# the subject with the same index label.
df['mood_disorder'] = dfmooddis

In [34]:
# Store the flag as float (True -> 1.0, False -> 0.0) instead of bool.
df["mood_disorder"] = df["mood_disorder"].astype(float)

In [35]:
# Collapse the six alcohol-intake-frequency codes in '1558-2.0' into three
# bins and turn the '-3.0' (not answered) code into NaN.
# NOTE(review): UK Biobank data-coding 100402 for field 1558 lists
# 1 = daily or almost daily ... 6 = never, which is the reverse of the labels
# written next to the groups below - confirm which direction is intended.
# NOTE(review): the codes are matched as strings; if the column holds floats
# these replacements do nothing - confirm the column dtype.
# NOTE: the steps run in this order and overwrite the column, so re-running
# the cell on already-recoded values merges the bins further.
alc_recoding_steps = [
    (['1.0', '2.0'], '1.0'),   # author's label: never / special occasions -> rarely
    (['3.0', '4.0'], '2.0'),   # author's label: 1-3 times a month / 1-2 times a week -> occasional drinkers
    (['5.0', '6.0'], '3.0'),   # author's label: 3-4 times a week / daily -> frequent drinkers
    (['-3.0'], np.nan),        # not answered
]
for raw_codes, merged_code in alc_recoding_steps:
    df['1558-2.0'] = df['1558-2.0'].replace(raw_codes, merged_code)

In [36]:
# Add a categorical copy of the right-ear SRT estimate ('20021-2.0') for
# classification. With pd.cut's default right-closed intervals the classes are
# (-12, -7] -> '1.0', (-7, -3] -> '2.0', (-3, inf) -> '3.0'
# (good, medium and bad SRT threshold); values at or below -12.0 become NaN.
bins = [-12.0, -7.0, -3.0, np.inf]
categories = ['1.0', '2.0', '3.0']
df['SRT_right_ear_classification'] = pd.cut(df['20021-2.0'], bins, labels=categories)

In [37]:
# Swap the UK Biobank field IDs for readable variable names.
readable_names = {
    '31-0.0': 'Sex',
    '54-2.0': 'Site',
    '21003-2.0': 'Age',
    '26521-2.0': 'Total_brain_volume',
    '20021-2.0': 'SRT_right_ear',
    '1558-2.0': 'Alc_int_freq',
    '25061-2.0': 'Mean_FA_fornix',
}
df = df.rename(columns=readable_names)

In [39]:
# Register each prepared variable with the dataset as a label (typ='lbl'),
# in this order, with viz=False (presumably suppresses plotting - confirm).
leading_label_columns = [
    'Sex',
    'Age',
    'SRT_right_ear',
    'Alc_int_freq',
    'Mean_FA_fornix',
    'mood_disorder',
    'SRT_right_ear_classification',
    'Total_brain_volume',
]
for label_column in leading_label_columns:
    dataset.add_var_to_h5(df, label_column, typ='lbl', viz=False)
# The last call stays a bare expression so its return value is still rendered
# as the cell output.
dataset.add_var_to_h5(df, 'Site', typ='lbl', viz=False)

Unnamed: 0_level_0,sex,age,srt_right_ear,alc_int_freq,mean_fa_fornix,mood_disorder,srt_right_ear_classification,total_brain_volume,site
subjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1000014,0.0,61.0,-6.0,0.0,0.390485,0.0,1.0,1544240.0,0.0
1000023,1.0,66.0,-5.5,0.0,0.440859,0.0,1.0,1496460.0,0.0
1000030,0.0,65.0,-6.0,2.0,0.367080,0.0,1.0,1492910.0,0.0
1000041,,,,,,,,,
1000059,,,,,,,,,
...,...,...,...,...,...,...,...,...,...
5824877,1.0,81.0,4.0,1.0,0.320599,0.0,2.0,1730900.0,0.0
5838702,0.0,66.0,-8.0,0.0,,0.0,0.0,1386490.0,0.0
5846234,0.0,57.0,-3.5,2.0,,0.0,1.0,1277430.0,0.0
5895945,0.0,75.0,-4.5,0.0,,0.0,1.0,1411350.0,0.0


In [40]:
#dataset.df_h5 = dataset.df_h5.sample(40682)

In [41]:
# Prepare the inputs from the 'path_T1_MNI' column; the call prints how many
# subjects remain after those with NaN are dropped.
dataset.prepare_X(mri_col='path_T1_MNI')

n=25990 after dropping subjects with NaN


In [None]:
#%%time  ### NOTE: ran into a problem here - the file names are not set up properly because of the slugify step
# Write the prepared data to an h5 file using the prefix "5tasks35k".
# NOTE(review): z_factor=0.525 is forwarded through mri_kwargs; presumably a
# zoom/rescale factor applied to the MRI volumes - confirm in the dataloader.
dataset.save_h5(filename_prefix="5tasks35k", mri_kwargs={'z_factor':(0.525)})

In [None]:
def copy_h5_mooddisorder(H5_DIR, src_h5, out_h5, h5filesize_mood_dis, h5filesize_healthy,
                         seed=None, label_key='mood_disorder'):
    """Write a class-subsampled copy of an h5 file.

    Randomly draws, without replacement, ``h5filesize_mood_dis`` rows whose
    ``label_key`` value equals 1/True and ``h5filesize_healthy`` rows whose
    value equals 0/False. The selected rows of every dataset in the source
    file, plus all file-level attributes, are written to the output file.
    Rows whose label is neither 0 nor 1 (e.g. NaN) are never selected.

    Parameters
    ----------
    H5_DIR : str
        Prefix prepended (by string concatenation) to ``src_h5`` and
        ``out_h5``; include the trailing path separator.
    src_h5 : str
        File name of the source h5 file. The labels are read from this file.
    out_h5 : str
        File name of the h5 file to create (overwritten if it exists).
    h5filesize_mood_dis : int
        Number of rows to draw from the positive class.
    h5filesize_healthy : int
        Number of rows to draw from the negative class.
    seed : int or None, optional
        Seed for a private random generator. With ``None`` (default) the
        global ``np.random`` state is used, as before.
    label_key : str, optional
        Name of the dataset holding the binary class label.

    Raises
    ------
    ValueError
        If ``src_h5`` and ``out_h5`` are the same file, or if more rows are
        requested for a class than the source contains.
    """
    # h5py is not imported in the notebook's visible import cell; importing it
    # here keeps the function usable on a fresh kernel.
    import h5py

    if src_h5 == out_h5:
        # Opening the output with mode 'w' would truncate the source.
        raise ValueError("out_h5 must differ from src_h5")

    src_path = H5_DIR + src_h5
    out_path = H5_DIR + out_h5

    # Read the labels from the file that is actually being copied.
    with h5py.File(src_path, 'r') as data_src:
        labels = data_src[label_key][()]

    rng = np.random if seed is None else np.random.RandomState(seed)

    def _draw_rows(class_value, size):
        # Row positions of one class; flatnonzero also copes with (n, 1) label arrays.
        candidates = np.flatnonzero(labels == class_value)
        if size > len(candidates):
            raise ValueError(
                "requested {} rows with {}=={} but only {} are available".format(
                    size, label_key, class_value, len(candidates)))
        return rng.choice(candidates, size=size, replace=False)

    # Positive class is drawn first, then the negative class.
    index_mood_dis = _draw_rows(1, h5filesize_mood_dis)
    index_healthy = _draw_rows(0, h5filesize_healthy)
    # The combined index is sorted because it is passed to h5py as an index array.
    index = np.sort(np.concatenate((index_mood_dis, index_healthy)))

    print("writing to ..", out_h5)
    with h5py.File(src_path, 'r') as data_src, h5py.File(out_path, 'w') as data_out:
        for name in data_src:
            data_out.create_dataset(name, data=data_src[name][index])
        for attr in data_src.attrs:
            data_out.attrs[attr] = data_src.attrs[attr]

In [None]:
# Create a 250-subject subset of the full h5 file: 75 subjects with a mood
# disorder and 175 without. (The call used to be pasted twice on one line,
# which is a syntax error.)
copy_h5_mooddisorder("/ritter/share/projects/gonzalo/h5files/", "h5files5tasks35k.h5",  "h5files5tasks250_moodbalanced.h5", 75, 175)

In [None]:
# Inspect the contents of the subsampled h5 file with the helper from the
# ML_for_alcohol_misuse project.
subsampled_h5_files = ["/ritter/share/projects/gonzalo/h5files/h5files5tasks250_moodbalanced.h5"]
sys.path.insert(0,"../ML_for_alcohol_misuse/helper")
from dataloader_utils import show_h5_contents
show_h5_contents(subsampled_h5_files)