In [86]:
import pandas as pd
import numpy as np
import re
import warnings
import datetime as dt
# NOTE(review): this silences every warning for the rest of the notebook, including the
# pandas chained-assignment / SettingWithCopy warnings that assignments further down
# would otherwise raise.
warnings.filterwarnings('ignore')
# Directory prefix (with trailing slash) that is prepended to every file name read or written.
path="../../data/"

In [87]:
# Reference date components used when computing 'Age_Today'.
year, month, day = 2021, 1, 1

In [126]:
# Time-point labels (as strings); used as the default `tps` of createindepvars.
config = dict(time_points = ['1','2','3'])

# Condition labels; returndisease creates one 0/1 column (plus date/age columns) per entry.
conditions=['AD','PD','FTD','EPIL','DIAB','COVID','CERVASC','STROKE','CERVASCALL','CELIAC','SLEEP_DIS']
# Condition label -> pattern handed to Series.str.contains in returndisease.
# The patterns are unanchored regex alternations, so a code matches anywhere in the cell
# (e.g. "F00" already matches "F000"..."F009").
icd10dis={
    'AD': "F00|F000|F001|F002|F009|G30|G300|G301|G308|G309",
    'PD': "G20",
    'FTD':"G31",
    'EPIL':"G40",
    'DIAB':"E10|E11|E13|E14",
    #'DIAB':"E08|E09|E10|E11|E13",
    'COVID':"U07",
    'CERVASC':"I67", #show I63-I69
    'STROKE':"I63", #show I63-I69
    'CERVASCALL':"I60|I61|I62|I63|I64|I65|I66|I67|I68|I69",
    'CELIAC':"K90",
    'SLEEP_DIS':'G47'
}

# NOTE(review): icd9dis is not referenced by any cell visible here. Most entries repeat the
# icd10dis patterns, and its keys do not line up with `conditions` ('REM_DIS' instead of
# 'SLEEP_DIS', no 'STROKE'/'CERVASCALL') - confirm whether it is still needed.
icd9dis={
    'AD': "F00|F000|F001|F002|F009|G30|G300|G301|G308|G309",
    'PD': "G20",
    'FTD':"G31",
    'EPIL':"G40",
    'DIAB':"E11",
    'COVID':"U07",
    'CERVASC':"I67",
    'CELIAC':"579",
    'REM_DIS':'G47.52'
}

In [124]:
def findcols(df,text):
    '''
    Helper function to find out columns containing specific text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are searched.
    text : str
        Substring to look for (plain substring test, not a regex).

    Returns
    -------
    list
        Column labels containing ``text``, in column order. Labels that are not
        strings (e.g. integer labels) are skipped instead of raising TypeError.
    '''
    return [col for col in df.columns if isinstance(col,str) and text in col]

In [125]:
def read_data(file,path=path,chunksize=10000):
    """Read the comma-separated file ``path + file`` chunk by chunk into one DataFrame.

    file      : file name appended to ``path``.
    path      : directory prefix; defaults to the notebook-level ``path``.
    chunksize : number of rows per chunk handed to ``pd.read_csv``.

    Returns the concatenation of all chunks with a fresh 0..n-1 index.
    """
    csv_location = '%s%s' % (path,file)
    chunk_reader = pd.read_csv(csv_location, sep=',', chunksize=chunksize, iterator=True)
    return pd.concat(chunk_reader, ignore_index=True)

#create independent variable set
def createindepvars(df,tps=config['time_points']):
    """Stack wide ``<name>_<timepoint>_0`` columns into one block of rows per time point.

    Only columns whose name ends in ``_0`` are considered. For such a column the
    text between the last two underscores is taken as its time point and everything
    before the second-to-last underscore as the new column name.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame; must contain an ``eid`` column.
    tps : iterable of str/int, or a single str
        Time points to extract. A bare string such as ``'0'`` is treated as ONE
        time point (it used to be iterated character by character, which was only
        correct for single-character labels).

    Returns
    -------
    pandas.DataFrame
        One block of rows per requested time point, stacked along axis 0 (row index
        labels of ``df`` repeat once per time point), with the stripped column names
        plus ``eid`` and ``time_point`` (stored as str). The shape is printed.
    """
    if isinstance(tps,str):
        tps=[tps]

    # time point -> (source column names, stripped names), kept in column order
    by_timepoint={}
    for col in df.columns:
        if not isinstance(col,str) or not col.endswith('_0'):
            continue
        parts=col.rsplit('_',2)
        if len(parts)<3:
            # a single underscore leaves no time-point segment to read
            continue
        stem,tp,_=parts
        sources,stems=by_timepoint.setdefault(tp,([],[]))
        sources.append(col)
        stems.append(stem)

    frames=[]
    for tp in tps:
        tp=str(tp)
        sources,stems=by_timepoint.get(tp,([],[]))
        # selecting with a list yields a new frame, so the edits below never touch df
        dfa=df[sources]
        dfa.columns=stems
        dfa['eid']=df['eid']
        dfa['time_point']=tp
        frames.append(dfa)

    # concatenate once instead of growing the result inside the loop
    df_out=pd.concat(frames,axis=0) if frames else pd.DataFrame([])
    print(df_out.shape)
    return df_out

def returndata(df,tp,minvals=50000):
    """Return the rows of one time point, keeping only well-populated columns.

    df      : frame with a 'time_point' column.
    tp      : value that 'time_point' must equal.
    minvals : a column is kept only if it has MORE than this many non-null
              values among the selected rows.
    """
    rows_at_tp = df[df['time_point'] == tp]
    populated = []
    for name in rows_at_tp.columns:
        if rows_at_tp[name].count() > minvals:
            populated.append(name)
    return rows_at_tp[populated]

def returndisease(df,conditions=conditions,icd10dis=icd10dis):
    """Derive per-condition diagnosis labels, dates and ages from wide diagnosis columns.

    Columns whose name contains '41270' are searched for the condition's pattern and
    columns whose name contains '41280' supply the date for the matching slot; a code
    column and a date column are paired by the text after their last underscore.
    (These look like UK Biobank field ids for ICD-10 diagnoses and their dates -
    TODO confirm.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'n_eid', 'date_of_attending_assessment_centre_f53' and
        'age_when_attended_assessment_centre_f21003'. NOTE: df is modified in place;
        the label columns and a '<condition>_num' working column are added to it.
    conditions : sequence of str
        Condition labels to derive.
    icd10dis : dict
        Condition label -> regex pattern passed to ``Series.str.contains``.

    Returns
    -------
    pandas.DataFrame
        'n_eid', the two assessment columns and, per condition:
        '<c>' (0/1), 'Diag_Date_<c>' (datetime64, NaT when absent or unparseable),
        'Age_Diag_<c>' (0 when there is no diagnosis date) and 'years_bef_diag<c>'
        (years from assessment to diagnosis, NaN when there is no diagnosis date).

    Notes
    -----
    Earlier versions filled these columns with chained assignments
    (``df[col][mask] = value``), which do not write back to ``df`` when pandas
    Copy-on-Write is active; every column is now built separately and assigned once.
    When several code columns match, the last one in column order supplies the date.
    """
    assess_date='date_of_attending_assessment_centre_f53'
    assess_age='age_when_attended_assessment_centre_f21003'

    conditions=list(conditions)
    conditions_diag_date=['Diag_Date_'+disease for disease in conditions]
    conditions_age_diag=['Age_Diag_'+disease for disease in conditions]
    conditions_years_bef=['years_bef_diag'+disease for disease in conditions]

    def _slot(col):
        # 'num' + text after the last underscore; pairs a code column with its date column
        return 'num'+col.rsplit('_',1)[-1]

    # Resolve the relevant columns once, before any new column is added to df.
    code_cols=[(s,_slot(s)) for s in df.columns
               if isinstance(s,str) and s!="n_eid" and '41270' in s]
    date_cols=[(s,_slot(s)) for s in df.columns
               if isinstance(s,str) and s!="n_eid" and '41280' in s]

    n_rows=len(df)
    for disease,disease_diag_date,disease_diag_age,disease_years_bef in zip(
            conditions,conditions_diag_date,conditions_age_diag,conditions_years_bef):
        pattern=icd10dis[disease]

        has_disease=np.zeros(n_rows,dtype=bool)
        slot=np.full(n_rows,'none',dtype=object)
        for s,label in code_cols:
            # na=False: missing cells count as "no match"
            hit=df[s].str.contains(pattern,na=False).to_numpy(dtype=bool)
            has_disease|=hit
            slot[hit]=label

        diag_text=np.full(n_rows,None,dtype=object)
        for s,label in date_cols:
            sel=slot==label
            if sel.any():
                # characters 2..11 of the stringified cell hold the date
                # (presumably the raw values look like b'YYYY-MM-DD' - TODO confirm)
                diag_text[sel]=df.loc[sel,s].astype(str).str[2:12].to_numpy()
        # unparseable text becomes NaT instead of aborting the whole computation
        diag_dates=pd.to_datetime(pd.Series(diag_text,index=df.index),errors='coerce')

        df[disease]=has_disease.astype('int64')
        df[disease_diag_date]=diag_dates
        if date_cols:
            years=(diag_dates-pd.to_datetime(df[assess_date],errors='coerce')).dt.days/365
            df[disease_diag_age]=(df[assess_age]+years).where(diag_dates.notna(),0)
            df[disease_years_bef]=years
        else:
            # no date columns at all: keep the zero placeholders
            df[disease_diag_age]=0
            df[disease_years_bef]=0
        df[disease+'_num']=slot

    out_cols=(['n_eid',assess_date,assess_age]
              +conditions+conditions_diag_date+conditions_age_diag+conditions_years_bef)
    df_out=df[out_cols]

    return df_out

In [5]:
# Load the raw tables; file names are resolved against `path` inside read_data.
# (ukb_genotype.csv used to be read twice in this cell; once is enough.)
genotype=read_data("ukb_genotype.csv")
ukb=read_data("all_fields_251020.csv")
customfields=read_data("ukb_customfields.csv")
death_codes=read_data("hm_cod_update_02sep2020.csv")
inpatient_update=read_data("inpatient_update.csv")

In [6]:
# Attach the genotype table to the custom fields (left join on eid), then keep five columns.
customfields=customfields.merge(genotype,on='eid',how='left').loc[
    :,['eid','parental_ad_status','APOE4_Carriers','APOE4_assigned','Genotype']
]

In [7]:
# Keep id and death date, rename them to eid/death_date and add a constant death=1 flag.
death_codes=(
    death_codes[['n_eid','s_40000_0_0']]
    .rename(columns={'n_eid':'eid','s_40000_0_0':'death_date'})
    .assign(death=1)
)

In [96]:
%%time
ukb_tp0=createindepvars(ukb,tps='0')

(502523, 1516)
CPU times: user 4min 3s, sys: 12min 30s, total: 16min 33s
Wall time: 24min 31s


In [97]:
ukb_tp=createindepvars(ukb,tps=config['time_points'])
# Keep the id, time-point and brain columns for the rows of time point '2'.
# The row filter is taken from ukb_tp itself: the previous version read
# ukb_tp_brain['time_point'] while ukb_tp_brain was still being defined, which fails
# on a fresh kernel. The mask is applied positionally because createindepvars stacks
# the time points without resetting the index, so row labels repeat.
brain_cols=[col for col in ukb_tp if 'brain' in col or 'eid' in col or 'time_point' in col]
ukb_tp_brain=ukb_tp.loc[(ukb_tp['time_point']=='2').to_numpy(),brain_cols]
#ukb_tp_brain.to_pickle('%s%s' % (path,'ukb_brain.p'))

(1507569, 1847)


In [98]:
# Left-join the death table onto the time-point-0 frame by eid.
ukb_tp0=ukb_tp0.merge(death_codes,on='eid',how='left')

In [14]:
# Age and date at the assessment visit, joined onto the inpatient table (eid <-> n_eid).
ukb_tp0_assess=ukb_tp0.loc[
    :,['eid','age_when_attended_assessment_centre_f21003','date_of_attending_assessment_centre_f53']
]
inpatient_update=ukb_tp0_assess.merge(inpatient_update,left_on='eid',right_on='n_eid',how='left')

In [100]:
%%time
ukb_temp=pd.merge(ukb_tp0[['eid']],customfields,on='eid',how='left')

CPU times: user 234 ms, sys: 57.3 ms, total: 291 ms
Wall time: 479 ms


In [101]:
# Copy the custom columns over; assignment aligns ukb_temp to ukb_tp0 on the row index.
for _custom_col in ('parental_ad_status','APOE4_Carriers','Genotype'):
    ukb_tp0[_custom_col]=ukb_temp[_custom_col]

In [34]:
# Age on the reference date = age at assessment + years elapsed since the assessment date
# (days / 365).
ukb_tp0['Age_Today']=(
    ukb_tp0['age_when_attended_assessment_centre_f21003']
    +(dt.datetime(year, month, day)
      -pd.to_datetime(ukb_tp0['date_of_attending_assessment_centre_f53'])).dt.days/365
)

In [35]:
# Cache the time-point-0 frame on disk.
pd.to_pickle(ukb_tp0,'%s%s' % (path,'ukb_tp0.p'))

In [8]:
# Reload the frame from the pickle written by the cell above.
# Pickle files execute code on load - only read files you produced yourself.
ukb_tp0=pd.read_pickle('%s%s' % (path,'ukb_tp0.p'))

In [9]:
# Work on an independent copy so the new column is written to this frame only.
ukb_tp0_geno=ukb_tp0[['eid','Genotype']].copy()
# NOTE(review): only 'e3/e4' and 'e4/e4' count as APOE4 here; other genotypes containing
# e4 (e.g. 'e2/e4') are left at 0 - confirm this is intended.
APOE_mask=(ukb_tp0_geno['Genotype']=="e3/e4")|(ukb_tp0_geno['Genotype']=="e4/e4")
# Direct column assignment instead of ukb_tp0_geno['APOE4s'][APOE_mask]=1, which is a
# chained assignment and leaves the column at 0 when pandas Copy-on-Write is active.
ukb_tp0_geno['APOE4s']=APOE_mask.astype('int64')

In [127]:
%%time
# Build the per-condition labels for every row of the inpatient table.
# returndisease also adds its label/working columns to inpatient_update itself.
diseases_bin=returndisease(inpatient_update)

CPU times: user 38min 22s, sys: 3min 13s, total: 41min 36s
Wall time: 40min 56s


In [109]:
%%time
diseases_bin_check=returndisease(inpatient_update.sample(20000))

CPU times: user 2min 36s, sys: 12.1 s, total: 2min 48s
Wall time: 2min 40s


In [111]:
# NOTE(review): 'REM_DIS' appears only as a key of icd9dis; it is not in `conditions` as
# defined above, so returndisease does not create this column and this lookup raises
# KeyError on a fresh run. Confirm which label is meant (e.g. 'SLEEP_DIS').
diseases_bin_check['REM_DIS'].sum()

0

In [16]:
# Left-join the death table onto the disease labels (n_eid <-> eid).
diseases_bin=diseases_bin.merge(death_codes,left_on='n_eid',right_on='eid',how='left')

In [17]:
# Add the APOE4 flag (pd.merge default join: only rows present in both tables are kept).
diseases_bin=pd.merge(diseases_bin,ukb_tp0_geno[['eid','APOE4s']],left_on='n_eid',right_on='eid')
APOEADmask=(diseases_bin['AD']==1)&(diseases_bin['APOE4s']==1)
# 1 where the row has both AD and the APOE4 flag. Assigned in one step instead of
# diseases_bin['APOE_AD'][APOEADmask]=1, a chained assignment that has no effect when
# pandas Copy-on-Write is active.
diseases_bin['APOE_AD']=APOEADmask.astype('int64')
diseases_bin['eid']=diseases_bin['n_eid']

In [18]:
def dis_before(var,days=100,df=diseases_bin):
    """Add 'timebeforeAD<var>' and '<var>_bef' columns to ``df`` and return it.

    Parameters
    ----------
    var : str
        Condition label; ``df`` must hold the columns ``var`` and 'Diag_Date_'+var
        as well as 'AD' and 'Diag_Date_AD'.
    days : number
        Threshold applied to 'timebeforeAD<var>'.
    df : pandas.DataFrame
        Frame to update IN PLACE. The default is the ``diseases_bin`` object that
        existed when this function was defined, so pass ``df`` explicitly if
        ``diseases_bin`` has been rebound since.

    Columns written
    ---------------
    timebeforeAD<var> : days from the ``var`` diagnosis to the AD diagnosis for rows
        with both AD==1 and var==1; 0 for every other row.
    <var>_bef : 1 where var==1 and timebeforeAD<var> < days, else 0. Rows with
        ``var`` but without AD keep timebeforeAD<var> == 0 and are therefore flagged
        whenever days > 0.
    """
    gap_col='timebeforeAD'+var
    flag_col=var+'_bef'

    both=(df['AD']==1)&(df[var]==1)
    df[gap_col]=0
    # .loc writes into df itself; the former df[col][mask]=... was a chained assignment
    # that is a no-op when pandas Copy-on-Write is active.
    df.loc[both,gap_col]=\
    (pd.to_datetime(df.loc[both,'Diag_Date_AD'])-\
     pd.to_datetime(df.loc[both,'Diag_Date_'+var])).dt.days

    df[flag_col]=((df[gap_col]<days)&(df[var]==1)).astype('int64')
    return df

In [19]:
# Same 100-day threshold for every comorbidity, applied in the original order.
for _comorbidity in ['CERVASCALL','STROKE','EPIL','DIAB','CELIAC','CERVASC']:
    diseases_bin=dis_before(_comorbidity,100,df=diseases_bin)

In [36]:
# Age on the reference date = age at assessment + years since the assessment (days / 365).
# Uses the year/month/day configuration (2021-01-01) like the Age_Today computed for
# ukb_tp0, instead of repeating the date as a literal.
diseases_bin['Age_Today']=diseases_bin['age_when_attended_assessment_centre_f21003']+\
(dt.datetime(year, month, day)-pd.to_datetime(diseases_bin['date_of_attending_assessment_centre_f53'])).dt.days/365

In [43]:
# Keep rows without AD, or whose AD diagnosis is at least 2 years (days / 365) after
# the assessment date.
mask_AD_date=(
    (pd.to_datetime(diseases_bin['Diag_Date_AD'])
     -pd.to_datetime(diseases_bin['date_of_attending_assessment_centre_f53'])).dt.days.div(365).ge(2)
    |diseases_bin['AD'].eq(0)
)
diseases_bin=diseases_bin.loc[mask_AD_date]

In [37]:
# Write the disease label table to disk.
pd.to_pickle(diseases_bin,'%s%s' % (path,'ukb_disease_labels.p'))

In [52]:
# Number of rows labelled AD (the column holds 0/1, so the sum is a count).
diseases_bin['AD'].sum()

2147

In [58]:
# AD count among rows whose Age_Today is at least 75.
diseases_bin.loc[diseases_bin['Age_Today']>=75,'AD'].sum()

1567