In [1]:
import pandas as pd
import numpy as np
import re
import warnings
import datetime as dt
# Silences every warning for the rest of the session.
# NOTE(review): this also hides pandas' SettingWithCopy / chained-assignment
# warnings, which are relevant to the helper functions below - consider narrowing.
warnings.filterwarnings('ignore')
# Directory prefix for all input/output files. It is concatenated directly with
# the file name ('%s%s'), so the trailing slash is required.
path="../../../ukb/data/"

In [2]:
# Reference date (year, month, day) passed to dt.datetime(...) when the
# 'Age_Today' columns are computed further down.
year=2021
month=1
day=1

In [3]:

# Conditions processed by returndisease(); each name must be a key of icd10dis.
conditions = [
    "AD",
    "PD",
    "FTD",
    "EPIL",
    "DIAB",
    "COVID",
    "CERVASC",
    "STROKE",
    "CERVASCALL",
    "CELIAC",
    "SLEEP_DIS",
    "AUD",
]

# Regex alternations matched with Series.str.contains against the diagnosis
# code columns. Matching is by substring, so a three-character stem such as
# "F00" already matches its four-character sub-codes ("F000", "F001", ...).
icd10dis = {
    "AD": "F00|F000|F001|F002|F009|G30|G300|G301|G308|G309",
    "PD": "G20",
    "FTD": "G31",
    "EPIL": "G40",
    # Alternative definition considered earlier: "E08|E09|E10|E11|E13"
    "DIAB": "E10|E11|E13|E14",
    "COVID": "U07",
    "CERVASC": "I67",     # show I63-I69
    "STROKE": "I63",      # show I63-I69
    "CERVASCALL": "I60|I61|I62|I63|I64|I65|I66|I67|I68|I69",
    "CELIAC": "K90",
    "SLEEP_DIS": "G47",
    "AUD": "F101|F102",
}

# NOTE(review): most values below look like ICD-10 codes rather than ICD-9
# ones, 'REM_DIS' is not listed in `conditions`, and nothing in the visible
# part of the notebook reads this mapping - confirm before relying on it.
icd9dis = {
    "AD": "F00|F000|F001|F002|F009|G30|G300|G301|G308|G309",
    "PD": "G20",
    "FTD": "G31",
    "EPIL": "G40",
    "DIAB": "E11",
    "COVID": "U07",
    "CERVASC": "I67",
    "CELIAC": "579",
    "REM_DIS": "G47.52",
}

In [14]:
def read_data(file,path=path,chunksize=10000):
    """Load the comma-separated file ``path + file`` into one DataFrame.

    The file is parsed in pieces of ``chunksize`` rows, which are then stacked
    into a single frame with a fresh 0..n-1 index.

    Parameters
    ----------
    file : str
        File name, appended directly to ``path``.
    path : str
        Directory prefix; must end with a path separator.
    chunksize : int
        Number of rows parsed per piece.

    Returns
    -------
    pandas.DataFrame
    """
    location='%s%s' % (path,file)
    pieces=pd.read_csv(location,sep=',',chunksize=chunksize,iterator=True)
    return pd.concat(pieces,ignore_index=True)

def returndata(df,tp,minvals=50000):
    """Select the rows of one time point and drop sparsely populated columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'time_point' column.
    tp : scalar
        Value of 'time_point' to keep.
    minvals : int
        A column is kept only if, within the selected rows, it has strictly
        more than this many non-null values.

    Returns
    -------
    pandas.DataFrame
        The selected rows restricted to the sufficiently populated columns.
    """
    at_timepoint=df[df['time_point']==tp]
    populated=[name for name in at_timepoint.columns
               if at_timepoint[name].count()>minvals]
    return at_timepoint[populated]

def returndisease(df,conditions=conditions,icd10dis=icd10dis):
    """Flag each condition from the diagnosis-code columns and derive its timing.

    Columns whose name contains '41270' are searched with the regular
    expression ``icd10dis[disease]``; columns whose name contains '41280'
    supply the date of the matching entry. A code column and a date column are
    paired through the text after the last underscore of their names.
    (Presumably UK Biobank fields 41270/41280 - confirm against the data
    dictionary.)

    For every disease the following columns are written to ``df`` (the frame
    passed in is modified, as in the previous implementation):

    * ``<disease>``: 1 if any code column matches, else 0.
    * ``<disease>_num``: ``'num<suffix>'`` of the last matching code column,
      else ``'none'``.
    * ``Diag_Date_<disease>``: characters [2:12] of the paired date cell (text),
      or NaT when there is no match or the date cell is empty.
    * ``Age_Diag_<disease>``: age at assessment plus the years between
      assessment and diagnosis; 0 where there is no diagnosis date.
    * ``years_bef_diag<disease>``: (diagnosis date - assessment date) in
      days / 365; NaN where there is no diagnosis date.

    NOTE(review): when several code columns match, the last one scanned wins,
    so the reported date is not necessarily the earliest diagnosis - confirm
    this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'n_eid', 'date_of_attending_assessment_centre_f53',
        'age_when_attended_assessment_centre_f21003' and the code/date columns.
    conditions : sequence of str
        Disease labels to derive.
    icd10dis : dict
        Maps each label to the regex matched against the code columns.

    Returns
    -------
    pandas.DataFrame
        'n_eid', the two assessment columns, then the flag, date, age and
        offset columns of every condition (in that order).

    Raises
    ------
    KeyError
        If a required column or an ``icd10dis`` entry is missing.
    """
    assess_date_col='date_of_attending_assessment_centre_f53'
    assess_age_col='age_when_attended_assessment_centre_f21003'

    conditions=list(conditions)
    conditions_diag_date=['Diag_Date_'+disease for disease in conditions]
    conditions_age_diag=['Age_Diag_'+disease for disease in conditions]
    conditions_years_bef=['years_bef_diag'+disease for disease in conditions]

    # The source columns do not change while we add outputs, so find them once.
    code_cols=[c for c in df.columns if c!="n_eid" and '41270' in c]
    date_cols=[c for c in df.columns if c!="n_eid" and '41280' in c]
    # Unparseable assessment dates become NaT instead of silently disabling
    # the age/offset computation for every row.
    assess_date=pd.to_datetime(df[assess_date_col],errors='coerce')

    for disease,diag_date_col,diag_age_col,years_bef_col in zip(
            conditions,conditions_diag_date,conditions_age_diag,conditions_years_bef):
        pattern=icd10dis[disease]

        # Results are built as standalone Series and assigned as whole
        # columns: chained assignment (df[col][mask]=...) is a silent no-op
        # under pandas Copy-on-Write.
        has_disease=pd.Series(0,index=df.index)
        slot=pd.Series('none',index=df.index,dtype=object)
        for col in code_cols:
            codes=df[col]
            if codes.isna().all():
                # An entirely empty column cannot match (and may not be text).
                continue
            hit=codes.str.contains(pattern,na=False)
            has_disease=has_disease.mask(hit,1)
            slot=slot.mask(hit,'num'+col[col.rfind('_')+1:])

        diag_date=pd.Series(pd.NaT,index=df.index,dtype=object)
        for col in date_cols:
            # Only rows whose matching code sits in this slot AND that actually
            # have a date; an empty cell would otherwise yield the text 'n'.
            paired=(slot=='num'+col[col.rfind('_')+1:])&df[col].notna()
            if paired.any():
                diag_date=diag_date.mask(paired,df[col].astype(str).str.slice(2,12))

        if date_cols:
            years_from_assess=(pd.to_datetime(diag_date,errors='coerce')
                               -assess_date).dt.days/365
            age_at_diag=pd.Series(0.0,index=df.index).mask(
                diag_date.notna(),df[assess_age_col]+years_from_assess)
            years_bef=years_from_assess
        else:
            # Without any date column the original left both columns at 0.
            age_at_diag=0
            years_bef=0

        df[disease]=has_disease
        df[diag_date_col]=diag_date
        df[diag_age_col]=age_at_diag
        df[years_bef_col]=years_bef
        df[disease+'_num']=slot

    keep=(['n_eid',assess_date_col,assess_age_col]
          +conditions+conditions_diag_date+conditions_age_diag+conditions_years_bef)
    df_out=df[keep]

    return df_out

def dis_before(var,df,days=100,depvar='AD'):
    """Add the diagnosis gap between ``var`` and ``depvar`` and a derived flag.

    Two columns are written to ``df`` (which is modified and returned):

    * ``'timebefore'+depvar+var``: for rows where both ``depvar`` and ``var``
      equal 1, the number of days from the ``var`` diagnosis to the ``depvar``
      diagnosis (``Diag_Date_<depvar>`` minus ``Diag_Date_<var>``); 0 for all
      other rows.
    * ``var+'_bef'``: 1 where ``var`` equals 1 and the gap above is smaller
      than ``days``, else 0.

    NOTE(review): rows with ``var`` but without ``depvar`` have a gap of 0 and
    are therefore flagged whenever ``days`` > 0, while a ``var`` diagnosed more
    than ``days`` before ``depvar`` is NOT flagged. This reproduces the original
    comparison; confirm that the direction is intended.

    Parameters
    ----------
    var : str
        Label of the comorbidity (columns ``var`` and ``Diag_Date_<var>``).
    df : pandas.DataFrame
        Frame holding the flag and diagnosis-date columns of both conditions.
    days : int
        Threshold, in days, applied to the gap.
    depvar : str
        Label of the reference condition.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two columns added.
    """
    has_var=df[var]==1
    both=(df[depvar]==1)&has_var

    # Blank out rows outside `both` before parsing so that only the relevant
    # dates are converted, exactly as the original did.
    dep_date=pd.to_datetime(df['Diag_Date_'+depvar].where(both))
    var_date=pd.to_datetime(df['Diag_Date_'+var].where(both))
    gap=(dep_date-var_date).dt.days.where(both,0)
    if not gap.isna().any():
        # Keep integer day counts when every relevant date was available.
        gap=gap.astype('int64')

    # Whole-column assignment: chained assignment (df[col][mask]=...) does not
    # write through to df under pandas Copy-on-Write.
    df['timebefore'+depvar+var]=gap
    df[var+'_bef']=((gap<days)&has_var).astype('int64')
    return df

In [7]:
# Load the two CSV extracts found under `path`; read_data() parses each file in
# chunks and returns one DataFrame per file.
inpatient_update=read_data("inpatient_update.csv")
death_codes=read_data("hm_cod_update_02sep2020.csv")

In [8]:
# Load the pickled table 'ukb_tp0_.p' from `path`. Unpickling executes code
# embedded in the file, so only load pickles produced by a trusted pipeline.
ukb_tp0=pd.read_pickle('%s%s' % (path,'ukb_tp0_.p'))

In [9]:
# Keep the participant id and the 's_40000_0_0' column, rename them to
# 'eid' / 'death_date', and add a constant death=1 marker for every row.
# Note: the cell rebinds `death_codes`, so it cannot be re-run without
# reloading the CSV first.
death_codes=(
    death_codes[['n_eid','s_40000_0_0']]
    .rename(columns={'n_eid':'eid','s_40000_0_0':'death_date'})
    .assign(death=1)
)

In [10]:
# Assessment age/date per participant, left-joined with the in-patient rows:
# every row of ukb_tp0_assess is kept, and participants without a matching
# 'n_eid' get NaN in all in-patient columns (including 'n_eid' itself).
# Note: the cell rebinds `inpatient_update`, so re-running it merges twice.
ukb_tp0_assess=ukb_tp0.loc[:,['eid',
                              'age_when_attended_assessment_centre_f21003',
                              'date_of_attending_assessment_centre_f53']]
inpatient_update=ukb_tp0_assess.merge(
    inpatient_update,how='left',left_on='eid',right_on='n_eid')

In [11]:
# Age on the reference date = age at assessment + time elapsed since the
# assessment, with elapsed days converted to years using a fixed 365-day year.
reference_date=dt.datetime(year, month, day)
assessment_date=pd.to_datetime(ukb_tp0['date_of_attending_assessment_centre_f53'])
years_since_assessment=(reference_date-assessment_date).dt.days/365
ukb_tp0['Age_Today']=ukb_tp0['age_when_attended_assessment_centre_f21003']+years_since_assessment

In [12]:
# Derive the per-condition flag / diagnosis date / age / offset columns using
# the default `conditions` and `icd10dis`. returndisease() also adds its
# working columns to `inpatient_update` itself.
diseases_bin=returndisease(inpatient_update)

In [13]:
# Left-join the death records: every row of diseases_bin is kept and rows
# without a match get NaN in 'eid', 'death_date' and 'death'.
# NOTE(review): the join key 'n_eid' comes from the in-patient table after a
# left merge, so it is presumably NaN for participants without in-patient
# rows, who would then never match a death record - confirm.
diseases_bin=diseases_bin.merge(
    death_codes,how='left',left_on='n_eid',right_on='eid')

In [18]:
# Add the 'timebeforeAD<condition>' and '<condition>_bef' columns for each
# comorbidity, relative to the default reference condition 'AD' and with a
# 100-day threshold. The order matches the original sequence of calls.
for comorbidity in ('CERVASCALL','STROKE','EPIL','DIAB','CELIAC','CERVASC','AUD'):
    diseases_bin=dis_before(comorbidity,diseases_bin,100)

In [19]:
# Age on the reference date = age at assessment + time elapsed since the
# assessment, with elapsed days converted to years using a fixed 365-day year.
reference_date=dt.datetime(year, month, day)
assessment_date=pd.to_datetime(diseases_bin['date_of_attending_assessment_centre_f53'])
years_since_assessment=(reference_date-assessment_date).dt.days/365
diseases_bin['Age_Today']=diseases_bin['age_when_attended_assessment_centre_f21003']+years_since_assessment

In [20]:
# Persist the label table as 'ukb_disease_labels_.p' under `path`
# (an existing file of that name is overwritten).
diseases_bin.to_pickle('%s%s' % (path,'ukb_disease_labels_.p'))