# UKB -- ALL CODES

In [None]:
import pandas as pd
import numpy as np

In [None]:
# #Download controls
# !dx download -r 'data/controls'

# #Download ICD10 codes
# !dx download -r 'data/ICD10_dates/F51'
# !dx download -r 'data/ICD10_dates/G47'

# # #Download NDD cases
# !dx download -r 'data/NDD_cases'

# # #Download UKB related file
# !dx download 'Bulk/Genotype Results/Genotype calls/ukb_rel.dat'

In [None]:
#Select NDD
# This label is reused below to build the f'{ndd}_DATE' column name and the output file name.
ndd = 'ALS'

# Combine cases and controls

In [None]:
#Load cases for the NDD selected in `ndd`
# One case export per NDD. Looking the path up from `ndd` keeps the cases file
# in step with the f'{ndd}_DATE' column name used by the following cells; an
# unknown label fails immediately with a KeyError.
case_files = {
    'AD': 'NDD_cases/AD_cases_n4447_SEPT_2023.csv',
    'DEM': 'NDD_cases/DEM_cases_n10043_SEPT_2023.csv',
    'PD': 'NDD_cases/PD_cases_n4413_SEPT_2023.csv',
    'VAS': 'NDD_cases/VAS_cases_n2182_SEPT_2023.csv',
    'MS': 'NDD_cases/MS_cases_n2595_SEPT_2023.csv',
    'ALS': 'NDD_cases/ALS_cases_n753_SEPT_2023.csv',
}
cases = pd.read_csv(case_files[ndd])
cases

In [None]:
# Load the NDD-free controls
# (alternative control set: 'controls/NDD_free_controls_60.csv')
control_columns = [
    'ID', 'GENETIC_SEX', 'BIRTH_YEAR', 'TOWNSEND', 'ETHNICITY',
    'AGE_OF_RECRUIT', f'{ndd}_DATE', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5',
    'DATE_OF_DEATH',
]
controls = (
    pd.read_csv('controls/NDD_free_controls_45.csv')
    # Align the column names with the ones selected below
    .rename(columns={'eid': 'ID', 'date_of_death': 'DATE_OF_DEATH'})
    # Controls get an empty (NaN) diagnosis date
    .assign(**{f'{ndd}_DATE': np.nan})
)
controls = controls[control_columns]
controls

In [None]:
# Stack the cases on top of the controls
df = pd.concat([cases, controls])

# Keep only samples whose ETHNICITY is 'Caucasian'
df = df.loc[df['ETHNICITY'] == 'Caucasian']

# Drop rows with a missing TOWNSEND value
df = df.loc[df['TOWNSEND'].notna()]

# Duplicate check: any ID with a count above 1 appears more than once
print(df['ID'].value_counts())

df

In [None]:
# Rows without a diagnosis date are the controls; show their youngest recruitment age
controls = df.loc[df[f'{ndd}_DATE'].isna()]
controls['AGE_OF_RECRUIT'].min()

In [None]:
# Report, per covariate, how many values are missing (True) versus present (False)
for label, column in (
    ('Sex:', 'GENETIC_SEX'),
    ('Birth year:', 'BIRTH_YEAR'),
    ('AGE:', 'AGE_OF_RECRUIT'),
    ('TOWNSEND:', 'TOWNSEND'),
):
    print(label, df[column].isna().value_counts())

In [None]:
# Keep only the seven base columns (this drops ETHNICITY and the PCs)
keep_columns = [
    'ID', 'GENETIC_SEX', 'BIRTH_YEAR', 'TOWNSEND',
    'AGE_OF_RECRUIT', f'{ndd}_DATE', 'DATE_OF_DEATH',
]
df = df.loc[:, keep_columns]
df

In [None]:
# Count rows without a diagnosis date (True) and with one (False)
no_diagnosis_date = df[f'{ndd}_DATE'].isna()
no_diagnosis_date.value_counts()

# Add ICD10 codes

In [None]:
# ICD10 code groups to add; each entry is also the directory searched for '*_with_date.csv' files
icd10_list = ['F51', 'G47']

In [None]:
import glob

# Collect every '*_with_date.csv' file found under each ICD10 code directory
files = [
    path
    for code in icd10_list
    for path in glob.glob(f'{code}/*_with_date.csv')
]

print(len(files))
print(files)

In [None]:
# Left-join each ICD10 date table onto the cohort by ID, so every cohort row is kept
for path in files:
    icd_dates = pd.read_csv(path)
    df = df.merge(icd_dates, how='left', left_on='ID', right_on='ID')


In [None]:
# Display the cohort after the ICD10 date columns have been merged in
df

# Prep for COX

In [None]:
# Study window as date strings: dates before STUDY_START are discarded below,
# and STUDY_ENDS is the end point used for the tenure of living controls
STUDY_START, STUDY_ENDS = '1999-01-01', '2023-09-30'

In [None]:
# Keep one row per ID. Sorting by diagnosis date first places dated rows ahead
# of undated ones (NaN sorts last by default), so the earliest-dated row wins.
df = (
    df.sort_values(by=f'{ndd}_DATE')
      .drop_duplicates(subset='ID', keep='first')
)
print(len(df))
df.head()

In [None]:
# The ICD10 date columns are everything after the seven base columns
# (ID, GENETIC_SEX, BIRTH_YEAR, TOWNSEND, AGE_OF_RECRUIT, <ndd>_DATE, DATE_OF_DEATH)
df_new = df.iloc[:, 7:]
codes = df_new.columns.tolist()
print(len(codes))
print(codes)

In [None]:
# Blank out (set to NaN) any code date that falls before STUDY_START
study_start_ts = pd.to_datetime(STUDY_START)
for col in codes:
    before_start = pd.to_datetime(df[col]) < study_start_ts
    df[col] = np.where(before_start, np.nan, df[col])

In [None]:
# Recruitment year = birth year + age at recruitment
df['recruit_year'] = df['BIRTH_YEAR'].add(df['AGE_OF_RECRUIT'])

In [None]:
# Build `tenure`: follow-up time in years (days / 365), measured from 1 January
# of the recruitment year to the NDD date (cases), the date of death (deceased
# controls) or STUDY_ENDS (living controls).
# Each subset is an explicit .copy() so the new column is written to an
# independent frame rather than to a slice of `df`, and each subset uses its
# own 'recruit_year' instead of relying on index alignment with another frame.

is_case = df[ndd + '_DATE'].notna()

#People with an NDD date: time from recruitment to NDD diagnosis
has_NDD = df[is_case].copy()
has_NDD['tenure'] = (
    pd.to_datetime(has_NDD[ndd + '_DATE'], errors='coerce')
    - pd.to_datetime(has_NDD['recruit_year'], format='%Y')
).dt.days / 365

#Only keep people who got NDD after they joined the study
#(rows whose date could not be parsed have NaN tenure and are dropped here too)
has_NDD = has_NDD[has_NDD['tenure'] >= 0]

#People without an NDD date, split into alive and dead
NDD_free = df[~is_case]
has_died = NDD_free['DATE_OF_DEATH'].notna()
alive = NDD_free[~has_died].copy()
dead = NDD_free[has_died].copy()

#Still alive: time from recruitment to the end of the study
alive['tenure'] = (
    pd.to_datetime(STUDY_ENDS)
    - pd.to_datetime(alive['recruit_year'], format='%Y')
).dt.days / 365

#Died: time from recruitment to the date of death
dead['tenure'] = (
    pd.to_datetime(dead['DATE_OF_DEATH'])
    - pd.to_datetime(dead['recruit_year'], format='%Y')
).dt.days / 365

#Combine the three groups
df = pd.concat([has_NDD, alive, dead])

In [None]:
# Case indicator column named after the NDD: 1 if a diagnosis date exists, else 0
df[ndd] = np.where(df[ndd + '_DATE'].notna(), 1, 0)

# Recode GENETIC_SEX labels to the strings '2' (Female) and '1' (Male)
for sex_label, sex_code in (('Female', '2'), ('Male', '1')):
    df.loc[df.GENETIC_SEX == sex_label, 'GENETIC_SEX'] = sex_code

In [None]:
# (column prefix, lower, upper): each flag is 1 when the code was recorded more
# than `lower` and at most `upper` years before the end of follow-up (tenure)
lag_windows = (
    ('QC0-1_', 0, 1),
    ('QC1-5_', 1, 5),
    ('QC5-10_', 5, 10),
    ('QC10-15_', 10, 15),
    ('QC5-15_', 5, 15),
)

# Recruitment date (1 January of the recruitment year) is the same for every code
recruit_date = pd.to_datetime(df['recruit_year'], format='%Y')

for code in codes:
    lag_col = 'Lag_' + code

    # Years from recruitment to the date the code was recorded
    df[lag_col] = (pd.to_datetime(df[code]) - recruit_date).dt.days/365

    # Lag 0: the code was recorded at any time before the end of follow-up
    df['QC0_' + code] = np.where(df[lag_col] < df['tenure'], 1, 0)

    # Years between the code and the end of follow-up
    years_before_end = df['tenure'] - df[lag_col]
    for prefix, lower, upper in lag_windows:
        in_window = (years_before_end > lower) & (years_before_end <= upper)
        df[prefix + code] = np.where(in_window, 1, 0)

In [None]:
# Row count and a preview after the lag columns have been added
print(df.shape[0])
df.head()

In [None]:
# Load the relatedness pairs (space-delimited) and keep those above the kinship cutoff
# NOTE(review): 0.0884 looks like the usual second-degree KING cutoff - confirm
kinship_cutoff = 0.0884
related = pd.read_csv('ukb_rel.dat', sep=' ')
related = related.loc[related['Kinship'] > kinship_cutoff]
related

In [None]:
# IDs of everyone in the analysis table
# (use list(cases['ID']) instead to restrict the check to cases)
ndd_ids = list(df['ID'])

# Related pairs in which both members are in the analysis table
both_in_cohort = related['ID1'].isin(ndd_ids) & related['ID2'].isin(ndd_ids)
remove = related.loc[both_in_cohort]
remove

In [None]:
# Drop the first member (ID1) of every related pair from the analysis table
ID_remove = remove['ID1']
is_related = df['ID'].isin(ID_remove)
df_final = df.loc[~is_related]
df_final

In [None]:
# Among the removed rows, count those without (True) and with (False) a diagnosis date
removed_df = df.loc[df['ID'].isin(ID_remove)]
removed_no_date = removed_df[f'{ndd}_DATE'].isna()
removed_no_date.value_counts()

In [None]:
# Among the retained rows, count those without (True) and with (False) a diagnosis date
final_no_date = df_final[f'{ndd}_DATE'].isna()
final_no_date.value_counts()

In [None]:
# Write the final table without the row index. `index` is documented as a
# bool, so pass False explicitly instead of relying on None being falsy.
df_final.to_csv(f'{ndd}_with_tenure_lags_45.csv', header=True, index=False)
#df.to_csv(f'{ndd}_with_tenure_lags.csv', header = True, index = None)

In [None]:
# ndd_list = ['AD', 'PD', 'DEM', 'VAS', 'MS', 'ALS']
# for ndd in ndd_list:
# #    print(f'!dx upload {ndd}_with_tenure_lags.csv --path /data/files_for_cox/{ndd}_with_tenure_lags.csv')
#     print(f'!dx upload {ndd}_with_tenure_lags_45.csv --path /data/files_for_cox/{ndd}_with_tenure_lags_45.csv')

In [None]:
# Upload the per-NDD output files to /data/files_for_cox/.
# Each notebook run writes only the file for the currently selected `ndd`, so
# all six files exist only after the notebook has been run once per NDD.
!dx upload AD_with_tenure_lags_45.csv --path /data/files_for_cox/AD_with_tenure_lags_45.csv
!dx upload PD_with_tenure_lags_45.csv --path /data/files_for_cox/PD_with_tenure_lags_45.csv
!dx upload DEM_with_tenure_lags_45.csv --path /data/files_for_cox/DEM_with_tenure_lags_45.csv
!dx upload VAS_with_tenure_lags_45.csv --path /data/files_for_cox/VAS_with_tenure_lags_45.csv
!dx upload MS_with_tenure_lags_45.csv --path /data/files_for_cox/MS_with_tenure_lags_45.csv
!dx upload ALS_with_tenure_lags_45.csv --path /data/files_for_cox/ALS_with_tenure_lags_45.csv

In [None]:
#Print numbers for table 1 here
#ndd_list = ['AD']
ndd_list = ['AD', 'ALS', 'DEM', 'MS', 'PD', 'VAS']

# Distinct loop names are used so that running this cell does not overwrite the
# notebook-level `ndd`, `cases` and `controls` set by earlier cells.
for disease in ndd_list:
    table = pd.read_csv(f'{disease}_with_tenure_lags_45.csv')
    print(disease)

    # Cases have a diagnosis date; controls do not
    has_date = table[f'{disease}_DATE'].notna()
    case_rows = table[has_date]
    control_rows = table[~has_date]
    print('Cases: ', len(case_rows))
    print('Controls: ', len(control_rows))

    # Sex breakdown, first for cases and then for controls.
    # GENETIC_SEX is compared with the integers 2 (Female) and 1 (Male).
    for heading, group in ((None, case_rows), ("Controls by gender:", control_rows)):
        if heading is not None:
            print(heading)
        n_female = len(group[group['GENETIC_SEX'] == 2])
        n_male = len(group[group['GENETIC_SEX'] == 1])
        print("Female: ", n_female)
        print("Male: ", n_male)
        # Compare this total with the group size above to spot unclassified rows
        print(n_female + n_male)

    print('\n')