# UKB MODELS

In [None]:
# Imports here.
import numpy as np
import pandas as pd
import os
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import fdrcorrection
from lifelines import CoxPHFitter

import warnings
warnings.filterwarnings("ignore")

In [None]:
#%%bash
# The three commented-out downloads below are kept for reference; uncomment to re-fetch them.
#dx download -r 'intermediate_files/'
#dx download 'data/codes/kristin_field_ids_UKB.csv'
#dx download 'data/codes/sleep_IDs_all_MAY_19_2023.csv'
# Recursively download the PRS folder with the `dx` CLI (IPython shell escape).
# The PRS profile is read from a local 'PRS/' directory later in this notebook,
# so this presumably creates that directory in the working directory -- confirm.
!dx download -r 'data/PRS/'

In [None]:
# Analysis configuration: the disorder under study and the study window.
ndd = 'AD'
ndd_list = [ndd]

# Study window boundaries as ISO date strings (YYYY-MM-DD).
STUDY_START = '1999-01-01'
STUDY_ENDS = '2023-09-30'

In [None]:
# Load the whitespace-delimited PRS profile (APOE-excluded AD score).
# The columns used later in this notebook are FID, IID and SCORE.
# The separator is a regex, so it is written as a raw string: '\s' inside a
# normal string literal is an invalid escape sequence and warns on newer Pythons.
prs_pd = pd.read_csv('PRS/ad_prs_NO_APOE.profile', sep=r'\s+')
# Alternative score files (swap in as needed):
#prs_pd = pd.read_csv('PRS/pd_prs.profile', sep=r'\s+')
#prs_pd = pd.read_csv('PRS/ad_prs.profile', sep=r'\s+')

# Order rows by family ID for a stable display.
prs_pd = prs_pd.sort_values(by='FID')
prs_pd

In [None]:
# Read the per-participant input table for the selected disorder.
# NOTE(review): parse_dates=True asks pandas to try parsing the *index* as dates;
# no index_col is given here, so the *_DATE columns presumably stay as strings -- confirm.
cox_input_path = f'files_for_cox/{ndd}_with_tenure_lags_45.csv'
df = pd.read_csv(cox_input_path, parse_dates=True)
df.head()

In [None]:
# Restrict the table to the columns carried forward into the PRS export.
# The order here is the column order of the resulting frame.
keep_cols = [
    'ID',
    'GENETIC_SEX',
    'BIRTH_YEAR',
    'TOWNSEND',
    'AGE_OF_RECRUIT',
    f'{ndd}_DATE',
    'DATE_OF_DEATH',
    'F51_DATE',
    'G47_DATE',
    'recruit_year',
    'tenure',
    ndd,
    'Lag_F51_DATE',
    'QC0_F51_DATE',
    'Lag_G47_DATE',
    'QC0_G47_DATE',
]
df = df[keep_cols]

df

In [None]:
# Attach PRS columns to every participant row: left join keeps all rows of df,
# matching df's ID against the profile's IID (unmatched rows get NaN scores).
df_prs = pd.merge(df, prs_pd, how='left', left_on='ID', right_on='IID')
df_prs

In [None]:
# Standardize the raw PRS against the control distribution.
# Controls are the rows with no recorded diagnosis date for this disorder.
date_col = f'{ndd}_DATE'
z_col = f'Z_{ndd}_PRS'

controls = df_prs.loc[df_prs[date_col].isna()]
control_scores = controls['SCORE']

# Reference mean of the control scores
mean_controls = control_scores.mean()
print("Control mean:", mean_controls)

# Reference standard deviation of the control scores
std_controls = control_scores.std()
print("Control std:", std_controls)

# Z-score for everyone (cases and controls), relative to the controls
df_prs[z_col] = df_prs['SCORE'].sub(mean_controls).div(std_controls)

# Quick range check on the standardized score
z_scores = df_prs[z_col]
print(ndd)
print(z_scores.min())
print(z_scores.max())

In [None]:
# Tally rows with (True) and without (False) a missing standardized PRS.
missing_z = df_prs[f'Z_{ndd}_PRS'].isna()
missing_z.value_counts()

In [None]:
# Keep only the participants that have a standardized PRS.
# .copy() makes the filtered result an independent frame, so the new columns
# assigned to df_prs later in this notebook do not raise SettingWithCopyWarning
# (which the blanket warnings filter at the top would otherwise hide).
df_prs = df_prs[df_prs[f'Z_{ndd}_PRS'].notna()].copy()
df_prs

In [None]:
# Display the column names of the merged frame (notebook output only).
df_prs.columns

In [None]:
# Add text labels for the export.
# A value of exactly 0 maps to the first label; anything else (including a
# missing value) maps to the second.
df_prs['status'] = np.where(df_prs[ndd].eq(0), 'control', 'case')

# Label column -> source flag column
disorder_flags = {
    'F51 Disorder': 'QC0_F51_DATE',
    'G47 Disorder': 'QC0_G47_DATE',
}
for label_col, flag_col in disorder_flags.items():
    df_prs[label_col] = np.where(df_prs[flag_col].eq(0), 'no', 'yes')

# Show how many rows fall in each F51 label
df_prs['F51 Disorder'].value_counts()

In [None]:
# Display the frame as it stands before export (notebook output only).
df_prs

In [None]:
# Write the labelled PRS table to a CSV in the working directory, with a header row.
# NOTE(review): index=None is falsy, so the row index is presumably not written -- confirm.
graphs_csv = f'{ndd}_with_PRS_NO_APOE_for_graphs.csv'
df_prs.to_csv(graphs_csv, header=True, index=None)

In [None]:
# Upload the exported CSV to /data/PRS/ with the `dx` CLI (IPython shell escape).
# {ndd} is filled in from the notebook variable before the command runs.
!dx upload {ndd}_with_PRS_NO_APOE_for_graphs.csv --path /data/PRS/{ndd}_with_PRS_NO_APOE_for_graphs.csv