# UKB MODELS

In [None]:
# Imports here.
import numpy as np
import pandas as pd
import os

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning,
# DtypeWarning) that later cells in this notebook could raise — consider
# narrowing the filter to specific categories.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Fetch the input folders into the working directory; later cells read from
# the local `PRS/` and `files_for_cox/` paths.
# NOTE(review): `dx` is presumably the DNAnexus CLI and `-r` a recursive
# download — confirm against the platform documentation.
# The cell magic below is left disabled; each `!` line already runs in a shell.
#%%bash
!dx download -r 'data/PRS/'
!dx download -r 'data/files_for_cox/'

In [None]:
# Analysis configuration.
# `ndd` is the disease label: it names the case/control column of the covariate
# table and is interpolated into input/output file names further down.
# NOTE(review): 'AD' presumably stands for Alzheimer's disease — confirm.
ndd = 'AD'
ndd_list = [ndd]

# Study window boundaries (ISO date strings); not referenced by the cells
# visible in this part of the notebook.
STUDY_START, STUDY_ENDS = '1999-01-01', '2023-09-30'

In [None]:
# Load the AD PRS profile (the "NO_APOE" variant, per its file name).
# The file is whitespace-delimited. The separator is written as a raw string so
# that `\s` reaches pandas as a regex token instead of being an invalid Python
# string escape (which newer Python versions warn about).
prs = pd.read_csv('PRS/ad_prs_NO_APOE.profile', sep=r'\s+')
prs = prs.sort_values(by = 'FID')

# Eliminate people with negative IDs
prs = prs[prs['FID'] > 0]
prs

In [None]:
# Read the covariate table for the current disease label and keep only the
# columns this notebook works with: participant ID, sex, Townsend index, age at
# recruitment, the case/control column named after `ndd`, and the two
# diagnosis-date columns.
covariate_cols = ['ID', 'GENETIC_SEX', 'TOWNSEND', 'AGE_OF_RECRUIT', ndd,
                  'QC0_F51_DATE', 'QC0_G47_DATE']
covariates_path = f'files_for_cox/{ndd}_with_tenure_lags_45.csv'
df = pd.read_csv(covariates_path, parse_dates = True)[covariate_cols]
df

In [None]:
# Attach the PRS columns to the covariates with a left join (ID <-> IID), so
# every covariate row is kept; rows without a matching PRS entry end up with
# missing PRS values. The recruitment-age column is shortened to 'AGE'.
df_prs = (
    pd.merge(df, prs, how = 'left', left_on = 'ID', right_on = 'IID')
      .rename(columns = {'AGE_OF_RECRUIT': 'AGE'})
)
df_prs

In [None]:
# Fetch the file that the next cell loads as `pcs` into the working directory.
! dx download 'pcs_for_all_participant.csv'

In [None]:
# Load the principal-component table and give the first five component
# columns (p22009_a1 .. p22009_a5) the short names PC1 .. PC5.
pc_names = {f'p22009_a{i}': f'PC{i}' for i in range(1, 6)}
pcs = pd.read_csv('pcs_for_all_participant.csv').rename(columns = pc_names)
pcs

In [None]:
# Left-join the principal components onto the covariate/PRS frame
# (ID <-> eid); rows of df_prs are kept even when no PC row matches.
df_prs = pd.merge(df_prs, pcs, how = 'left', left_on = 'ID', right_on = 'eid')
df_prs

In [None]:
# Keep only the columns used from here on: identifiers/covariates, the
# case/control column named after `ndd`, the diagnosis dates, the raw PRS
# ('SCORE') and the first five principal components.
# `.copy()` makes df_prs an independent frame: the following cells add new
# columns to it, and assigning into a column subset of the merged frame is the
# situation pandas flags with SettingWithCopyWarning.
df_prs = df_prs[['ID', 'GENETIC_SEX', 'TOWNSEND', 'AGE', ndd, 'QC0_F51_DATE',
       'QC0_G47_DATE', 'SCORE', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']].copy()
df_prs

# Standardize

In [None]:
# Reference distribution for the PRS: rows whose `ndd` column equals 0.
# `mean` and `sd` are the mean and population standard deviation (ddof=0) of
# SCORE within that subset; the next cell uses them to standardize everyone.
controls = df_prs[df_prs[ndd].eq(0)]
mean = controls['SCORE'].mean()
sd = controls['SCORE'].std(ddof = 0)

In [None]:
# Standardize the raw PRS against the control-group mean and sd computed above.
df_prs['Z_score'] = df_prs['SCORE'].sub(mean).div(sd)

In [None]:
# Z-score age and the five principal components using the mean / standard
# deviation of each column over all rows currently in df_prs.
# NOTE(review): these statistics are taken before rows without a Z_score are
# dropped further down — confirm that is intended.
for raw_col, z_col in [('AGE', 'Z_age'),
                       ('PC1', 'Z_PC1'),
                       ('PC2', 'Z_PC2'),
                       ('PC3', 'Z_PC3'),
                       ('PC4', 'Z_PC4'),
                       ('PC5', 'Z_PC5')]:
    col_values = df_prs[raw_col]
    df_prs[z_col] = (col_values - np.mean(col_values)) / np.std(col_values)

In [None]:
# Display the frame to inspect the newly added Z_* columns.
df_prs

In [None]:
# Count how many rows have a missing (True) versus present (False) Z_score.
pd.isna(df_prs['Z_score']).value_counts()

In [None]:
# Eliminate samples without a Z_score.
# `.copy()` detaches the filtered rows from the original frame so the column
# assignments in the next cell operate on an independent DataFrame rather than
# on a boolean-mask slice (the SettingWithCopyWarning case).
df_prs = df_prs[df_prs['Z_score'].notna()].copy()
df_prs

In [None]:
# Human-readable label columns used for plotting.
# 'status' is 'control' where the `ndd` column equals 0 and 'case' otherwise.
df_prs['status'] = np.where(df_prs[ndd].eq(0), 'control', 'case')

# A disorder flag is 'no' only where the corresponding date column equals 0;
# any other value (including a missing one) yields 'yes'.
# NOTE(review): this assumes "no diagnosis" is encoded as numeric 0 in the
# date columns — confirm against the input file.
disorder_flags = {'F51 Disorder': 'QC0_F51_DATE', 'G47 Disorder': 'QC0_G47_DATE'}
for flag_col, date_col in disorder_flags.items():
    df_prs[flag_col] = np.where(df_prs[date_col].eq(0), 'no', 'yes')

In [None]:
# Write the standardized table to the working directory with a header row.
# NOTE(review): `index = None` presumably suppresses the row index the same way
# `index = False` does — confirm.
# A previous variant of this cell wrote '{ndd}_with_Z_score_for_graphs_april_30.csv'
# (without the NO_APOE tag) instead.
z_score_csv_name = f'{ndd}_NO_APOE_with_Z_score_for_graphs_april_30.csv'
df_prs.to_csv(z_score_csv_name, index = None, header = True)

In [None]:
# Upload the CSV written by the previous cell. IPython substitutes `{ndd}` with
# the Python variable's value before handing the line to the shell.
# NOTE(review): `--path` presumably sets the destination path in the project — confirm.
!dx upload {ndd}_NO_APOE_with_Z_score_for_graphs_april_30.csv --path /data/PRS/{ndd}_NO_APOE_with_Z_score_for_graphs_april_30.csv