# UKB MODELS

In [1]:
# Imports here.
import numpy as np
import pandas as pd
import os

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas warnings (e.g. SettingWithCopyWarning) that later cells
# could otherwise raise when columns are added to filtered frames.
warnings.filterwarnings("ignore")

In [2]:
#%%bash
# Fetch the two project folders that later cells read from:
# PRS/ (the .profile score file) and files_for_cox/ (the covariate CSV).
# NOTE(review): this cell and the following one both carry execution count 2,
# so the recorded run was not a clean top-to-bottom execution.
!dx download -r 'data/PRS/'
!dx download -r 'data/files_for_cox/'



In [2]:
# Notebook-wide settings: the disease label used in column and file names,
# and the study start/end date strings.
STUDY_START = '1999-01-01'
STUDY_ENDS = '2023-09-30'
ndd = 'AD'
ndd_list = [ndd]

In [3]:
# Load the AD polygenic-risk-score profile (the APOE-excluded file, per its name).
# The separator is a raw-string regex: in a plain string literal `\s` is an
# invalid escape sequence, which newer Python versions warn about.
prs = pd.read_csv('PRS/ad_prs_NO_APOE.profile', sep=r'\s+')
prs = prs.sort_values(by='FID')

# Keep only rows with a strictly positive FID (drops negative and zero IDs).
prs = prs[prs['FID'] > 0]
prs

Unnamed: 0,FID,IID,PHENO,CNT,CNT2,SCORE
117,1000012,1000012,-9,44,10,0.002386
118,1000029,1000029,-9,40,11,-0.006834
119,1000031,1000031,-9,42,13,-0.002573
120,1000047,1000047,-9,42,9,0.001969
121,1000050,1000050,-9,44,10,-0.002024
...,...,...,...,...,...,...
487274,6026129,6026129,-9,44,12,-0.000491
487275,6026131,6026131,-9,44,9,-0.003746
487276,6026147,6026147,-9,44,5,-0.000483
487277,6026150,6026150,-9,44,15,-0.007895


In [4]:
# Read the covariate table for the selected disease and keep only the
# participant ID, the covariates, the disease flag column and the
# QC0_F51_DATE / QC0_G47_DATE columns.
covariate_columns = [
    'ID', 'GENETIC_SEX', 'TOWNSEND', 'AGE_OF_RECRUIT',
    ndd, 'QC0_F51_DATE', 'QC0_G47_DATE',
]
covariates_path = f'files_for_cox/{ndd}_with_tenure_lags_45.csv'
df = pd.read_csv(covariates_path, parse_dates=True)[covariate_columns]
df

Unnamed: 0,ID,GENETIC_SEX,TOWNSEND,AGE_OF_RECRUIT,AD,QC0_F51_DATE,QC0_G47_DATE
0,3559505,2,-2.80,67,1,0,0
1,5426874,1,-4.78,68,1,0,0
2,1940961,1,-0.47,63,1,0,0
3,2893229,1,-2.66,59,1,0,0
4,5784514,2,-5.08,54,1,0,0
...,...,...,...,...,...,...,...
242048,5988964,2,-2.90,67,0,0,0
242049,3181859,1,-0.68,64,0,0,0
242050,5906295,1,5.84,61,0,0,0
242051,5866742,1,-1.32,62,0,0,0


In [5]:
# Left-join the PRS rows onto the covariates (ID matched against IID), so
# participants without a score are kept with missing PRS fields, then expose
# the recruitment age under the shorter column name AGE.
df_prs = pd.merge(df, prs, how='left', left_on='ID', right_on='IID').rename(
    columns={'AGE_OF_RECRUIT': 'AGE'}
)
df_prs

Unnamed: 0,ID,GENETIC_SEX,TOWNSEND,AGE,AD,QC0_F51_DATE,QC0_G47_DATE,FID,IID,PHENO,CNT,CNT2,SCORE
0,3559505,2,-2.80,67,1,0,0,3559505.0,3559505.0,-9.0,44.0,13.0,-0.001864
1,5426874,1,-4.78,68,1,0,0,5426874.0,5426874.0,-9.0,44.0,10.0,0.002704
2,1940961,1,-0.47,63,1,0,0,1940961.0,1940961.0,-9.0,44.0,17.0,-0.002273
3,2893229,1,-2.66,59,1,0,0,2893229.0,2893229.0,-9.0,44.0,13.0,0.000253
4,5784514,2,-5.08,54,1,0,0,5784514.0,5784514.0,-9.0,44.0,11.0,0.000933
...,...,...,...,...,...,...,...,...,...,...,...,...,...
242048,5988964,2,-2.90,67,0,0,0,5988964.0,5988964.0,-9.0,42.0,10.0,-0.000065
242049,3181859,1,-0.68,64,0,0,0,3181859.0,3181859.0,-9.0,44.0,13.0,0.001537
242050,5906295,1,5.84,61,0,0,0,5906295.0,5906295.0,-9.0,44.0,12.0,-0.001356
242051,5866742,1,-1.32,62,0,0,0,5866742.0,5866742.0,-9.0,40.0,10.0,-0.000623


In [7]:
# Fetch the PCs file that the following cell loads with pd.read_csv.
# NOTE(review): the recorded execution count of this cell (7) is higher than
# that of the cell reading the file (6); on a fresh kernel this download has
# to run first.
! dx download 'pcs_for_all_participant.csv'



In [6]:
# Read the per-participant principal components and give the first five
# p22009_a<N> columns the short names PC1..PC5 (a6-a10 keep their names).
pc_renames = {f'p22009_a{i}': f'PC{i}' for i in range(1, 6)}
pcs = pd.read_csv('pcs_for_all_participant.csv').rename(columns=pc_renames)
pcs

Unnamed: 0,eid,PC1,PC2,PC3,PC4,PC5,p22009_a6,p22009_a7,p22009_a8,p22009_a9,p22009_a10
0,1000047,-14.2128,6.22769,-1.539920,-0.328147,1.750350,0.688990,3.970100,-0.772619,-17.258900,4.757420
1,1000050,-10.8216,4.72615,1.505250,-3.343910,-6.009130,-3.404030,-0.805144,-0.570451,-1.410930,-1.767620
2,1000068,-12.1266,3.84736,-2.658120,3.699800,1.910890,-1.142440,1.005860,0.906277,1.334080,1.045590
3,1000122,-10.6169,1.39558,-2.085200,2.256820,-7.345710,-0.133245,0.028111,-2.565050,3.603530,1.528050
4,1000214,-15.0801,2.77297,-2.997140,2.178990,-2.893710,2.088460,0.942838,-2.566070,2.621270,2.122760
...,...,...,...,...,...,...,...,...,...,...,...
488122,6025982,-13.3840,5.40378,-1.768490,-0.853984,0.357245,2.013810,-1.360540,2.903770,-9.359030,-0.173365
488123,6026001,-14.2153,5.07890,-0.807003,-0.015238,-8.705450,0.315828,1.736990,-4.084850,3.644190,1.134860
488124,6026060,-11.7979,1.83159,0.496999,-2.285050,6.453050,1.491840,0.273326,-0.114140,0.159042,-2.615740
488125,6026087,-13.5424,1.33846,-1.547040,-0.045067,-3.827140,-1.323810,1.434850,1.296910,-2.224900,-1.588200


In [7]:
# Attach the principal components to each participant
# (left join, ID matched against eid).
df_prs = pd.merge(df_prs, pcs, how='left', left_on='ID', right_on='eid')
df_prs

Unnamed: 0,ID,GENETIC_SEX,TOWNSEND,AGE,AD,QC0_F51_DATE,QC0_G47_DATE,FID,IID,PHENO,...,PC1,PC2,PC3,PC4,PC5,p22009_a6,p22009_a7,p22009_a8,p22009_a9,p22009_a10
0,3559505,2,-2.80,67,1,0,0,3559505.0,3559505.0,-9.0,...,-12.3277,2.811810,-5.192830,6.021220,0.711214,4.562770,-3.892350,4.762520,-11.052100,1.936540
1,5426874,1,-4.78,68,1,0,0,5426874.0,5426874.0,-9.0,...,-15.3076,3.797150,-3.031950,3.952750,6.307610,-0.707205,-1.669040,0.497720,0.671753,-2.433960
2,1940961,1,-0.47,63,1,0,0,1940961.0,1940961.0,-9.0,...,-12.6428,2.248000,-4.424400,-2.467130,-2.220740,1.792020,-1.900900,0.252297,-3.652680,0.514845
3,2893229,1,-2.66,59,1,0,0,2893229.0,2893229.0,-9.0,...,-14.0739,0.909499,-2.192330,2.253470,-8.586080,1.144570,-5.381140,-0.567268,-2.708350,2.125440
4,5784514,2,-5.08,54,1,0,0,5784514.0,5784514.0,-9.0,...,-12.6081,2.801040,0.067058,2.482140,1.149460,1.236620,-0.209564,-1.225420,0.543765,0.510201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242048,5988964,2,-2.90,67,0,0,0,5988964.0,5988964.0,-9.0,...,-13.4550,4.312710,-0.708079,-0.585742,-8.072440,0.918957,-3.382970,-1.615420,-2.486830,-1.028870
242049,3181859,1,-0.68,64,0,0,0,3181859.0,3181859.0,-9.0,...,-13.2968,5.550250,-2.876440,2.725760,12.277300,1.044160,0.009065,3.409910,3.489030,2.141530
242050,5906295,1,5.84,61,0,0,0,5906295.0,5906295.0,-9.0,...,-12.6670,4.923010,-0.249682,5.808390,20.456700,0.714297,-0.045932,-0.219909,3.025360,0.409709
242051,5866742,1,-1.32,62,0,0,0,5866742.0,5866742.0,-9.0,...,-11.7895,4.665500,0.616765,-3.179100,-8.885110,-1.126600,1.424090,3.543140,-0.932735,-1.524050


In [8]:
# Keep only the columns needed downstream: ID, covariates, disease flag,
# the two QC0_*_DATE columns, the raw PRS and the first five PCs.
# .copy() makes the result an independent frame, so the cells below can add
# Z_* columns to it without pandas treating it as a slice of the merged frame
# (which would raise SettingWithCopyWarning if warnings were not silenced).
model_columns = [
    'ID', 'GENETIC_SEX', 'TOWNSEND', 'AGE', ndd, 'QC0_F51_DATE',
    'QC0_G47_DATE', 'SCORE', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5',
]
df_prs = df_prs[model_columns].copy()
df_prs

Unnamed: 0,ID,GENETIC_SEX,TOWNSEND,AGE,AD,QC0_F51_DATE,QC0_G47_DATE,SCORE,PC1,PC2,PC3,PC4,PC5
0,3559505,2,-2.80,67,1,0,0,-0.001864,-12.3277,2.811810,-5.192830,6.021220,0.711214
1,5426874,1,-4.78,68,1,0,0,0.002704,-15.3076,3.797150,-3.031950,3.952750,6.307610
2,1940961,1,-0.47,63,1,0,0,-0.002273,-12.6428,2.248000,-4.424400,-2.467130,-2.220740
3,2893229,1,-2.66,59,1,0,0,0.000253,-14.0739,0.909499,-2.192330,2.253470,-8.586080
4,5784514,2,-5.08,54,1,0,0,0.000933,-12.6081,2.801040,0.067058,2.482140,1.149460
...,...,...,...,...,...,...,...,...,...,...,...,...,...
242048,5988964,2,-2.90,67,0,0,0,-0.000065,-13.4550,4.312710,-0.708079,-0.585742,-8.072440
242049,3181859,1,-0.68,64,0,0,0,0.001537,-13.2968,5.550250,-2.876440,2.725760,12.277300
242050,5906295,1,5.84,61,0,0,0,-0.001356,-12.6670,4.923010,-0.249682,5.808390,20.456700
242051,5866742,1,-1.32,62,0,0,0,-0.000623,-11.7895,4.665500,0.616765,-3.179100,-8.885110


# Standardize

In [9]:
# Reference distribution for the PRS: mean and population standard deviation
# (ddof=0, matching np.std's default) of SCORE among controls, i.e. rows
# where the disease flag column equals 0.
controls = df_prs[df_prs[ndd].eq(0)]
control_scores = controls['SCORE']
mean = control_scores.mean()
sd = control_scores.std(ddof=0)

In [10]:
# Z-score every participant's PRS using the control mean and SD held in
# `mean` and `sd`.
df_prs['Z_score'] = df_prs['SCORE'].sub(mean).div(sd)

In [11]:
# Standardize age and the first five PCs against the whole frame
# (mean 0, SD 1 using np.std's default), one Z_* column per source column.
z_targets = [
    ('AGE', 'Z_age'),
    ('PC1', 'Z_PC1'),
    ('PC2', 'Z_PC2'),
    ('PC3', 'Z_PC3'),
    ('PC4', 'Z_PC4'),
    ('PC5', 'Z_PC5'),
]
for source_col, z_col in z_targets:
    values = df_prs[source_col]
    df_prs[z_col] = (values - np.mean(values)) / np.std(values)

In [12]:
# Display the frame with the Z_score, Z_age and Z_PC1..Z_PC5 columns added.
df_prs

Unnamed: 0,ID,GENETIC_SEX,TOWNSEND,AGE,AD,QC0_F51_DATE,QC0_G47_DATE,SCORE,PC1,PC2,PC3,PC4,PC5,Z_score,Z_age,Z_PC1,Z_PC2,Z_PC3,Z_PC4,Z_PC5
0,3559505,2,-2.80,67,1,0,0,-0.001864,-12.3277,2.811810,-5.192830,6.021220,0.711214,-0.057379,1.302759,0.017380,-0.645318,-2.274266,1.586153,0.210944
1,5426874,1,-4.78,68,1,0,0,0.002704,-15.3076,3.797150,-3.031950,3.952750,6.307610,1.425655,1.447955,-1.831776,0.012349,-0.909258,0.886333,1.045509
2,1940961,1,-0.47,63,1,0,0,-0.002273,-12.6428,2.248000,-4.424400,-2.467130,-2.220740,-0.190179,0.721973,-0.178153,-1.021635,-1.788856,-1.285690,-0.226286
3,2893229,1,-2.66,59,1,0,0,0.000253,-14.0739,0.909499,-2.192330,2.253470,-8.586080,0.629943,0.141187,-1.066212,-1.915021,-0.378878,0.311419,-1.175521
4,5784514,2,-5.08,54,1,0,0,0.000933,-12.6081,2.801040,0.067058,2.482140,1.149460,0.850472,-0.584796,-0.156621,-0.652507,1.048356,0.388785,0.276297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242048,5988964,2,-2.90,67,0,0,0,-0.000065,-13.4550,4.312710,-0.708079,-0.585742,-8.072440,0.526676,1.302759,-0.682158,0.356461,0.558709,-0.649165,-1.098924
242049,3181859,1,-0.68,64,0,0,0,0.001537,-13.2968,5.550250,-2.876440,2.725760,12.277300,1.046598,0.867169,-0.583988,1.182460,-0.811024,0.471208,1.935743
242050,5906295,1,5.84,61,0,0,0,-0.001356,-12.6670,4.923010,-0.249682,5.808390,20.456700,0.107250,0.431580,-0.193171,0.763807,0.848275,1.514147,3.155501
242051,5866742,1,-1.32,62,0,0,0,-0.000623,-11.7895,4.665500,0.616765,-3.179100,-8.885110,0.345470,0.576776,0.351356,0.591932,1.395601,-1.526569,-1.220114


In [13]:
# Count how many participants have / lack a PRS Z score
# (True = missing, False = present).
z_is_missing = df_prs['Z_score'].isna()
z_is_missing.value_counts()

False    241579
True        474
Name: Z_score, dtype: int64

In [14]:
# Drop participants without a PRS Z score.
# .copy() detaches the filtered rows from the original frame, because the
# next cell adds columns to the result; without it pandas treats the result
# as a slice and would raise SettingWithCopyWarning (currently silenced).
df_prs = df_prs[df_prs['Z_score'].notna()].copy()
df_prs

Unnamed: 0,ID,GENETIC_SEX,TOWNSEND,AGE,AD,QC0_F51_DATE,QC0_G47_DATE,SCORE,PC1,PC2,PC3,PC4,PC5,Z_score,Z_age,Z_PC1,Z_PC2,Z_PC3,Z_PC4,Z_PC5
0,3559505,2,-2.80,67,1,0,0,-0.001782,-12.3277,2.811810,-5.192830,6.021220,0.711214,-0.538542,1.302759,0.017380,-0.645318,-2.274266,1.586153,0.210944
1,5426874,1,-4.78,68,1,0,0,0.013916,-15.3076,3.797150,-3.031950,3.952750,6.307610,1.951025,1.447955,-1.831776,0.012349,-0.909258,0.886333,1.045509
2,1940961,1,-0.47,63,1,0,0,-0.002174,-12.6428,2.248000,-4.424400,-2.467130,-2.220740,-0.600591,0.721973,-0.178153,-1.021635,-1.788856,-1.285690,-0.226286
3,2893229,1,-2.66,59,1,0,0,0.000242,-14.0739,0.909499,-2.192330,2.253470,-8.586080,-0.217409,0.141187,-1.066212,-1.915021,-0.378878,0.311419,-1.175521
4,5784514,2,-5.08,54,1,0,0,0.012221,-12.6081,2.801040,0.067058,2.482140,1.149460,1.682297,-0.584796,-0.156621,-0.652507,1.048356,0.388785,0.276297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242048,5988964,2,-2.90,67,0,0,0,-0.000062,-13.4550,4.312710,-0.708079,-0.585742,-8.072440,-0.265658,1.302759,-0.682158,0.356461,0.558709,-0.649165,-1.098924
242049,3181859,1,-0.68,64,0,0,0,0.001470,-13.2968,5.550250,-2.876440,2.725760,12.277300,-0.022737,0.867169,-0.583988,1.182460,-0.811024,0.471208,1.935743
242050,5906295,1,5.84,61,0,0,0,0.010032,-12.6670,4.923010,-0.249682,5.808390,20.456700,1.335036,0.431580,-0.193171,0.763807,0.848275,1.514147,3.155501
242051,5866742,1,-1.32,62,0,0,0,-0.000596,-11.7895,4.665500,0.616765,-3.179100,-8.885110,-0.350322,0.576776,0.351356,0.591932,1.395601,-1.526569,-1.220114


In [15]:
# Human-readable label columns for graphs: case/control status from the
# disease flag, and yes/no flags that are 'no' exactly when the matching
# QC0_*_DATE column equals 0.
df_prs['status'] = np.where(df_prs[ndd].eq(0), 'control', 'case')
disorder_labels = (
    ('F51 Disorder', 'QC0_F51_DATE'),
    ('G47 Disorder', 'QC0_G47_DATE'),
)
for label_col, date_col in disorder_labels:
    df_prs[label_col] = np.where(df_prs[date_col].eq(0), 'no', 'yes')

In [16]:
# Write the table to CSV with a header row and no index column.
# An earlier version of this cell wrote
# '{ndd}_with_Z_score_for_graphs_april_30.csv' (without the NO_APOE tag).
output_csv = f'{ndd}_NO_APOE_with_Z_score_for_graphs_april_30.csv'
df_prs.to_csv(output_csv, index=False)

In [17]:
# Upload the exported CSV to /data/PRS/ in the project; IPython substitutes
# {ndd} with the Python variable before the shell command runs.
# NOTE(review): the recorded output below reports the name
# AD_with_Z_score_for_graphs_april_30.csv (no NO_APOE tag), which does not
# match this command; the output appears to be from an earlier run.
!dx upload {ndd}_NO_APOE_with_Z_score_for_graphs_april_30.csv --path /data/PRS/{ndd}_NO_APOE_with_Z_score_for_graphs_april_30.csv

ID                          file-GjvJJz8Jq9vb073Gzv9bbzqZ
Class                       file
Project                     project-GZBqBx8Jq9vpQ6729F24BjYX
Folder                      /data/PRS
Name                        AD_with_Z_score_for_graphs_april_30.csv
State                       [33mclosing[0m
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Mon May  6 13:11:25 2024
Created by                  klevine22
 via the job                job-GjvGkgQJq9vpjxfg7PQ7KQbv
Last modified               Mon May  6 13:11:27 2024
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"
