# Data Prep

In [None]:
# !pip install statsmodels
# !pip install -U seaborn
# !pip install lifelines
# !pip install scikit-learn

In [1]:
# Imports here.
import numpy as np
import pandas as pd
import os
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import scipy
from scipy import stats
from statsmodels.stats.weightstats import ztest as ztest

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import fdrcorrection

from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

In [None]:
# ! dx download PD_interaction_analysis_AUG_21_2023.csv
# ! dx download AD_NO_APOE_interaction_analysis_AUG_21_2023.csv
# ! dx download AD_with_APOE_interaction_analysis_AUG_21_2023.csv
# ! dx download ALS_with_tenure.csv
# # ! dx download DEMENTIA_with_tenure.csv
# # ! dx download VASCULAR_with_tenure.csv

# PD

In [2]:
# Load the PD interaction-analysis table from a CSV in the working directory and display it.
# NOTE(review): the commented-out download cell earlier in this notebook fetches
# PD_interaction_analysis_AUG_21_2023.csv, not this april_30 file — confirm which input is intended.
df_pd = pd.read_csv('PD_interaction_analysis_april_30.csv')
df_pd

Unnamed: 0,ID,SCORE,PC1,PC2,PC3,PC4,PC5,Z_score,Z_age,Z_PC1,...,AGE_OF_RECRUIT,BIRTH_YEAR,tenure,PD_DATE,PD,F51,G47,AAO,interactor_PD_F51,interactor_PD_G47
0,5986755,-0.005752,-13.3959,5.89336,-2.398670,-0.052244,0.017704,0.818861,0.292830,-0.646379,...,60,1945,0.498630,2005-07-02,1,0,0,60.539726,0.818861,0.818861
1,5091212,-0.012065,-10.3817,3.22082,-0.419226,3.640730,-6.380780,-1.125486,1.310861,1.223686,...,67,1939,0.391781,2006-05-24,1,0,0,67.438356,-1.125486,-1.125486
2,5164402,-0.006186,-11.7090,2.20405,-2.077410,3.828630,-5.312990,0.685058,0.729129,0.400205,...,63,1943,0.498630,2006-07-02,1,0,0,63.542466,0.685058,0.685058
3,1942953,-0.011824,-13.5989,2.63708,-5.808340,9.143290,11.225900,-1.051476,-0.288902,-0.772324,...,56,1951,0.191781,2007-03-12,1,0,0,56.230137,-1.051476,-1.051476
4,2548358,-0.004134,-12.9616,4.96654,-2.865700,3.600200,2.566680,1.317089,1.456294,-0.376931,...,68,1939,0.315068,2007-04-26,1,0,0,68.361644,1.317089,1.317089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240905,5988964,-0.008323,-13.4550,4.31271,-0.708079,-0.585742,-8.072440,0.026795,1.310861,-0.683045,...,67,1941,12.419178,,0,0,0,,0.026795,0.026795
240906,3181859,-0.008959,-13.2968,5.55025,-2.876440,2.725760,12.277300,-0.169099,0.874562,-0.584895,...,64,1943,7.550685,,0,0,0,,-0.169099,-0.169099
240907,5906295,-0.003869,-12.6670,4.92301,-0.249682,5.808390,20.456700,1.398833,0.438263,-0.194156,...,61,1945,2.342466,,0,0,0,,1.398833,1.398833
240908,5866742,-0.005556,-11.7895,4.66550,0.616765,-3.179100,-8.885110,0.878974,0.583696,0.350261,...,62,1946,13.816438,,0,0,0,,0.878974,0.878974


## PD and F51 Nonorganic sleep disorders (not due to a substance or known physiological condition)

In [3]:
# Binomial GLM of the PD status column on the F51 indicator, the precomputed
# interactor_PD_F51 column and Z_score, with Z_age, GENETIC_SEX, TOWNSEND and
# Z_PC1..Z_PC5 as additional covariates.
ndd, variable = 'PD', 'F51'
model = f'{variable} and {ndd} PRS interaction'
data = df_pd

this_formula = ndd + f"~ {variable} + interactor_{ndd}_{variable} + Z_score + Z_age + GENETIC_SEX + TOWNSEND + Z_PC1 +Z_PC2 +Z_PC3 +Z_PC4 +Z_PC5"
logit_glm = sm.formula.glm(formula=this_formula, family=sm.families.Binomial(), data=data)
fitted = logit_glm.fit()

# Only these three coefficients are reported in the summary table.
list_terms = [variable, f'interactor_{ndd}_{variable}', 'Z_score']
# One (lower, upper) row per coefficient; computed once and reused for every term.
ci_bounds = fitted.conf_int()

records = []
for term in list_terms:
    beta = fitted.params.loc[term]
    se = fitted.bse.loc[term]
    # The interval is left on the coefficient scale (it is not exponentiated like the OR column).
    ci_low, ci_high = ci_bounds.loc[term]
    records.append({
        'Model': model,
        'Parameter': term,
        'OR': np.exp(beta),
        'Beta': beta,
        'SE': se,
        '95% CI low': ci_low,
        '95% CI high': ci_high,
        'z': beta / se,
        'P-value': fitted.pvalues.loc[term],
    })
output1 = pd.DataFrame(records)
output1

Unnamed: 0,Model,Parameter,OR,Beta,SE,95% CI low,95% CI high,z,P-value
0,F51 and PD PRS interaction,F51,2.378919,0.866646,0.556163,-0.223413,1.956706,1.558259,0.119172
1,F51 and PD PRS interaction,interactor_PD_F51,0.232162,-1.460319,0.421048,-2.285557,-0.63508,-3.468296,0.000524
2,F51 and PD PRS interaction,Z_score,5.905864,1.775946,0.422267,0.948317,2.603575,4.205737,2.6e-05


## PD and G47 sleep disorders (includes sleep apnea and sleep related movement disorders)

In [4]:
# Binomial GLM of the PD status column on the G47 indicator, the precomputed
# interactor_PD_G47 column and Z_score, with Z_age, GENETIC_SEX, TOWNSEND and
# Z_PC1..Z_PC5 as additional covariates.
ndd, variable = 'PD', 'G47'
model = f'{variable} and {ndd} PRS interaction'
data = df_pd

this_formula = ndd + f"~ {variable} + interactor_{ndd}_{variable} + Z_score + Z_age + GENETIC_SEX + TOWNSEND + Z_PC1 +Z_PC2 +Z_PC3 +Z_PC4 +Z_PC5"
logit_glm = sm.formula.glm(formula=this_formula, family=sm.families.Binomial(), data=data)
fitted = logit_glm.fit()

# Only these three coefficients are reported in the summary table.
list_terms = [variable, f'interactor_{ndd}_{variable}', 'Z_score']
# One (lower, upper) row per coefficient; computed once and reused for every term.
ci_bounds = fitted.conf_int()

records = []
for term in list_terms:
    beta = fitted.params.loc[term]
    se = fitted.bse.loc[term]
    # The interval is left on the coefficient scale (it is not exponentiated like the OR column).
    ci_low, ci_high = ci_bounds.loc[term]
    records.append({
        'Model': model,
        'Parameter': term,
        'OR': np.exp(beta),
        'Beta': beta,
        'SE': se,
        '95% CI low': ci_low,
        '95% CI high': ci_high,
        'z': beta / se,
        'P-value': fitted.pvalues.loc[term],
    })
output2 = pd.DataFrame(records)
output2

Unnamed: 0,Model,Parameter,OR,Beta,SE,95% CI low,95% CI high,z,P-value
0,G47 and PD PRS interaction,G47,1.496206,0.402932,0.091465,0.223665,0.5822,4.405339,1.056186e-05
1,G47 and PD PRS interaction,interactor_PD_G47,0.72922,-0.31578,0.091725,-0.495559,-0.136002,-3.44268,0.0005759813
2,G47 and PD PRS interaction,Z_score,1.898869,0.641258,0.097379,0.4504,0.832117,6.585214,4.542302e-11


# AD without APOE

In [5]:
# Load the AD table used in the "AD without APOE" section and display it.
# The path is a plain string literal; no interpolation is needed.
df_ad_no_apoe = pd.read_csv('AD_NO_APOE_interaction_analysis_april_30.csv')
df_ad_no_apoe

Unnamed: 0,ID,SCORE,PC1,PC2,PC3,PC4,PC5,Z_score,Z_age,Z_PC1,...,AGE_OF_RECRUIT,BIRTH_YEAR,tenure,AD_DATE,AD,F51,G47,AAO,interactor_AD_F51,interactor_AD_G47
0,3559505,-0.001864,-12.3277,2.811810,-5.192830,6.021220,0.711214,-0.057379,1.302759,0.017380,...,67,1940,0.372603,2007-05-17,1,0,0,67.419178,-0.057379,-0.057379
1,5426874,0.002704,-15.3076,3.797150,-3.031950,3.952750,6.307610,1.425655,1.447955,-1.831776,...,68,1939,0.800000,2007-10-20,1,0,0,68.846575,1.425655,1.425655
2,1940961,-0.002273,-12.6428,2.248000,-4.424400,-2.467130,-2.220740,-0.190179,0.721973,-0.178153,...,63,1944,1.498630,2008-07-01,1,0,0,64.542466,-0.190179,-0.190179
3,2893229,0.000253,-14.0739,0.909499,-2.192330,2.253470,-8.586080,0.629943,0.141187,-1.066212,...,59,1948,2.060274,2009-01-22,1,0,0,61.101370,0.629943,0.629943
4,5784514,0.000933,-12.6081,2.801040,0.067058,2.482140,1.149460,0.850472,-0.584796,-0.156621,...,54,1954,1.063014,2009-01-23,1,0,0,55.098630,0.850472,0.850472
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241574,5988964,-0.000065,-13.4550,4.312710,-0.708079,-0.585742,-8.072440,0.526676,1.302759,-0.682158,...,67,1941,12.419178,,0,0,0,,0.526676,0.526676
241575,3181859,0.001537,-13.2968,5.550250,-2.876440,2.725760,12.277300,1.046598,0.867169,-0.583988,...,64,1943,7.550685,,0,0,0,,1.046598,1.046598
241576,5906295,-0.001356,-12.6670,4.923010,-0.249682,5.808390,20.456700,0.107250,0.431580,-0.193171,...,61,1945,2.342466,,0,0,0,,0.107250,0.107250
241577,5866742,-0.000623,-11.7895,4.665500,0.616765,-3.179100,-8.885110,0.345470,0.576776,0.351356,...,62,1946,13.816438,,0,0,0,,0.345470,0.345470


## AD (no APOE in PRS) and F51 Nonorganic sleep disorders (not due to a substance or known physiological condition)

In [6]:
# Binomial GLM of the AD status column (df_ad_no_apoe table) on the F51 indicator,
# the precomputed interactor_AD_F51 column and Z_score, with Z_age, GENETIC_SEX,
# TOWNSEND and Z_PC1..Z_PC5 as additional covariates.
ndd, variable = 'AD', 'F51'
model = f'{variable} and {ndd} PRS interaction (excluding APOE4)'
data = df_ad_no_apoe

this_formula = ndd + f"~ {variable} + interactor_{ndd}_{variable} + Z_score + Z_age + GENETIC_SEX + TOWNSEND + Z_PC1 +Z_PC2 +Z_PC3 +Z_PC4 +Z_PC5"
logit_glm = sm.formula.glm(formula=this_formula, family=sm.families.Binomial(), data=data)
fitted = logit_glm.fit()

# Only these three coefficients are reported in the summary table.
list_terms = [variable, f'interactor_{ndd}_{variable}', 'Z_score']
# One (lower, upper) row per coefficient; computed once and reused for every term.
ci_bounds = fitted.conf_int()

records = []
for term in list_terms:
    beta = fitted.params.loc[term]
    se = fitted.bse.loc[term]
    # The interval is left on the coefficient scale (it is not exponentiated like the OR column).
    ci_low, ci_high = ci_bounds.loc[term]
    records.append({
        'Model': model,
        'Parameter': term,
        'OR': np.exp(beta),
        'Beta': beta,
        'SE': se,
        '95% CI low': ci_low,
        '95% CI high': ci_high,
        'z': beta / se,
        'P-value': fitted.pvalues.loc[term],
    })
output3 = pd.DataFrame(records)
output3

Unnamed: 0,Model,Parameter,OR,Beta,SE,95% CI low,95% CI high,z,P-value
0,F51 and AD PRS interaction (excluding APOE4),F51,1.350771,0.300675,0.594393,-0.864314,1.465665,0.505852,0.61296
1,F51 and AD PRS interaction (excluding APOE4),interactor_AD_F51,0.676476,-0.390859,0.657008,-1.678571,0.896854,-0.594907,0.551906
2,F51 and AD PRS interaction (excluding APOE4),Z_score,2.008889,0.697582,0.657652,-0.591393,1.986557,1.060715,0.288819


## AD (no APOE in PRS) and G47 sleep disorders (includes sleep apnea and sleep related movement disorders)

In [7]:
# Binomial GLM of the AD status column (df_ad_no_apoe table) on the G47 indicator,
# the precomputed interactor_AD_G47 column and Z_score, with Z_age, GENETIC_SEX,
# TOWNSEND and Z_PC1..Z_PC5 as additional covariates.
ndd, variable = 'AD', 'G47'
model = f'{variable} and {ndd} PRS interaction (excluding APOE4)'
data = df_ad_no_apoe

this_formula = ndd + f"~ {variable} + interactor_{ndd}_{variable} + Z_score + Z_age + GENETIC_SEX + TOWNSEND + Z_PC1 +Z_PC2 +Z_PC3 +Z_PC4 +Z_PC5"
logit_glm = sm.formula.glm(formula=this_formula, family=sm.families.Binomial(), data=data)
fitted = logit_glm.fit()

# Only these three coefficients are reported in the summary table.
list_terms = [variable, f'interactor_{ndd}_{variable}', 'Z_score']
# One (lower, upper) row per coefficient; computed once and reused for every term.
ci_bounds = fitted.conf_int()

records = []
for term in list_terms:
    beta = fitted.params.loc[term]
    se = fitted.bse.loc[term]
    # The interval is left on the coefficient scale (it is not exponentiated like the OR column).
    ci_low, ci_high = ci_bounds.loc[term]
    records.append({
        'Model': model,
        'Parameter': term,
        'OR': np.exp(beta),
        'Beta': beta,
        'SE': se,
        '95% CI low': ci_low,
        '95% CI high': ci_high,
        'z': beta / se,
        'P-value': fitted.pvalues.loc[term],
    })
output4 = pd.DataFrame(records)
output4

Unnamed: 0,Model,Parameter,OR,Beta,SE,95% CI low,95% CI high,z,P-value
0,G47 and AD PRS interaction (excluding APOE4),G47,1.342378,0.294443,0.091229,0.115637,0.473248,3.227513,0.001249
1,G47 and AD PRS interaction (excluding APOE4),interactor_AD_G47,0.931453,-0.071009,0.089043,-0.245531,0.103512,-0.797469,0.425178
2,G47 and AD PRS interaction (excluding APOE4),Z_score,1.462728,0.380303,0.093845,0.19637,0.564236,4.05246,5.1e-05


# AD with APOE

In [8]:
# Load the AD table used in the "AD with APOE" section.
# The path is a plain string literal; no interpolation is needed.
df_ad_with_apoe = pd.read_csv('AD_with_APOE_interaction_analysis_april_30.csv')

## AD (includes APOE in PRS) and F51 Nonorganic sleep disorders (not due to a substance or known physiological condition)

In [9]:
# Binomial GLM of the AD status column (df_ad_with_apoe table) on the F51 indicator,
# the precomputed interactor_AD_F51 column and Z_score, with Z_age, GENETIC_SEX,
# TOWNSEND and Z_PC1..Z_PC5 as additional covariates.
ndd, variable = 'AD', 'F51'
model = f'{variable} and {ndd} PRS interaction (with APOE4)'
data = df_ad_with_apoe

this_formula = ndd + f"~ {variable} + interactor_{ndd}_{variable} + Z_score + Z_age + GENETIC_SEX + TOWNSEND + Z_PC1 +Z_PC2 +Z_PC3 +Z_PC4 +Z_PC5"
logit_glm = sm.formula.glm(formula=this_formula, family=sm.families.Binomial(), data=data)
fitted = logit_glm.fit()

# Only these three coefficients are reported in the summary table.
list_terms = [variable, f'interactor_{ndd}_{variable}', 'Z_score']
# One (lower, upper) row per coefficient; computed once and reused for every term.
ci_bounds = fitted.conf_int()

records = []
for term in list_terms:
    beta = fitted.params.loc[term]
    se = fitted.bse.loc[term]
    # The interval is left on the coefficient scale (it is not exponentiated like the OR column).
    ci_low, ci_high = ci_bounds.loc[term]
    records.append({
        'Model': model,
        'Parameter': term,
        'OR': np.exp(beta),
        'Beta': beta,
        'SE': se,
        '95% CI low': ci_low,
        '95% CI high': ci_high,
        'z': beta / se,
        'P-value': fitted.pvalues.loc[term],
    })
output5 = pd.DataFrame(records)
output5

Unnamed: 0,Model,Parameter,OR,Beta,SE,95% CI low,95% CI high,z,P-value
0,F51 and AD PRS interaction (with APOE4),F51,1.807434,0.591908,0.596347,-0.576911,1.760727,0.992557,0.320926
1,F51 and AD PRS interaction (with APOE4),interactor_AD_F51,0.445399,-0.808784,0.598273,-1.981378,0.36381,-1.351863,0.176419
2,F51 and AD PRS interaction (with APOE4),Z_score,4.846654,1.578289,0.598754,0.404752,2.751825,2.635954,0.00839


## AD (includes APOE in PRS) and G47 sleep disorders (includes sleep apnea and sleep related movement disorders)

In [10]:
# Binomial GLM of the AD status column (df_ad_with_apoe table) on the G47 indicator,
# the precomputed interactor_AD_G47 column and Z_score, with Z_age, GENETIC_SEX,
# TOWNSEND and Z_PC1..Z_PC5 as additional covariates.
ndd, variable = 'AD', 'G47'
model = f'{variable} and {ndd} PRS interaction (with APOE4)'
data = df_ad_with_apoe

this_formula = ndd + f"~ {variable} + interactor_{ndd}_{variable} + Z_score + Z_age + GENETIC_SEX + TOWNSEND + Z_PC1 +Z_PC2 +Z_PC3 +Z_PC4 +Z_PC5"
logit_glm = sm.formula.glm(formula=this_formula, family=sm.families.Binomial(), data=data)
fitted = logit_glm.fit()

# Only these three coefficients are reported in the summary table.
list_terms = [variable, f'interactor_{ndd}_{variable}', 'Z_score']
# One (lower, upper) row per coefficient; computed once and reused for every term.
ci_bounds = fitted.conf_int()

records = []
for term in list_terms:
    beta = fitted.params.loc[term]
    se = fitted.bse.loc[term]
    # The interval is left on the coefficient scale (it is not exponentiated like the OR column).
    ci_low, ci_high = ci_bounds.loc[term]
    records.append({
        'Model': model,
        'Parameter': term,
        'OR': np.exp(beta),
        'Beta': beta,
        'SE': se,
        '95% CI low': ci_low,
        '95% CI high': ci_high,
        'z': beta / se,
        'P-value': fitted.pvalues.loc[term],
    })
output6 = pd.DataFrame(records)
output6

Unnamed: 0,Model,Parameter,OR,Beta,SE,95% CI low,95% CI high,z,P-value
0,G47 and AD PRS interaction (with APOE4),G47,1.517749,0.417228,0.104249,0.212904,0.621552,4.002231,6.274789e-05
1,G47 and AD PRS interaction (with APOE4),interactor_AD_G47,0.894918,-0.111024,0.074902,-0.257829,0.035782,-1.482251,0.1382736
2,G47 and AD PRS interaction (with APOE4),Z_score,2.422058,0.884617,0.078748,0.730274,1.038961,11.233515,2.791488e-29


In [12]:
# Final output
# Stack the six per-model summary tables into one frame. Each input table is
# indexed 0..2, so ignore_index=True is used to give the combined frame unique
# row labels (0..n-1) instead of repeating 0, 1, 2 for every model.
final_output = pd.concat(
    [output1, output2, output3, output4, output5, output6],
    ignore_index=True,
)
final_output

Unnamed: 0,Model,Parameter,OR,Beta,SE,95% CI low,95% CI high,z,P-value
0,F51 and PD PRS interaction,F51,2.378919,0.866646,0.556163,-0.223413,1.956706,1.558259,0.1191718
1,F51 and PD PRS interaction,interactor_PD_F51,0.232162,-1.460319,0.421048,-2.285557,-0.63508,-3.468296,0.0005237699
2,F51 and PD PRS interaction,Z_score,5.905864,1.775946,0.422267,0.948317,2.603575,4.205737,2.602328e-05
0,G47 and PD PRS interaction,G47,1.496206,0.402932,0.091465,0.223665,0.5822,4.405339,1.056186e-05
1,G47 and PD PRS interaction,interactor_PD_G47,0.72922,-0.31578,0.091725,-0.495559,-0.136002,-3.44268,0.0005759813
2,G47 and PD PRS interaction,Z_score,1.898869,0.641258,0.097379,0.4504,0.832117,6.585214,4.542302e-11
0,F51 and AD PRS interaction (excluding APOE4),F51,1.350771,0.300675,0.594393,-0.864314,1.465665,0.505852,0.6129603
1,F51 and AD PRS interaction (excluding APOE4),interactor_AD_F51,0.676476,-0.390859,0.657008,-1.678571,0.896854,-0.594907,0.5519056
2,F51 and AD PRS interaction (excluding APOE4),Z_score,2.008889,0.697582,0.657652,-0.591393,1.986557,1.060715,0.2888193
0,G47 and AD PRS interaction (excluding APOE4),G47,1.342378,0.294443,0.091229,0.115637,0.473248,3.227513,0.001248713


In [15]:
# Write the combined results table to disk: header row included, row index omitted.
final_output.to_csv(
    'final_PRS_sleep_updated_interaction_may_2024.csv',
    header=True,
    index=False,
)

In [16]:
# Shell escape: upload the exported CSV with the `dx` CLI to the /data/interaction folder.
! dx upload final_PRS_sleep_updated_interaction_may_2024.csv --path /data/interaction/final_PRS_sleep_updated_interaction_may_2024.csv

ID                          file-GjvKvk0Jq9vbfy2XX5X6q7yf
Class                       file
Project                     project-GZBqBx8Jq9vpQ6729F24BjYX
Folder                      /data/interaction
Name                        final_PRS_sleep_updated_interaction_may_2024.csv
State                       [33mclosing[0m
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Mon May  6 14:51:16 2024
Created by                  klevine22
 via the job                job-GjvGkgQJq9vpjxfg7PQ7KQbv
Last modified               Mon May  6 14:51:17 2024
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"


# Age at Onset

In [None]:
#See what the model looks like when we switch to a linear model and use age at onset of PD or AD as the outcome

In [None]:
# Select only cases: keep the rows whose status column equals 1, separately for
# each cohort table, for use by the age-at-onset models below.
pd_cases = df_pd[df_pd['PD'] == 1]
ad_cases_no_apoe = df_ad_no_apoe[df_ad_no_apoe['AD'] == 1]
# The with-APOE cases must be taken from df_ad_with_apoe itself. Indexing
# df_ad_no_apoe with a mask built from df_ad_with_apoe would return rows (and
# score-derived columns) from the no-APOE table under the with-APOE name.
ad_cases_with_apoe = df_ad_with_apoe[df_ad_with_apoe['AD'] == 1]

## PD

In [None]:
# Age-at-onset (AAO) model fitted on pd_cases, using the F51 indicator and the
# precomputed interactor_PD_F51 column alongside the same covariates as the
# case/control models.
ndd, variable = 'PD', 'F51'

this_formula = 'AAO' + f"~ {variable} + interactor_{ndd}_{variable} + Z_score + Z_age + GENETIC_SEX + TOWNSEND + Z_PC1 +Z_PC2 +Z_PC3 +Z_PC4 +Z_PC5"
# No `family` is passed, so statsmodels' default GLM family (Gaussian) applies.
aao_glm = sm.formula.glm(formula=this_formula, data=pd_cases)
fitted = aao_glm.fit()
print(fitted.summary())

In [None]:
# Age-at-onset (AAO) model fitted on pd_cases, using the G47 indicator and the
# precomputed interactor_PD_G47 column alongside the same covariates as the
# case/control models.
ndd, variable = 'PD', 'G47'

this_formula = 'AAO' + f"~ {variable} + interactor_{ndd}_{variable} + Z_score + Z_age + GENETIC_SEX + TOWNSEND + Z_PC1 +Z_PC2 +Z_PC3 +Z_PC4 +Z_PC5"
# No `family` is passed, so statsmodels' default GLM family (Gaussian) applies.
aao_glm = sm.formula.glm(formula=this_formula, data=pd_cases)
fitted = aao_glm.fit()
print(fitted.summary())

## AD without APOE in PRS

In [None]:
# Age-at-onset (AAO) model fitted on ad_cases_no_apoe, using the F51 indicator
# and the precomputed interactor_AD_F51 column alongside the same covariates as
# the case/control models.
ndd, variable = 'AD', 'F51'

this_formula = 'AAO' + f"~ {variable} + interactor_{ndd}_{variable} + Z_score + Z_age + GENETIC_SEX + TOWNSEND + Z_PC1 +Z_PC2 +Z_PC3 +Z_PC4 +Z_PC5"
# No `family` is passed, so statsmodels' default GLM family (Gaussian) applies.
aao_glm = sm.formula.glm(formula=this_formula, data=ad_cases_no_apoe)
fitted = aao_glm.fit()
print(fitted.summary())

In [None]:
# Age-at-onset (AAO) model fitted on ad_cases_no_apoe, using the G47 indicator
# and the precomputed interactor_AD_G47 column alongside the same covariates as
# the case/control models.
ndd, variable = 'AD', 'G47'

this_formula = 'AAO' + f"~ {variable} + interactor_{ndd}_{variable} + Z_score + Z_age + GENETIC_SEX + TOWNSEND + Z_PC1 +Z_PC2 +Z_PC3 +Z_PC4 +Z_PC5"
# No `family` is passed, so statsmodels' default GLM family (Gaussian) applies.
aao_glm = sm.formula.glm(formula=this_formula, data=ad_cases_no_apoe)
fitted = aao_glm.fit()
print(fitted.summary())

## AD cases with APOE in PRS

In [None]:
# Age-at-onset (AAO) model fitted on ad_cases_with_apoe, using the F51 indicator
# and the precomputed interactor_AD_F51 column alongside the same covariates as
# the case/control models.
ndd, variable = 'AD', 'F51'

this_formula = 'AAO' + f"~ {variable} + interactor_{ndd}_{variable} + Z_score + Z_age + GENETIC_SEX + TOWNSEND + Z_PC1 +Z_PC2 +Z_PC3 +Z_PC4 +Z_PC5"
# No `family` is passed, so statsmodels' default GLM family (Gaussian) applies.
aao_glm = sm.formula.glm(formula=this_formula, data=ad_cases_with_apoe)
fitted = aao_glm.fit()
print(fitted.summary())

In [None]:
# Age-at-onset (AAO) model fitted on ad_cases_with_apoe, using the G47 indicator
# and the precomputed interactor_AD_G47 column alongside the same covariates as
# the case/control models.
ndd, variable = 'AD', 'G47'

this_formula = 'AAO' + f"~ {variable} + interactor_{ndd}_{variable} + Z_score + Z_age + GENETIC_SEX + TOWNSEND + Z_PC1 +Z_PC2 +Z_PC3 +Z_PC4 +Z_PC5"
# No `family` is passed, so statsmodels' default GLM family (Gaussian) applies.
aao_glm = sm.formula.glm(formula=this_formula, data=ad_cases_with_apoe)
fitted = aao_glm.fit()
print(fitted.summary())