# Data Prep

In [None]:
!pip install statsmodels
!pip install -U seaborn
!pip install statsmodels
!pip install lifelines
!pip install scikit-learn

In [None]:
# Imports here.
import numpy as np
import pandas as pd
import os
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import scipy
from scipy import stats
from statsmodels.stats.weightstats import ztest as ztest

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import fdrcorrection

from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

In [None]:
# ! dx download PD_interaction_analysis_AUG_21_2023.csv
# ! dx download AD_NO_APOE_interaction_analysis_AUG_21_2023.csv
# ! dx download AD_with_APOE_interaction_analysis_AUG_21_2023.csv
# ! dx download ALS_with_tenure.csv
# # ! dx download DEMENTIA_with_tenure.csv
# # ! dx download VASCULAR_with_tenure.csv

# PD

In [None]:
# Load the PD interaction-analysis table (path is relative to the working directory).
pd_table_path = 'PD_interaction_analysis_OCT_23_2023.csv'
df_pd = pd.read_csv(pd_table_path)

## PD and F51 Nonorganic sleep disorders (not due to a substance or known physiological condition)

In [None]:
# Binomial GLM of PD status on the F51 diagnosis column, the precomputed
# interactor_PD_F51 column (presumably the PRS x F51 interaction term -- confirm
# against the data-prep step), the PD PRS column, and the covariates.
ndd, variable = 'PD', 'F51'

predictors = [
    variable,
    f"interactor_{ndd}_{variable}",
    f"Z_{ndd}_PRS",
    "AGE_OF_RECRUIT",
    "GENETIC_SEX",
    "TOWNSEND",
]
this_formula = f"{ndd}~ " + " + ".join(predictors)

binomial_model = sm.formula.glm(
    formula=this_formula,
    family=sm.families.Binomial(),
    data=df_pd,
)
fitted = binomial_model.fit()
print(fitted.summary())

# P-value of the interactor term on its own.
p = fitted.pvalues.loc['interactor_PD_F51']
print(p)

## PD and G47 sleep disorders (includes sleep apnea and sleep related movement disorders)

In [None]:
# Binomial GLM of PD status on the G47 diagnosis column, the precomputed
# interactor_PD_G47 column, the PD PRS column, and the covariates.
ndd, variable = 'PD', 'G47'

predictors = [
    variable,
    f"interactor_{ndd}_{variable}",
    f"Z_{ndd}_PRS",
    "AGE_OF_RECRUIT",
    "GENETIC_SEX",
    "TOWNSEND",
]
this_formula = f"{ndd}~ " + " + ".join(predictors)

binomial_model = sm.formula.glm(
    formula=this_formula,
    family=sm.families.Binomial(),
    data=df_pd,
)
fitted = binomial_model.fit()
print(fitted.summary())

# For each term of interest print, in order:
# term, odds ratio, coefficient, standard error, CI lower, CI upper, p-value.
# conf_int() is called with its defaults (a 95% interval per the statsmodels docs);
# the bounds are exponentiated so they are on the odds-ratio scale.
list_terms = ['G47', 'interactor_PD_G47', 'Z_PD_PRS']
for term in list_terms:
    estimate = fitted.params.at[term]
    std_err = fitted.bse.at[term]
    ci_lower, ci_upper = np.exp(fitted.conf_int().loc[term])
    print(
        term,
        np.exp(estimate),
        estimate,
        std_err,
        ci_lower,
        ci_upper,
        fitted.pvalues.at[term],
    )

# AD without APOE

In [None]:
# Load the AD interaction-analysis table whose file name is tagged NO_APOE.
ad_no_apoe_path = 'AD_NO_APOE_interaction_analysis_OCT_23_2023.csv'
df_ad_no_apoe = pd.read_csv(ad_no_apoe_path)

## AD (no APOE in PRS) and F51 Nonorganic sleep disorders (not due to a substance or known physiological condition)

In [None]:
# Binomial GLM of AD status on F51, the precomputed interactor_AD_F51 column,
# the AD PRS column, and the covariates, fitted on the NO_APOE table.
ndd, variable = 'AD', 'F51'

predictors = [
    variable,
    f"interactor_{ndd}_{variable}",
    f"Z_{ndd}_PRS",
    "AGE_OF_RECRUIT",
    "GENETIC_SEX",
    "TOWNSEND",
]
this_formula = f"{ndd}~ " + " + ".join(predictors)

binomial_model = sm.formula.glm(
    formula=this_formula,
    family=sm.families.Binomial(),
    data=df_ad_no_apoe,
)
fitted = binomial_model.fit()
print(fitted.summary())

## AD (no APOE in PRS) and G47 sleep disorders (includes sleep apnea and sleep related movement disorders)

In [None]:
# Binomial GLM of AD status on G47, the precomputed interactor_AD_G47 column,
# the AD PRS column, and the covariates, fitted on the NO_APOE table.
ndd, variable = 'AD', 'G47'

predictors = [
    variable,
    f"interactor_{ndd}_{variable}",
    f"Z_{ndd}_PRS",
    "AGE_OF_RECRUIT",
    "GENETIC_SEX",
    "TOWNSEND",
]
this_formula = f"{ndd}~ " + " + ".join(predictors)

binomial_model = sm.formula.glm(
    formula=this_formula,
    family=sm.families.Binomial(),
    data=df_ad_no_apoe,
)
fitted = binomial_model.fit()
print(fitted.summary())

In [None]:
# Print the p-value of each term of interest, one per line.
# Reads `fitted` as left by the most recently executed model cell.
list_terms = ['G47', 'interactor_AD_G47', 'Z_AD_PRS']
term_pvalues = fitted.pvalues
for i in list_terms:
    p = term_pvalues.at[i]
    print(p)

# AD with APOE

In [None]:
# Load the AD interaction-analysis table whose file name is tagged with_APOE.
ad_with_apoe_path = 'AD_with_APOE_interaction_analysis_OCT_23_2023.csv'
df_ad_with_apoe = pd.read_csv(ad_with_apoe_path)

## AD (includes APOE in PRS) and F51 Nonorganic sleep disorders (not due to a substance or known physiological condition)

In [None]:
# Binomial GLM of AD status on F51, the precomputed interactor_AD_F51 column,
# the AD PRS column, and the covariates, fitted on the with_APOE table.
ndd, variable = 'AD', 'F51'

this_formula = (
    f"{ndd}~ {variable} + interactor_{ndd}_{variable} + Z_{ndd}_PRS"
    " + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
)

binomial_model = sm.formula.glm(
    formula=this_formula,
    family=sm.families.Binomial(),
    data=df_ad_with_apoe,
)
fitted = binomial_model.fit()
print(fitted.summary())

## AD (includes APOE in PRS) and G47 sleep disorders (includes sleep apnea and sleep related movement disorders)

In [None]:
# Binomial GLM of AD status on G47, the precomputed interactor_AD_G47 column,
# the AD PRS column, and the covariates, fitted on the with_APOE table.
ndd, variable = 'AD', 'G47'

this_formula = (
    f"{ndd}~ {variable} + interactor_{ndd}_{variable} + Z_{ndd}_PRS"
    " + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
)

binomial_model = sm.formula.glm(
    formula=this_formula,
    family=sm.families.Binomial(),
    data=df_ad_with_apoe,
)
fitted = binomial_model.fit()
print(fitted.summary())

# Print the p-value of each term of interest, one per line.
list_terms = ['G47', 'interactor_AD_G47', 'Z_AD_PRS']
term_pvalues = fitted.pvalues
for i in list_terms:
    p = term_pvalues.at[i]
    print(p)

# Age at Onset

In [None]:
# See what the model looks like when we switch to a linear model and use age at onset (AAO) of PD or AD as the outcome.

In [None]:
# Select only cases (rows whose disease indicator equals 1) from each table.
# Each case table is filtered from its own source frame, so the columns it
# carries (including the PRS) come from the dataset named in the variable.
pd_cases = df_pd[df_pd['PD'] == 1]
ad_cases_no_apoe = df_ad_no_apoe[df_ad_no_apoe['AD'] == 1]
# Fixed: this previously took rows from df_ad_no_apoe using a mask built from
# df_ad_with_apoe, so the "with APOE" case table actually held no-APOE data.
ad_cases_with_apoe = df_ad_with_apoe[df_ad_with_apoe['AD'] == 1]

## PD

In [None]:
# Age-at-onset (AAO) model among PD cases with F51 as the exposure.
# No family is passed to glm, so statsmodels' default GLM family applies.
ndd, variable = 'PD', 'F51'

this_formula = (
    f"AAO~ {variable} + interactor_{ndd}_{variable} + Z_{ndd}_PRS"
    " + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
)

aao_model = sm.formula.glm(formula=this_formula, data=pd_cases)
fitted = aao_model.fit()
print(fitted.summary())

In [None]:
# Age-at-onset (AAO) model among PD cases with G47 as the exposure.
# No family is passed to glm, so statsmodels' default GLM family applies.
ndd, variable = 'PD', 'G47'

this_formula = (
    f"AAO~ {variable} + interactor_{ndd}_{variable} + Z_{ndd}_PRS"
    " + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
)

aao_model = sm.formula.glm(formula=this_formula, data=pd_cases)
fitted = aao_model.fit()
print(fitted.summary())

## AD without APOE in PRS

In [None]:
# Age-at-onset (AAO) model among AD cases from the NO_APOE table, F51 exposure.
# No family is passed to glm, so statsmodels' default GLM family applies.
ndd, variable = 'AD', 'F51'

this_formula = (
    f"AAO~ {variable} + interactor_{ndd}_{variable} + Z_{ndd}_PRS"
    " + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
)

aao_model = sm.formula.glm(formula=this_formula, data=ad_cases_no_apoe)
fitted = aao_model.fit()
print(fitted.summary())

In [None]:
# Age-at-onset (AAO) model among AD cases from the NO_APOE table, G47 exposure.
# No family is passed to glm, so statsmodels' default GLM family applies.
ndd, variable = 'AD', 'G47'

this_formula = (
    f"AAO~ {variable} + interactor_{ndd}_{variable} + Z_{ndd}_PRS"
    " + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
)

aao_model = sm.formula.glm(formula=this_formula, data=ad_cases_no_apoe)
fitted = aao_model.fit()
print(fitted.summary())

## AD cases with APOE in PRS

In [None]:
# Age-at-onset (AAO) model fitted on ad_cases_with_apoe, F51 exposure.
# No family is passed to glm, so statsmodels' default GLM family applies.
ndd, variable = 'AD', 'F51'

this_formula = (
    f"AAO~ {variable} + interactor_{ndd}_{variable} + Z_{ndd}_PRS"
    " + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
)

aao_model = sm.formula.glm(formula=this_formula, data=ad_cases_with_apoe)
fitted = aao_model.fit()
print(fitted.summary())

In [None]:
# Age-at-onset (AAO) model fitted on ad_cases_with_apoe, G47 exposure.
# No family is passed to glm, so statsmodels' default GLM family applies.
ndd, variable = 'AD', 'G47'

this_formula = (
    f"AAO~ {variable} + interactor_{ndd}_{variable} + Z_{ndd}_PRS"
    " + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
)

aao_model = sm.formula.glm(formula=this_formula, data=ad_cases_with_apoe)
fitted = aao_model.fit()
print(fitted.summary())

# AD -- three groups of PRS -- with APOE

In [None]:
# Split the AD (with APOE) table into three PRS groups, using +/- 1 standard
# deviation of the Z_AD_PRS column as the cut points, and report each group.
ndd = 'AD'
# Read the same OCT_23_2023 export that the "AD with APOE" interaction models
# load; this cell previously read the older AUG_21_2023 file, so the stratified
# models were fitted on a different data version than the interaction models.
df = pd.read_csv('AD_with_APOE_interaction_analysis_OCT_23_2023.csv')
print(ndd)

prs_col = f'Z_{ndd}_PRS'

#Find 1 standard deviation
prs_list = list(df[prs_col])
a = np.std(prs_list)
print("1 SD:", a)
print('\t')

# low:  PRS < -1 SD
# mid: -1 SD <= PRS <= +1 SD (both boundaries included)
# high: PRS > +1 SD
low_prs = df[df[prs_col] < -a]
mid_prs = df[(df[prs_col] >= -a) & (df[prs_col] <= a)]
high_prs = df[df[prs_col] > a]

# Report the PRS range and sample count of every group; a tab line separates
# consecutive groups (none is printed after the last one).
prs_groups = {'low_prs': low_prs, 'mid_prs': mid_prs, 'high_prs': high_prs}
for position, (label, group) in enumerate(prs_groups.items()):
    if position > 0:
        print('\t')
    print(label)
    print(group[prs_col].min())
    print(group[prs_col].max())
    print("Number of samples:", len(group))

In [None]:
#LOW
# Binomial GLM of AD status on G47 plus covariates within the low-PRS group.
# No PRS or interactor term is included in this formula.
ndd, variable = 'AD', 'G47'

covariates = "AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
this_formula = f"{ndd}~ {variable} + {covariates}"

stratum_model = sm.formula.glm(
    formula=this_formula,
    family=sm.families.Binomial(),
    data=low_prs,
)
fitted = stratum_model.fit()
print(fitted.summary())

In [None]:
#MID
# Binomial GLM of AD status on G47 plus covariates within the mid-PRS group.
# No PRS or interactor term is included in this formula.
ndd, variable = 'AD', 'G47'

covariates = "AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
this_formula = f"{ndd}~ {variable} + {covariates}"

stratum_model = sm.formula.glm(
    formula=this_formula,
    family=sm.families.Binomial(),
    data=mid_prs,
)
fitted = stratum_model.fit()
print(fitted.summary())

In [None]:
#HIGH
# Binomial GLM of AD status on G47 plus covariates within the high-PRS group.
# No PRS or interactor term is included in this formula.
ndd, variable = 'AD', 'G47'

covariates = "AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
this_formula = f"{ndd}~ {variable} + {covariates}"

stratum_model = sm.formula.glm(
    formula=this_formula,
    family=sm.families.Binomial(),
    data=high_prs,
)
fitted = stratum_model.fit()
print(fitted.summary())

In [None]:
# Binomial GLM of AD status on F51 plus covariates within the low-PRS group.
# No PRS or interactor term is included in this formula.
ndd, variable = 'AD', 'F51'

covariates = "AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
this_formula = f"{ndd}~ {variable} + {covariates}"

stratum_model = sm.formula.glm(
    formula=this_formula,
    family=sm.families.Binomial(),
    data=low_prs,
)
fitted = stratum_model.fit()
print(fitted.summary())

In [None]:
# Binomial GLM of AD status on F51 plus covariates within the mid-PRS group.
# No PRS or interactor term is included in this formula.
ndd, variable = 'AD', 'F51'

covariates = "AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
this_formula = f"{ndd}~ {variable} + {covariates}"

stratum_model = sm.formula.glm(
    formula=this_formula,
    family=sm.families.Binomial(),
    data=mid_prs,
)
fitted = stratum_model.fit()
print(fitted.summary())

In [None]:
# Binomial GLM of AD status on F51 plus covariates within the high-PRS group.
# No PRS or interactor term is included in this formula.
ndd, variable = 'AD', 'F51'

covariates = "AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
this_formula = f"{ndd}~ {variable} + {covariates}"

stratum_model = sm.formula.glm(
    formula=this_formula,
    family=sm.families.Binomial(),
    data=high_prs,
)
fitted = stratum_model.fit()
print(fitted.summary())

In [None]:
# Inspect the rows of the low-PRS group whose AD indicator equals 1:
# print how many there are, then display the rows as the cell output.
is_ad_case = low_prs['AD'].eq(1)
test = low_prs.loc[is_ad_case]
# Optional further restriction to rows with F51 == 1 (left disabled):
#test = test[test['F51'] == 1]
print(test.shape[0])
test