# Data Prep

In [None]:
!pip install statsmodels
!pip install -U seaborn
!pip install statsmodels
!pip install lifelines
!pip install scikit-learn

In [None]:
# Optional one-off step, left disabled: uncomment to recursively download the
# `data/files_for_cox` folder. The cells below read their CSVs from a local
# `files_for_cox/` directory, so this must have been run at least once.
# NOTE(review): `dx` is presumably the DNAnexus CLI -- confirm it is available here.
#!dx download -r 'data/files_for_cox'

In [None]:
# Imports here.
import numpy as np
import pandas as pd
import os
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import scipy
from scipy import stats
from statsmodels.stats.weightstats import ztest as ztest

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import fdrcorrection

from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

In [None]:
# Sanity check: load the AD cohort file and list its column names so the
# columns referenced further down can be verified by eye.
test = pd.read_csv('files_for_cox/AD_with_tenure_lags_45.csv')
test.columns

# Kaplan-Meier -- codes that were sig and replicated in Cox model

In [None]:
# Build one long table (`final_df`) with a row per participant per condition.
# For every condition file we keep the outcome flag, the three diagnosis-code
# columns under short names, and the follow-up time capped at MAX_TENURE.
ndd_list = ['AD', 'DEM', 'PD', 'VAS']
MAX_TENURE = 15  # follow-up longer than this is recorded as exactly this value

cohort_frames = []
for ndd in ndd_list:
    df = pd.read_csv(f'files_for_cox/{ndd}_with_tenure_lags_45.csv')

    # Build the output columns directly instead of assigning onto filtered
    # slices (which triggers SettingWithCopyWarning). `clip` replaces the
    # former high/low split: values above the cap become the cap, the rest
    # are left untouched.
    cohort = pd.DataFrame({
        'condition': ndd,
        'NDD': df[ndd],
        'F51': df['QC0_F51_DATE'],
        'G47': df['QC0_G47_DATE'],
        'G47.3': df['QC0_G47.3_DATE'],
        'TENURE': df['tenure'].clip(upper=MAX_TENURE),
    })

    # Rows with a missing tenure matched neither `> 15` nor `<= 15` in the
    # previous implementation and were silently dropped; keep that behaviour.
    cohort_frames.append(cohort[cohort['TENURE'].notna()])

# Concatenate once at the end rather than growing the frame on every pass.
# Rows keep the order of their source file and their original index labels.
final_df = pd.concat(cohort_frames)

In [None]:
# Spell out the abbreviated condition labels. Note that `replace` scans every
# column of the frame, not only `condition`.
final_df = (
    final_df
    .replace('VAS', 'VASCULAR')
    .replace('DEM', 'DEMENTIA')
)
final_df

In [None]:
# Parallel lists: position i of `ndd_list` (condition) is paired with position
# i of `sig_list` (diagnosis code) when plotting.
# Full set of pairs, kept for reference:
#ndd_list = ['AD', 'DEMENTIA', 'DEMENTIA', 'DEMENTIA', 'PD', 'VASCULAR']
#sig_list = ['G47', 'F51', 'G47', 'G47.3', 'G47', 'G47.3']
# Currently only the PD / G47 pair is plotted. This rebinds `ndd_list`, which
# the loading cell above also defines.
ndd_list, sig_list = ['PD'], ['G47']

In [None]:
# Draw one Kaplan-Meier figure per (condition, diagnosis code) pair, overlaying
# the curve for participants without the code and the curve for those with it.
if len(ndd_list) != len(sig_list):
    raise ValueError("ndd_list and sig_list must have the same length")

for ndd, code in zip(ndd_list, sig_list):
    cohort = final_df[final_df['condition'] == ndd]

    durations = cohort['TENURE']  # follow-up time passed to the fitter
    observed = cohort['NDD']      # event indicator passed to the fitter
    without_code = cohort[code] == 0
    with_code = cohort[code] == 1

    # Explicit figure/axes so both curves are guaranteed to share one plot.
    fig, ax = plt.subplots()
    kmf = KaplanMeierFitter()

    kmf.fit(durations[without_code], observed[without_code], label='no ' + code)
    kmf.plot_survival_function(ax=ax)
    kmf.fit(durations[with_code], observed[with_code], label='Had ' + code)
    kmf.plot_survival_function(ax=ax)

    ax.set_xlim(0, 16)
    ax.set_title(f"{ndd} and " + code)
    ax.set_xlabel('Years in study')
    # NOTE(review): the y-axis is the estimated survival function plotted by
    # lifelines; confirm "Percentage" is the intended wording for its scale.
    ax.set_ylabel(f'Percentage of individuals without {ndd}')
    plt.show()

# PD -- three groups of PRS -- this was not used in the paper

In [None]:
# Split the PD interaction dataset into low / mid / high polygenic-risk strata
# using cut-offs at minus and plus one standard deviation of the Z_PD_PRS column.
ndd = 'PD'
df = pd.read_csv('PD_interaction_analysis_AUG_21_2023.csv')
print(ndd)

prs_col = f'Z_{ndd}_PRS'
prs = df[prs_col]

# Population standard deviation (numpy default, ddof=0) of the raw values.
one_sd = np.std(prs.to_numpy())
print("1 SD:", one_sd)
print('\t')

# low: strictly below -1 SD; mid: within [-1 SD, +1 SD]; high: strictly above +1 SD.
low_prs = df[prs < -one_sd]
mid_prs = df[(prs <= one_sd) & (prs >= -one_sd)]
high_prs = df[prs > one_sd]

# Report the observed score range and size of each stratum.
strata = [('low_prs', low_prs), ('mid_prs', mid_prs), ('high_prs', high_prs)]
for position, (label, stratum) in enumerate(strata):
    if position:
        print('\t')
    print(label)
    print(stratum[prs_col].min())
    print(stratum[prs_col].max())
    print("Number of samples:", len(stratum))

In [None]:
# Spot check: rows in the low-PRS stratum that have both PD == 1 and G47 == 1.
is_case_with_code = (low_prs['PD'] == 1) & (low_prs['G47'] == 1)
test = low_prs[is_case_with_code]
print(len(test))
test

In [None]:
# LOW PRS stratum: binomial GLM of PD on G47 with the AGE_OF_RECRUIT,
# GENETIC_SEX and TOWNSEND covariates. `low_prs` is whatever the most recently
# run stratification cell produced.
ndd = 'PD'
variable = 'G47'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=low_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())

In [None]:
# MID PRS stratum: binomial GLM of PD on G47 with the AGE_OF_RECRUIT,
# GENETIC_SEX and TOWNSEND covariates. `mid_prs` is whatever the most recently
# run stratification cell produced.
ndd = 'PD'
variable = 'G47'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=mid_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())

In [None]:
# HIGH PRS stratum: binomial GLM of PD on G47 with the AGE_OF_RECRUIT,
# GENETIC_SEX and TOWNSEND covariates. `high_prs` is whatever the most recently
# run stratification cell produced.
ndd = 'PD'
variable = 'G47'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=high_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())

In [None]:
# LOW PRS stratum: binomial GLM of PD on F51 with the AGE_OF_RECRUIT,
# GENETIC_SEX and TOWNSEND covariates. `low_prs` is whatever the most recently
# run stratification cell produced.
ndd = 'PD'
variable = 'F51'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=low_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())

In [None]:
# MID PRS stratum: binomial GLM of PD on F51 with the AGE_OF_RECRUIT,
# GENETIC_SEX and TOWNSEND covariates. `mid_prs` is whatever the most recently
# run stratification cell produced.
ndd = 'PD'
variable = 'F51'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=mid_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())

In [None]:
# HIGH PRS stratum: binomial GLM of PD on F51 with the AGE_OF_RECRUIT,
# GENETIC_SEX and TOWNSEND covariates. `high_prs` is whatever the most recently
# run stratification cell produced.
ndd = 'PD'
variable = 'F51'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=high_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())

# AD -- three groups of PRS -- no APOE

In [None]:
# Split the AD (no APOE) interaction dataset into low / mid / high
# polygenic-risk strata using cut-offs at minus and plus one standard deviation
# of the Z_AD_PRS column. This rebinds df, low_prs, mid_prs and high_prs.
ndd = 'AD'
df = pd.read_csv('AD_NO_APOE_interaction_analysis_AUG_21_2023.csv')
print(ndd)

prs_col = f'Z_{ndd}_PRS'
prs = df[prs_col]

# Population standard deviation (numpy default, ddof=0) of the raw values.
one_sd = np.std(prs.to_numpy())
print("1 SD:", one_sd)
print('\t')

# low: strictly below -1 SD; mid: within [-1 SD, +1 SD]; high: strictly above +1 SD.
low_prs = df[prs < -one_sd]
mid_prs = df[(prs <= one_sd) & (prs >= -one_sd)]
high_prs = df[prs > one_sd]

# Report the observed score range and size of each stratum.
strata = [('low_prs', low_prs), ('mid_prs', mid_prs), ('high_prs', high_prs)]
for position, (label, stratum) in enumerate(strata):
    if position:
        print('\t')
    print(label)
    print(stratum[prs_col].min())
    print(stratum[prs_col].max())
    print("Number of samples:", len(stratum))

In [None]:
# Spot check: rows in the low-PRS stratum that have both AD == 1 and F51 == 1.
is_case_with_code = (low_prs['AD'] == 1) & (low_prs['F51'] == 1)
test = low_prs[is_case_with_code]
print(len(test))
test

In [None]:
# LOW PRS stratum: binomial GLM of AD on G47 with the AGE_OF_RECRUIT,
# GENETIC_SEX and TOWNSEND covariates. `low_prs` is whatever the most recently
# run stratification cell produced.
ndd = 'AD'
variable = 'G47'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=low_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())

In [None]:
# MID PRS stratum: binomial GLM of AD on G47 with the AGE_OF_RECRUIT,
# GENETIC_SEX and TOWNSEND covariates. `mid_prs` is whatever the most recently
# run stratification cell produced.
ndd = 'AD'
variable = 'G47'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=mid_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())

In [None]:
# HIGH PRS stratum: binomial GLM of AD on G47 with the AGE_OF_RECRUIT,
# GENETIC_SEX and TOWNSEND covariates. `high_prs` is whatever the most recently
# run stratification cell produced.
ndd = 'AD'
variable = 'G47'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=high_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())

In [None]:
# LOW PRS stratum: binomial GLM of AD on F51 with the AGE_OF_RECRUIT,
# GENETIC_SEX and TOWNSEND covariates. `low_prs` is whatever the most recently
# run stratification cell produced.
ndd = 'AD'
variable = 'F51'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=low_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())

In [None]:
# MID PRS stratum: binomial GLM of AD on F51 with the AGE_OF_RECRUIT,
# GENETIC_SEX and TOWNSEND covariates. `mid_prs` is whatever the most recently
# run stratification cell produced.
ndd = 'AD'
variable = 'F51'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=mid_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())

In [None]:
# HIGH PRS stratum: binomial GLM of AD on F51 with the AGE_OF_RECRUIT,
# GENETIC_SEX and TOWNSEND covariates. `high_prs` is whatever the most recently
# run stratification cell produced.
ndd = 'AD'
variable = 'F51'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=high_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())

# AD -- three groups of PRS -- with APOE

In [None]:
# Split the AD (with APOE) interaction dataset into low / mid / high
# polygenic-risk strata using cut-offs at minus and plus one standard deviation
# of the Z_AD_PRS column. This rebinds df, low_prs, mid_prs and high_prs.
ndd = 'AD'
df = pd.read_csv('AD_with_APOE_interaction_analysis_AUG_21_2023.csv')
print(ndd)

prs_col = f'Z_{ndd}_PRS'
prs = df[prs_col]

# Population standard deviation (numpy default, ddof=0) of the raw values.
one_sd = np.std(prs.to_numpy())
print("1 SD:", one_sd)
print('\t')

# low: strictly below -1 SD; mid: within [-1 SD, +1 SD]; high: strictly above +1 SD.
low_prs = df[prs < -one_sd]
mid_prs = df[(prs <= one_sd) & (prs >= -one_sd)]
high_prs = df[prs > one_sd]

# Report the observed score range and size of each stratum.
strata = [('low_prs', low_prs), ('mid_prs', mid_prs), ('high_prs', high_prs)]
for position, (label, stratum) in enumerate(strata):
    if position:
        print('\t')
    print(label)
    print(stratum[prs_col].min())
    print(stratum[prs_col].max())
    print("Number of samples:", len(stratum))

In [None]:
# LOW PRS stratum: binomial GLM of AD on G47 with the AGE_OF_RECRUIT,
# GENETIC_SEX and TOWNSEND covariates. `low_prs` is whatever the most recently
# run stratification cell produced.
ndd = 'AD'
variable = 'G47'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=low_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())

In [None]:
# MID PRS stratum: binomial GLM of AD on G47 with the AGE_OF_RECRUIT,
# GENETIC_SEX and TOWNSEND covariates. `mid_prs` is whatever the most recently
# run stratification cell produced.
ndd = 'AD'
variable = 'G47'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=mid_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())

In [None]:
# HIGH PRS stratum: binomial GLM of AD on G47 with the AGE_OF_RECRUIT,
# GENETIC_SEX and TOWNSEND covariates. `high_prs` is whatever the most recently
# run stratification cell produced.
ndd = 'AD'
variable = 'G47'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=high_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())

In [None]:
# LOW PRS stratum (fitted on `low_prs`): binomial GLM of AD on F51 with the
# AGE_OF_RECRUIT, GENETIC_SEX and TOWNSEND covariates. `low_prs` is whatever
# the most recently run stratification cell produced.
ndd = 'AD'
variable = 'F51'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=low_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())

In [None]:
# MID PRS stratum (fitted on `mid_prs`): binomial GLM of AD on F51 with the
# AGE_OF_RECRUIT, GENETIC_SEX and TOWNSEND covariates. `mid_prs` is whatever
# the most recently run stratification cell produced.
ndd = 'AD'
variable = 'F51'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=mid_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())

In [None]:
# HIGH PRS stratum (fitted on `high_prs`): binomial GLM of AD on F51 with the
# AGE_OF_RECRUIT, GENETIC_SEX and TOWNSEND covariates. `high_prs` is whatever
# the most recently run stratification cell produced.
ndd = 'AD'
variable = 'F51'

this_formula = f"{ndd}~ {variable} + AGE_OF_RECRUIT + GENETIC_SEX + TOWNSEND"
model = smf.glm(formula=this_formula, data=high_prs, family=sm.families.Binomial())
fitted = model.fit()
print(fitted.summary())