# Data Prep

In [1]:
# !pip install statsmodels
# !pip install -U seaborn
# !pip install lifelines
# !pip install scikit-learn

In [2]:
# Imports here.
import numpy as np
import pandas as pd
import os
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import scipy
from scipy import stats
from statsmodels.stats.weightstats import ztest as ztest

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import fdrcorrection

In [3]:
#!dx download -r 'data/files_for_cox'
#!dx download -r 'data/PRS'

# Prep PD

In [4]:
ndd = 'PD'

# Phenotype table: read the Cox input file, keep only the columns needed with
# right censoring, and shorten the QC0_* column names to F51 / G47.
df = (
    pd.read_csv(f'files_for_cox/{ndd}_with_tenure_lags_45.csv')
    .loc[:, ['ID', 'GENETIC_SEX', 'TOWNSEND', "AGE_OF_RECRUIT", 'BIRTH_YEAR',
             'tenure', f'{ndd}_DATE', ndd, 'QC0_F51_DATE', 'QC0_G47_DATE']]
    .rename(columns={'QC0_F51_DATE': 'F51', 'QC0_G47_DATE': 'G47'})
)

# One row per ID: after sorting on the date column, keep='first' retains the row
# that sorts first for each ID (rows with a missing date sort last).
df = df.sort_values(by=f'{ndd}_DATE').drop_duplicates(subset='ID', keep='first')

# AAO: days between the {ndd}_DATE value and 1 January of BIRTH_YEAR, divided by 365.
# Rows without a date end up with a missing AAO.
onset = pd.to_datetime(df[f'{ndd}_DATE'])
born = pd.to_datetime(df['BIRTH_YEAR'], format='%Y')
df['AAO'] = (onset - born).dt.days / 365

# PRS table: score, raw PCs and their Z-scored versions, one row per ID.
prs = (
    pd.read_csv(f'{ndd}_with_Z_score_for_graphs_april_30.csv')
    .loc[:, ['ID', 'SCORE', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'Z_score',
             'Z_age', 'Z_PC1', 'Z_PC2', 'Z_PC3', 'Z_PC4', 'Z_PC5']]
    .drop_duplicates(subset='ID', keep='first')
)

# Keep only IDs present in both tables; PRS columns come first in the result.
df = prs.merge(df, on='ID', how='inner')

# Report whether any row has a Z_score of exactly 0 (informational only; nothing is dropped).
test = df.loc[df['Z_score'] == 0]
if not test.empty:
    print("There is a PRS score with a value of 0")
else:
    print("No PRS scores with a value of 0")

# Interaction variables: Z_score scaled by (column value + 1) for F51 and G47.
for code in ('F51', 'G47'):
    df[f'interactor_{ndd}_{code}'] = df['Z_score'] * (df[code] + 1)

df

No PRS scores with a value of 0


Unnamed: 0,ID,SCORE,PC1,PC2,PC3,PC4,PC5,Z_score,Z_age,Z_PC1,...,AGE_OF_RECRUIT,BIRTH_YEAR,tenure,PD_DATE,PD,F51,G47,AAO,interactor_PD_F51,interactor_PD_G47
0,5986755,-0.005752,-13.3959,5.89336,-2.398670,-0.052244,0.017704,0.818861,0.292830,-0.646379,...,60,1945,0.498630,2005-07-02,1,0,0,60.539726,0.818861,0.818861
1,5091212,-0.012065,-10.3817,3.22082,-0.419226,3.640730,-6.380780,-1.125486,1.310861,1.223686,...,67,1939,0.391781,2006-05-24,1,0,0,67.438356,-1.125486,-1.125486
2,5164402,-0.006186,-11.7090,2.20405,-2.077410,3.828630,-5.312990,0.685058,0.729129,0.400205,...,63,1943,0.498630,2006-07-02,1,0,0,63.542466,0.685058,0.685058
3,1942953,-0.011824,-13.5989,2.63708,-5.808340,9.143290,11.225900,-1.051476,-0.288902,-0.772324,...,56,1951,0.191781,2007-03-12,1,0,0,56.230137,-1.051476,-1.051476
4,2548358,-0.004134,-12.9616,4.96654,-2.865700,3.600200,2.566680,1.317089,1.456294,-0.376931,...,68,1939,0.315068,2007-04-26,1,0,0,68.361644,1.317089,1.317089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240905,5988964,-0.008323,-13.4550,4.31271,-0.708079,-0.585742,-8.072440,0.026795,1.310861,-0.683045,...,67,1941,12.419178,,0,0,0,,0.026795,0.026795
240906,3181859,-0.008959,-13.2968,5.55025,-2.876440,2.725760,12.277300,-0.169099,0.874562,-0.584895,...,64,1943,7.550685,,0,0,0,,-0.169099,-0.169099
240907,5906295,-0.003869,-12.6670,4.92301,-0.249682,5.808390,20.456700,1.398833,0.438263,-0.194156,...,61,1945,2.342466,,0,0,0,,1.398833,1.398833
240908,5866742,-0.005556,-11.7895,4.66550,0.616765,-3.179100,-8.885110,0.878974,0.583696,0.350261,...,62,1946,13.816438,,0,0,0,,0.878974,0.878974


In [5]:
# Count of rows per value of the PD column in the merged table.
df['PD'].value_counts()

0    238024
1      2886
Name: PD, dtype: int64

In [6]:
# Write the merged table to a local CSV with a header row and without the index column.
df.to_csv(f'{ndd}_interaction_analysis_april_30.csv', index=False)

In [8]:
! dx upload {ndd}_interaction_analysis_april_30.csv  --path /data/interaction/{ndd}_interaction_analysis_april_30.csv

ID                          file-GjvK17jJq9vz8QYFv5JKk8g1
Class                       file
Project                     project-GZBqBx8Jq9vpQ6729F24BjYX
Folder                      /data/interaction
Name                        PD_interaction_analysis_april_30.csv
State                       [33mclosing[0m
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Mon May  6 13:52:31 2024
Created by                  klevine22
 via the job                job-GjvGkgQJq9vpjxfg7PQ7KQbv
Last modified               Mon May  6 13:52:32 2024
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"


# Prep AD without APOE

In [9]:
ndd = 'AD'

# Phenotype table: read the Cox input file, keep only the columns needed with
# right censoring, and shorten the QC0_* column names to F51 / G47.
df = (
    pd.read_csv(f'files_for_cox/{ndd}_with_tenure_lags_45.csv')
    .loc[:, ['ID', 'GENETIC_SEX', 'TOWNSEND', "AGE_OF_RECRUIT", 'BIRTH_YEAR',
             'tenure', f'{ndd}_DATE', ndd, 'QC0_F51_DATE', 'QC0_G47_DATE']]
    .rename(columns={'QC0_F51_DATE': 'F51', 'QC0_G47_DATE': 'G47'})
)

# One row per ID: after sorting on the date column, keep='first' retains the row
# that sorts first for each ID (rows with a missing date sort last).
df = df.sort_values(by=f'{ndd}_DATE').drop_duplicates(subset='ID', keep='first')

# AAO: days between the {ndd}_DATE value and 1 January of BIRTH_YEAR, divided by 365.
# Rows without a date end up with a missing AAO.
onset = pd.to_datetime(df[f'{ndd}_DATE'])
born = pd.to_datetime(df['BIRTH_YEAR'], format='%Y')
df['AAO'] = (onset - born).dt.days / 365

# PRS table from the NO_APOE score file: score, raw PCs and their Z-scored
# versions, one row per ID.
prs = (
    pd.read_csv(f'{ndd}_NO_APOE_with_Z_score_for_graphs_april_30.csv')
    .loc[:, ['ID', 'SCORE', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'Z_score',
             'Z_age', 'Z_PC1', 'Z_PC2', 'Z_PC3', 'Z_PC4', 'Z_PC5']]
    .drop_duplicates(subset='ID', keep='first')
)

# Keep only IDs present in both tables; PRS columns come first in the result.
df = prs.merge(df, on='ID', how='inner')

# Report whether any row has a Z_score of exactly 0 (informational only; nothing is dropped).
test = df.loc[df['Z_score'] == 0]
if not test.empty:
    print("There is a PRS score with a value of 0")
else:
    print("No PRS scores with a value of 0")

# Interaction variables: Z_score scaled by (column value + 1) for F51 and G47.
for code in ('F51', 'G47'):
    df[f'interactor_{ndd}_{code}'] = df['Z_score'] * (df[code] + 1)

df

No PRS scores with a value of 0


Unnamed: 0,ID,SCORE,PC1,PC2,PC3,PC4,PC5,Z_score,Z_age,Z_PC1,...,AGE_OF_RECRUIT,BIRTH_YEAR,tenure,AD_DATE,AD,F51,G47,AAO,interactor_AD_F51,interactor_AD_G47
0,3559505,-0.001864,-12.3277,2.811810,-5.192830,6.021220,0.711214,-0.057379,1.302759,0.017380,...,67,1940,0.372603,2007-05-17,1,0,0,67.419178,-0.057379,-0.057379
1,5426874,0.002704,-15.3076,3.797150,-3.031950,3.952750,6.307610,1.425655,1.447955,-1.831776,...,68,1939,0.800000,2007-10-20,1,0,0,68.846575,1.425655,1.425655
2,1940961,-0.002273,-12.6428,2.248000,-4.424400,-2.467130,-2.220740,-0.190179,0.721973,-0.178153,...,63,1944,1.498630,2008-07-01,1,0,0,64.542466,-0.190179,-0.190179
3,2893229,0.000253,-14.0739,0.909499,-2.192330,2.253470,-8.586080,0.629943,0.141187,-1.066212,...,59,1948,2.060274,2009-01-22,1,0,0,61.101370,0.629943,0.629943
4,5784514,0.000933,-12.6081,2.801040,0.067058,2.482140,1.149460,0.850472,-0.584796,-0.156621,...,54,1954,1.063014,2009-01-23,1,0,0,55.098630,0.850472,0.850472
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241574,5988964,-0.000065,-13.4550,4.312710,-0.708079,-0.585742,-8.072440,0.526676,1.302759,-0.682158,...,67,1941,12.419178,,0,0,0,,0.526676,0.526676
241575,3181859,0.001537,-13.2968,5.550250,-2.876440,2.725760,12.277300,1.046598,0.867169,-0.583988,...,64,1943,7.550685,,0,0,0,,1.046598,1.046598
241576,5906295,-0.001356,-12.6670,4.923010,-0.249682,5.808390,20.456700,0.107250,0.431580,-0.193171,...,61,1945,2.342466,,0,0,0,,0.107250,0.107250
241577,5866742,-0.000623,-11.7895,4.665500,0.616765,-3.179100,-8.885110,0.345470,0.576776,0.351356,...,62,1946,13.816438,,0,0,0,,0.345470,0.345470


In [10]:
# Count of rows per value of the AD column in the merged table.
df['AD'].value_counts()

0    237998
1      3581
Name: AD, dtype: int64

In [11]:
# Write the merged table to a local CSV with a header row and without the index column.
df.to_csv(f'{ndd}_NO_APOE_interaction_analysis_april_30.csv', index=False)

In [12]:
! dx upload {ndd}_NO_APOE_interaction_analysis_april_30.csv  --path /data/interaction/{ndd}_NO_APOE_interaction_analysis_april_30.csv

ID                          file-GjvK22jJq9vV1JYYpf2vqyqx
Class                       file
Project                     project-GZBqBx8Jq9vpQ6729F24BjYX
Folder                      /data/interaction
Name                        AD_NO_APOE_interaction_analysis_april_30.csv
State                       [33mclosing[0m
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Mon May  6 13:54:19 2024
Created by                  klevine22
 via the job                job-GjvGkgQJq9vpjxfg7PQ7KQbv
Last modified               Mon May  6 13:54:21 2024
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"


# Prep AD with APOE

In [13]:
ndd = 'AD'

# Phenotype table: read the Cox input file, keep only the columns needed with
# right censoring, and shorten the QC0_* column names to F51 / G47.
df = (
    pd.read_csv(f'files_for_cox/{ndd}_with_tenure_lags_45.csv')
    .loc[:, ['ID', 'GENETIC_SEX', 'TOWNSEND', "AGE_OF_RECRUIT", 'BIRTH_YEAR',
             'tenure', f'{ndd}_DATE', ndd, 'QC0_F51_DATE', 'QC0_G47_DATE']]
    .rename(columns={'QC0_F51_DATE': 'F51', 'QC0_G47_DATE': 'G47'})
)

# One row per ID: after sorting on the date column, keep='first' retains the row
# that sorts first for each ID (rows with a missing date sort last).
df = df.sort_values(by=f'{ndd}_DATE').drop_duplicates(subset='ID', keep='first')

# AAO: days between the {ndd}_DATE value and 1 January of BIRTH_YEAR, divided by 365.
# Rows without a date end up with a missing AAO.
onset = pd.to_datetime(df[f'{ndd}_DATE'])
born = pd.to_datetime(df['BIRTH_YEAR'], format='%Y')
df['AAO'] = (onset - born).dt.days / 365

# PRS table from the score file without the NO_APOE tag: score, raw PCs and
# their Z-scored versions, one row per ID.
prs = (
    pd.read_csv(f'{ndd}_with_Z_score_for_graphs_april_30.csv')
    .loc[:, ['ID', 'SCORE', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'Z_score',
             'Z_age', 'Z_PC1', 'Z_PC2', 'Z_PC3', 'Z_PC4', 'Z_PC5']]
    .drop_duplicates(subset='ID', keep='first')
)

# Keep only IDs present in both tables; PRS columns come first in the result.
df = prs.merge(df, on='ID', how='inner')

# Report whether any row has a Z_score of exactly 0 (informational only; nothing is dropped).
test = df.loc[df['Z_score'] == 0]
if not test.empty:
    print("There is a PRS score with a value of 0")
else:
    print("No PRS scores with a value of 0")

# Interaction variables: Z_score scaled by (column value + 1) for F51 and G47.
for code in ('F51', 'G47'):
    df[f'interactor_{ndd}_{code}'] = df['Z_score'] * (df[code] + 1)

df

No PRS scores with a value of 0


Unnamed: 0,ID,SCORE,PC1,PC2,PC3,PC4,PC5,Z_score,Z_age,Z_PC1,...,AGE_OF_RECRUIT,BIRTH_YEAR,tenure,AD_DATE,AD,F51,G47,AAO,interactor_AD_F51,interactor_AD_G47
0,3559505,-0.001782,-12.3277,2.811810,-5.192830,6.021220,0.711214,-0.538542,1.302759,0.017380,...,67,1940,0.372603,2007-05-17,1,0,0,67.419178,-0.538542,-0.538542
1,5426874,0.013916,-15.3076,3.797150,-3.031950,3.952750,6.307610,1.951025,1.447955,-1.831776,...,68,1939,0.800000,2007-10-20,1,0,0,68.846575,1.951025,1.951025
2,1940961,-0.002174,-12.6428,2.248000,-4.424400,-2.467130,-2.220740,-0.600591,0.721973,-0.178153,...,63,1944,1.498630,2008-07-01,1,0,0,64.542466,-0.600591,-0.600591
3,2893229,0.000242,-14.0739,0.909499,-2.192330,2.253470,-8.586080,-0.217409,0.141187,-1.066212,...,59,1948,2.060274,2009-01-22,1,0,0,61.101370,-0.217409,-0.217409
4,5784514,0.012221,-12.6081,2.801040,0.067058,2.482140,1.149460,1.682297,-0.584796,-0.156621,...,54,1954,1.063014,2009-01-23,1,0,0,55.098630,1.682297,1.682297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241574,5988964,-0.000062,-13.4550,4.312710,-0.708079,-0.585742,-8.072440,-0.265658,1.302759,-0.682158,...,67,1941,12.419178,,0,0,0,,-0.265658,-0.265658
241575,3181859,0.001470,-13.2968,5.550250,-2.876440,2.725760,12.277300,-0.022737,0.867169,-0.583988,...,64,1943,7.550685,,0,0,0,,-0.022737,-0.022737
241576,5906295,0.010032,-12.6670,4.923010,-0.249682,5.808390,20.456700,1.335036,0.431580,-0.193171,...,61,1945,2.342466,,0,0,0,,1.335036,1.335036
241577,5866742,-0.000596,-11.7895,4.665500,0.616765,-3.179100,-8.885110,-0.350322,0.576776,0.351356,...,62,1946,13.816438,,0,0,0,,-0.350322,-0.350322


In [14]:
# Count of rows per value of the AD column in the merged table.
df['AD'].value_counts()

0    237998
1      3581
Name: AD, dtype: int64

In [None]:
# Write the merged table to a local CSV with a header row and without the index column.
df.to_csv(f'{ndd}_with_APOE_interaction_analysis_april_30.csv', index=False)

In [None]:
! dx upload {ndd}_with_APOE_interaction_analysis_april.csv  --path /data/interaction/{ndd}_with_APOE_interaction_analysis_OCT_23_2023.csv