# Data Prep

In [None]:
!pip install statsmodels
!pip install -U seaborn
!pip install statsmodels
!pip install lifelines
!pip install scikit-learn

In [None]:
# Imports here.
import numpy as np
import pandas as pd
import os
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import scipy
from scipy import stats
from statsmodels.stats.weightstats import ztest as ztest

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import fdrcorrection

In [None]:
#!dx download -r 'data/files_for_cox'

In [None]:
!dx download -r 'data/PRS'

# Prep PD

In [None]:
ndd = 'PD'

# Load the cohort table for this disease; files_for_cox/ must already be in the
# working directory.
df = pd.read_csv(f'files_for_cox/{ndd}_with_tenure_lags_45.csv')

# Keep only the columns used from here on and shorten the two QC0_*_DATE names.
# NOTE(review): F51 and G47 are used arithmetically below (value + 1), so they are
# presumably numeric indicators despite the *_DATE source names -- confirm.
df = df[[
    'ID', 'GENETIC_SEX', 'TOWNSEND', 'AGE_OF_RECRUIT', 'BIRTH_YEAR',
    'tenure', f'{ndd}_DATE', ndd, 'QC0_F51_DATE', 'QC0_G47_DATE',
]].rename(columns={'QC0_F51_DATE': 'F51', 'QC0_G47_DATE': 'G47'})

# Parse the {ndd}_DATE column once. Ordering on parsed dates is chronological for
# any date format, whereas ordering the raw strings is only chronological for
# ISO-formatted dates.
event_date = pd.to_datetime(df[f'{ndd}_DATE'])

# One row per ID: keep the row with the earliest date (missing dates sort last).
df = df.loc[event_date.sort_values().index].drop_duplicates(subset='ID', keep='first')

# AAO in years: {ndd}_DATE minus 1 January of BIRTH_YEAR.
# NOTE(review): the divisor is 365 rather than 365.25; left unchanged here.
birth_date = pd.to_datetime(df['BIRTH_YEAR'], format='%Y')
df['AAO'] = (event_date.loc[df.index] - birth_date).dt.days / 365

# Load the Z_{ndd}_PRS score and keep one row per ID.
prs = pd.read_csv(f'PRS/{ndd}_with_PRS_for_graphs.csv')
prs = prs[['ID', f'Z_{ndd}_PRS']].drop_duplicates(subset='ID', keep='first')

# Keep only IDs present in both tables. validate makes pandas raise if either
# side still has a repeated ID instead of silently multiplying rows.
df = prs.merge(df, on='ID', how='inner', validate='one_to_one')

# A score of exactly 0 would make both interaction terms 0 whatever F51/G47 are,
# so report whether any such rows exist.
test = df[df[f'Z_{ndd}_PRS'] == 0]
if test.empty:
    print("No PRS scores with a value of 0")
else:
    print("There is a PRS score with a value of 0")

# Interaction terms: the PRS multiplied by (flag + 1).
for flag in ('F51', 'G47'):
    df[f'interactor_{ndd}_{flag}'] = df[f'Z_{ndd}_PRS'] * (df[flag] + 1)

# Preview a few rows rather than rendering the whole participant-level table.
df.head()

In [None]:
# Tally how many rows of the merged table carry each value of the PD column.
df['PD'].value_counts()

In [None]:
# Write the prepared table for the current ndd to the working directory,
# with a header row and without the DataFrame index.
output_csv = f'{ndd}_interaction_analysis_OCT_23_2023.csv'
df.to_csv(output_csv, index=False, header=True)

In [None]:
! dx upload {ndd}_interaction_analysis_OCT_23_2023.csv  --path /data/interaction/{ndd}_interaction_analysis_OCT_23_2023.csv

# Prep AD without APOE

In [None]:
ndd = 'AD'

# Load the cohort table for this disease; files_for_cox/ must already be in the
# working directory.
df = pd.read_csv(f'files_for_cox/{ndd}_with_tenure_lags_45.csv')

# Keep only the columns used from here on and shorten the two QC0_*_DATE names.
# NOTE(review): F51 and G47 are used arithmetically below (value + 1), so they are
# presumably numeric indicators despite the *_DATE source names -- confirm.
df = df[[
    'ID', 'GENETIC_SEX', 'TOWNSEND', 'AGE_OF_RECRUIT', 'BIRTH_YEAR',
    'tenure', f'{ndd}_DATE', ndd, 'QC0_F51_DATE', 'QC0_G47_DATE',
]].rename(columns={'QC0_F51_DATE': 'F51', 'QC0_G47_DATE': 'G47'})

# Parse the {ndd}_DATE column once. Ordering on parsed dates is chronological for
# any date format, whereas ordering the raw strings is only chronological for
# ISO-formatted dates.
event_date = pd.to_datetime(df[f'{ndd}_DATE'])

# One row per ID: keep the row with the earliest date (missing dates sort last).
df = df.loc[event_date.sort_values().index].drop_duplicates(subset='ID', keep='first')

# AAO in years: {ndd}_DATE minus 1 January of BIRTH_YEAR.
# NOTE(review): the divisor is 365 rather than 365.25; left unchanged here.
birth_date = pd.to_datetime(df['BIRTH_YEAR'], format='%Y')
df['AAO'] = (event_date.loc[df.index] - birth_date).dt.days / 365

# Load the Z_{ndd}_PRS score from the NO_APOE PRS file and keep one row per ID.
prs = pd.read_csv(f'PRS/{ndd}_with_PRS_NO_APOE_for_graphs.csv')
prs = prs[['ID', f'Z_{ndd}_PRS']].drop_duplicates(subset='ID', keep='first')

# Keep only IDs present in both tables. validate makes pandas raise if either
# side still has a repeated ID instead of silently multiplying rows.
df = prs.merge(df, on='ID', how='inner', validate='one_to_one')

# A score of exactly 0 would make both interaction terms 0 whatever F51/G47 are,
# so report whether any such rows exist.
test = df[df[f'Z_{ndd}_PRS'] == 0]
if test.empty:
    print("No PRS scores with a value of 0")
else:
    print("There is a PRS score with a value of 0")

# Interaction terms: the PRS multiplied by (flag + 1).
for flag in ('F51', 'G47'):
    df[f'interactor_{ndd}_{flag}'] = df[f'Z_{ndd}_PRS'] * (df[flag] + 1)

# Preview a few rows rather than rendering the whole participant-level table.
df.head()

In [None]:
# Tally how many rows of the merged table carry each value of the AD column.
df['AD'].value_counts()

In [None]:
# Write the prepared NO_APOE table for the current ndd to the working directory,
# with a header row and without the DataFrame index.
output_csv = f'{ndd}_NO_APOE_interaction_analysis_OCT_23_2023.csv'
df.to_csv(output_csv, index=False, header=True)

In [None]:
! dx upload {ndd}_NO_APOE_interaction_analysis_OCT_23_2023.csv  --path /data/interaction/{ndd}_NO_APOE_interaction_analysis_OCT_23_2023.csv

# Prep AD with APOE

In [None]:
ndd = 'AD'

# Load the cohort table for this disease; files_for_cox/ must already be in the
# working directory.
df = pd.read_csv(f'files_for_cox/{ndd}_with_tenure_lags_45.csv')

# Keep only the columns used from here on and shorten the two QC0_*_DATE names.
# NOTE(review): F51 and G47 are used arithmetically below (value + 1), so they are
# presumably numeric indicators despite the *_DATE source names -- confirm.
df = df[[
    'ID', 'GENETIC_SEX', 'TOWNSEND', 'AGE_OF_RECRUIT', 'BIRTH_YEAR',
    'tenure', f'{ndd}_DATE', ndd, 'QC0_F51_DATE', 'QC0_G47_DATE',
]].rename(columns={'QC0_F51_DATE': 'F51', 'QC0_G47_DATE': 'G47'})

# Parse the {ndd}_DATE column once. Ordering on parsed dates is chronological for
# any date format, whereas ordering the raw strings is only chronological for
# ISO-formatted dates.
event_date = pd.to_datetime(df[f'{ndd}_DATE'])

# One row per ID: keep the row with the earliest date (missing dates sort last).
df = df.loc[event_date.sort_values().index].drop_duplicates(subset='ID', keep='first')

# AAO in years: {ndd}_DATE minus 1 January of BIRTH_YEAR.
# NOTE(review): the divisor is 365 rather than 365.25; left unchanged here.
birth_date = pd.to_datetime(df['BIRTH_YEAR'], format='%Y')
df['AAO'] = (event_date.loc[df.index] - birth_date).dt.days / 365

# Load the Z_{ndd}_PRS score and keep one row per ID.
prs = pd.read_csv(f'PRS/{ndd}_with_PRS_for_graphs.csv')
prs = prs[['ID', f'Z_{ndd}_PRS']].drop_duplicates(subset='ID', keep='first')

# Keep only IDs present in both tables. validate makes pandas raise if either
# side still has a repeated ID instead of silently multiplying rows.
df = prs.merge(df, on='ID', how='inner', validate='one_to_one')

# A score of exactly 0 would make both interaction terms 0 whatever F51/G47 are,
# so report whether any such rows exist.
test = df[df[f'Z_{ndd}_PRS'] == 0]
if test.empty:
    print("No PRS scores with a value of 0")
else:
    print("There is a PRS score with a value of 0")

# Interaction terms: the PRS multiplied by (flag + 1).
for flag in ('F51', 'G47'):
    df[f'interactor_{ndd}_{flag}'] = df[f'Z_{ndd}_PRS'] * (df[flag] + 1)

# Preview a few rows rather than rendering the whole participant-level table.
df.head()

In [None]:
# Tally how many rows of the merged table carry each value of the AD column.
df['AD'].value_counts()

In [None]:
# Write the prepared with_APOE table for the current ndd to the working directory,
# with a header row and without the DataFrame index.
output_csv = f'{ndd}_with_APOE_interaction_analysis_OCT_23_2023.csv'
df.to_csv(output_csv, index=False, header=True)

In [None]:
! dx upload {ndd}_with_APOE_interaction_analysis_OCT_23_2023.csv  --path /data/interaction/{ndd}_with_APOE_interaction_analysis_OCT_23_2023.csv