# Prepare the dataframe for the Cox model: add ICD10 codes and APOE status

In [None]:
import pandas as pd
import numpy as np

In [None]:
# ICD10 codes processed throughout this notebook, one row group per leading letter
icd10_codes = [
    'A04',
    'B02', 'B37',
    'E10', 'E11', 'E27', 'E78', 'E87',
    'F20', 'F31', 'F32', 'F33', 'F40', 'F41', 'F42',
    'F43', 'F44', 'F45', 'F48', 'F50', 'F51',
    'G40', 'G43', 'G47',
    'H66',
    'I10', 'I11', 'I12', 'I15', 'I20', 'I21', 'I25', 'I47', 'I48', 'I49',
    'I50', 'I60', 'I61', 'I62', 'I63', 'I65', 'I66', 'I67', 'I69', 'I82',
    'K04', 'K05', 'K20', 'K21', 'K22', 'K25', 'K31', 'K51',
    'K59', 'K70', 'K71', 'K72', 'K73', 'K74', 'K75', 'K76',
    'L25', 'L40', 'L50',
    'M06', 'M13', 'M15', 'M16', 'M17', 'M18', 'M19',
    'M32', 'M45', 'M79', 'M80', 'M81', 'M88',
    'N04', 'N10', 'N18', 'N19', 'N30', 'N31', 'N32', 'N39', 'N40', 'N94',
]

# Report how many codes are in the list
print(len(icd10_codes))

In [None]:
# For every code: load its raw file, rename the date column to the code itself,
# and write out a two-column file holding one row per ID.
for code in icd10_codes:
    df = (
        pd.read_csv(f'{code}_with_date.csv')
        .rename(columns={'condition_start_datetime': code, 'person_id': 'ID'})
        # ascending sort on the date column puts each ID's smallest value first,
        # so keep='first' retains that row and drops the later duplicates
        .sort_values(by=code)
        .drop_duplicates(subset='ID', keep='first')
        .loc[:, ['ID', code]]
    )
    df.to_csv(f'{code}_with_date_dup_dropped.csv', header=True, index=False)

In [None]:
# Spot-check one de-duplicated file: `code` still holds the last value
# assigned by the preceding loop over icd10_codes
test = pd.read_csv(f'{code}_with_date_dup_dropped.csv').sort_values(by=code)
test

In [None]:
# Confirm de-duplication worked: every ID should show a count of 1
test['ID'].value_counts()

# Add covariates to original dataframe

In [None]:
# Pick the NDD cohort to process; the notebook was run once for each of AD, PD, and DEM
ndd = 'PD'

# Read that cohort's tenure file and display it
df = pd.read_csv(
    f'{ndd}_with_tenure_MAY_05_2025.csv',
    low_memory=False,
)
df

In [None]:
# Re-print the number of ICD10 codes before merging them onto df
print(len(icd10_codes))

In [None]:
# Left-join each code's de-duplicated date column onto df, keyed on ID;
# the join adds one column per code, named after the code.
for code in icd10_codes:
    c = pd.read_csv(f'{code}_with_date_dup_dropped.csv').loc[:, ['ID', code]]
    df = pd.merge(df, c, on='ID', how='left')
    # print the row count after every merge
    print(len(df))

In [None]:
# Per code, print how many rows have no date (True) versus a date (False)
for condition in icd10_codes:
    print(condition, df[condition].isna().value_counts(), sep='\n')

In [None]:
# An ICD10 code is only flagged as a covariate when it was recorded before the
# participant's 'tenure' value is reached (lag 0, i.e. no extra buffer).
# NOTE(review): the lag is in 365-day years; presumably 'tenure' uses the same unit -- confirm.

# Parse the recruitment dates once: they are identical for every code,
# so there is no need to re-parse the column on each iteration.
recruit_dates = pd.to_datetime(df['recruit_date'])

for code in icd10_codes:
    # Time from recruitment to the code's recorded date; dates that are missing
    # or cannot be parsed are coerced to NaT and therefore give a NaN lag
    lag_years = (pd.to_datetime(df[code], errors = 'coerce') - recruit_dates).dt.days/365
    df['Lag_' + code] = lag_years

    # Select data if it happened before the tenure cutoff -- lag 0.
    # A NaN lag compares False, so rows without the code get 0.
    df['QC0_' + code] = np.where(lag_years < df['tenure'], 1, 0)

# Add APOE status

In [None]:
# Read the APOE table produced by 06_AoU_pull_APOE_status_V8.ipynb,
# keeping just the IID and APOE_GENOTYPE columns, then display it
apoe = pd.read_csv('apoe_for_meds_FEB_2025.csv').loc[:, ['IID', 'APOE_GENOTYPE']]
apoe

In [None]:
# Left-join the APOE genotypes onto df, matching df's ID against apoe's IID
df = pd.merge(df, apoe, how='left', left_on='ID', right_on='IID')
df

In [None]:
# Genotype counts after the merge, with missing values shown as their own row
df['APOE_GENOTYPE'].value_counts(dropna=False)

In [None]:
# Keep only rows whose APOE genotype is present and is not the literal 'unknown'
df = df[df['APOE_GENOTYPE'].notna() & (df['APOE_GENOTYPE'] != 'unknown')]
df

In [None]:
# Genotype counts after filtering; no missing or 'unknown' rows should remain
df['APOE_GENOTYPE'].value_counts(dropna=False)

In [None]:
# Replace 'APOE_GENOTYPE' with one indicator column per genotype; the empty
# prefix and separator make each new column name the bare genotype value
df_encoded = pd.get_dummies(
    data=df,
    columns=['APOE_GENOTYPE'],
    prefix='',
    prefix_sep='',
)

# Show the resulting column names
df_encoded.columns

In [None]:
# Display the one-hot-encoded dataframe
df_encoded

In [None]:
# Write the encoded dataframe to a CSV in the notebook's working directory
df_encoded.to_csv(
    f'AoU_{ndd}_with_icd10_with_APOE_MAY_12_2025.csv',
    header=True,
    index=False,
)

In [None]:
import os
import subprocess
import numpy as np
import pandas as pd

In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df_encoded with THE NAME OF YOUR DATAFRAME
my_dataframe = df_encoded

# Replace the string below with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
# Earlier output name, kept for reference:
#destination_filename = f'AoU_{ndd}_with_icd10_MAY_05_2025.csv'
destination_filename = f'AoU_{ndd}_with_icd10_with_APOE_MAY_12_2025.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# os.getenv returns None when the variable is unset, which would otherwise
# build the bogus destination "None/data/" -- fail with a clear message instead
if not my_bucket:
    raise RuntimeError('WORKSPACE_BUCKET is not set; run the workspace setup before uploading')

# copy csv file to the bucket
args = ["gsutil", "cp", f"./{destination_filename}", f"{my_bucket}/data/"]
output = subprocess.run(args, capture_output=True)

# stop here if the copy failed, so a failed upload cannot go unnoticed
if output.returncode != 0:
    raise RuntimeError(
        f'gsutil cp failed (exit code {output.returncode}): '
        f'{output.stderr.decode(errors="replace")}'
    )

# print output from gsutil
output.stderr


In [None]:
# # This snippet assumes you run setup first

# # This code copies file in your Google Bucket and loads it into a dataframe

# # Replace 'test.csv' with THE NAME of the file you're going to download from the bucket (don't delete the quotation marks)
# name_of_file_in_bucket = 'apoe_for_meds_FEB_2025.csv'

# ########################################################################
# ##
# ################# DON'T CHANGE FROM HERE ###############################
# ##
# ########################################################################

# # get the bucket name
# my_bucket = os.getenv('WORKSPACE_BUCKET')

# # copy csv file from the bucket to the current working space
# os.system(f"gsutil cp '{my_bucket}/data/{name_of_file_in_bucket}' .")

# print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')
# # save dataframe in a csv file in the same workspace as the notebook
# my_dataframe = pd.read_csv(name_of_file_in_bucket)
# my_dataframe.head()
