In [None]:
import pandas as pd
import numpy as np

In [None]:
# Select your NDD and the date
# `ndd` is the disease code used throughout the notebook to build column names
# (f'{ndd}_DATE', f'{ndd}') and the output file name.
# NOTE(review): the case files referenced below suggest 'AD', 'PD' and 'DEM' are the
# supported values -- confirm against step 01.
ndd = 'DEM'
# `date` is a free-form run tag that is only used in the output file name.
date = 'MAY_05_2025'

In [None]:
# Select the NDD case file created in step 01.
# The file is looked up from `ndd` so that the loaded cases always match the
# f'{ndd}_date' column that the following cells rename and select; loading a
# file for a different NDD would make those cells fail with a KeyError.
# (The file names embed the number of cases, so update them when step 01 is re-run.)
case_files = {
    'AD': 'AD_cases_n666.csv',
    'PD': 'PD_cases_n1713.csv',
    'DEM': 'DEM_cases_n2825.csv',
}
if ndd not in case_files:
    raise KeyError(f"No case file configured for ndd={ndd!r}; known values: {sorted(case_files)}")

cases = pd.read_csv(case_files[ndd])
cases

In [None]:
# Harmonise the case table: standard column names, only the columns used
# downstream, ordered by diagnosis date.
case_column_names = {'person_id': 'ID', f'{ndd}_date': f'{ndd}_DATE'}
case_columns = ['ID', 'date_of_birth', 'sex_at_birth', f'{ndd}_DATE']

cases = (
    cases
    .rename(columns=case_column_names)
    [case_columns]
    .sort_values(by=f'{ndd}_DATE')
)
cases

In [None]:
# Load the controls created in step 02 and give them the same columns as the
# cases; controls have no diagnosis, so their NDD date is missing (NaN).
control_columns = ['ID', 'date_of_birth', 'sex_at_birth', f'{ndd}_DATE']

controls = (
    pd.read_csv('controls_60_n135891.csv')
    .assign(**{f'{ndd}_DATE': np.nan})
    .rename(columns={'person_id': 'ID'})
    [control_columns]
)
controls

In [None]:
# Stack cases on top of controls
combined = pd.concat([cases, controls])

# Per-ID row counts before de-duplication (any count > 1 is a duplicate ID)
print(combined['ID'].value_counts())

# Sorting by the NDD date puts rows with a date first (NaN sorts last), so
# keeping the first row per ID prefers the case record over the control record.
df = (
    combined
    .sort_values(by=f'{ndd}_DATE')
    .drop_duplicates(subset='ID', keep='first')
)

# Per-ID row counts after de-duplication (every count should now be 1)
print(df['ID'].value_counts())

df

In [None]:
# Death dates exported in step 02, renamed to the column names used in this notebook
death_column_names = {'person_id': 'ID', 'death_date': 'DATE_OF_DEATH'}
d = pd.read_csv('death_date.csv').rename(columns=death_column_names)
d

In [None]:
# Attach the death date to every person; the left join keeps people with no
# death record (their DATE_OF_DEATH is NaN).
df = df.merge(d, on='ID', how='left')
df

In [None]:
# Primary consent dates exported in step 02; the consent date is used as the recruitment date
recruit_column_names = {'person_id': 'ID', 'primary_consent_date': 'recruit_date'}
r = pd.read_csv('primary_consent_date.csv').rename(columns=recruit_column_names)
r

In [None]:
# Attach the recruitment date to every person; the left join keeps people
# with no consent record (their recruit_date is NaN).
df = df.merge(r, on='ID', how='left')
df

In [None]:
# people with drug exposure data from file created in step 02
# Only the 'person_id' column of this table is used later (to filter the cohort).
de = pd.read_csv('people_with_drug_data.csv')
de

In [None]:
# IDs of everyone with drug data available, and how many there are
people_list = de['person_id'].tolist()
print(len(people_list))

In [None]:
# Look at df
# At this point df holds cases + controls with death and recruitment dates merged in,
# before restricting to people who have drug data.
df

In [None]:
# Restrict the cohort to people who have drug data available
has_drug_data = df['ID'].isin(people_list)
df = df[has_drug_data]
df

In [None]:
# Missing-value report: for each key column, count how many rows are NA (True) vs present (False)
na_report_columns = [
    ('Sex:', 'sex_at_birth'),
    ('Birth year:', 'date_of_birth'),
    ('recruit date:', 'recruit_date'),
]
for report_label, column_name in na_report_columns:
    print(report_label, df[column_name].isna().value_counts())

In [None]:
#Check number of cases and controls
# True  = no NDD date (controls were given a NaN date when they were loaded)
# False = an NDD date is present (cases)
df[f'{ndd}_DATE'].isna().value_counts()

# Add drug codes

In [None]:
# Load the medication table with cleaned drug names.
meds2 = pd.read_csv('meds_with_cleaned_name_MAY_05_2025.csv')

# Unique drug names in a deterministic (sorted) order.
# list(set(...)) yields a different order on every kernel start because Python
# randomises string hashing, which made the column order of the final table
# non-reproducible. Missing names are dropped because a NaN entry cannot be
# turned into a file or column name by the cells that loop over this list.
list_drugs = sorted(meds2['cleaned_med'].dropna().unique())
print(len(list_drugs))

# Number of medication records per cleaned drug name
meds2.cleaned_med.value_counts()

In [None]:
import os
import subprocess
import numpy as np
import pandas as pd

In [None]:
# Workspace bucket path taken from the environment; os.getenv returns None if
# WORKSPACE_BUCKET is not set, in which case the bucket paths built below are invalid.
my_bucket = os.getenv('WORKSPACE_BUCKET')

In [None]:
# example of drug csv
# Reads one per-drug file straight from the workspace bucket to inspect its contents.
# NOTE(review): the merge loop below expects each file to contain 'ID',
# '<drug>_DATE' and '<drug>_N' columns -- confirm this example matches.
code = pd.read_csv(f'{my_bucket}/data/drugs/sertraline_with_date.csv')
# Number of rows per raw drug name recorded in the file
code.drug_name.value_counts()

In [None]:
# For every drug, read its per-person file from the bucket and left-join the
# exposure date and the count column onto the cohort.
for drug in list_drugs:
    drug_frame = pd.read_csv(f'{my_bucket}/data/drugs/{drug}_with_date.csv')
    wanted_columns = ['ID', f'{drug}_DATE', f'{drug}_N']
    df = df.merge(drug_frame[wanted_columns], on='ID', how='left')

In [None]:
# Names of the per-drug date columns ('<drug>_DATE'), in the same order as list_drugs
codes = [f'{drug_name}_DATE' for drug_name in list_drugs]

print(len(codes))

# Prep for cox

In [None]:
#Set variables
# End-of-study date: used as the end of follow-up for people without an NDD
# date and without a death date when tenure is computed below.
STUDY_ENDS = '2024-01-01'

In [None]:
# Keep one row per person. Rows with an NDD date sort ahead of rows without
# one (NaN sorts last), so the retained row is the one carrying a diagnosis
# date when there is one.
df = (
    df
    .sort_values(by=f'{ndd}_DATE')
    .drop_duplicates(subset='ID', keep='first')
)
print(len(df))
df.head()

In [None]:
# Build each person's follow-up time ('tenure', in years since recruitment) and
# their age at the end of follow-up ('age_at_tenure'). Follow-up ends at:
#   * the NDD date              for people with an NDD date,
#   * the date of death         for people without an NDD date who died,
#   * STUDY_ENDS                for everyone else.
#
# Every subgroup is taken with .copy(): assigning new columns to a filtered
# slice of df is chained assignment (SettingWithCopyWarning) and is not
# guaranteed to behave the same across pandas versions.


def _years_between(end, start):
    """Return (end - start) in years, computed as whole days / 365.

    `start` is a Series of dates; `end` is a Series of dates or a single date.
    Values that cannot be parsed as dates become NaT, so the result is NaN for
    those rows. The same parsing rule is now applied to every subgroup.
    """
    end_dt = pd.to_datetime(end, errors='coerce')
    start_dt = pd.to_datetime(start, errors='coerce')
    return (end_dt - start_dt).dt.days / 365


ndd_date_col = ndd + '_DATE'

#Create tenure for people with NDD - select folks with an NDD date
has_NDD = df[df[ndd_date_col].notna()].copy()

#Tenure = time from recruitment to NDD diagnosis
has_NDD['tenure'] = _years_between(has_NDD[ndd_date_col], has_NDD['recruit_date'])

#Only keep people who got NDD after they joined study
#(rows with a missing/unparseable date have NaN tenure and are dropped here too)
has_NDD = has_NDD[has_NDD['tenure'] > 0].copy()

#Age at NDD diagnosis
has_NDD['age_at_tenure'] = _years_between(has_NDD[ndd_date_col], has_NDD['date_of_birth'])

#Create tenure for people without NDD - select folks with no NDD date
NDD_free = df[df[ndd_date_col].isna()]

#For people without NDD, break into dead and alive
alive = NDD_free[NDD_free['DATE_OF_DEATH'].isna()].copy()
dead = NDD_free[NDD_free['DATE_OF_DEATH'].notna()].copy()

#Tenure for people who are still alive: recruitment to the end of the study
alive['tenure'] = _years_between(STUDY_ENDS, alive['recruit_date'])

#Age at the end of the study for people who are still alive
alive['age_at_tenure'] = _years_between(STUDY_ENDS, alive['date_of_birth'])

#Tenure for people who died: recruitment to the date of death
dead['tenure'] = _years_between(dead['DATE_OF_DEATH'], dead['recruit_date'])

#Age at death
dead['age_at_tenure'] = _years_between(dead['DATE_OF_DEATH'], dead['date_of_birth'])

#Combine the three groups
df = pd.concat([has_NDD, alive, dead])

In [None]:
# look at df
# df now also carries the 'tenure' and 'age_at_tenure' columns computed above.
df

In [None]:
# Outcome indicator: 1 when the person has an NDD date, 0 otherwise
df[ndd] = np.where(df[f'{ndd}_DATE'].notna(), 1, 0)

# Sex at birth recoded as strings: 'Male' -> '1', 'Female' -> '2'.
# Any other value is left unset (NaN) in SEX.
for sex_label, sex_code in (('Female', '2'), ('Male', '1')):
    df.loc[df['sex_at_birth'] == sex_label, 'SEX'] = sex_code

In [None]:
# High / medium / low analysis.
# For every drug this adds:
#   Lag_<drug>  - years from recruitment to the drug date
#   low_<drug>  - 1 if the drug date precedes the end of follow-up and <drug>_N is at or below its 25th percentile
#   high_<drug> - 1 if the drug date precedes the end of follow-up and <drug>_N is at or above its 75th percentile
#   med_<drug>  - 1 if the drug date precedes the end of follow-up and <drug>_N is strictly between the two
# med_<drug> is only created when the quartiles are more than 1 apart; other
# drugs are recorded in omit_drug_list and their quartiles are printed.
new_drug_list = []
omit_drug_list = []

# The recruitment dates are the same for every drug, so parse them once
# instead of once per drug.
recruit_dt = pd.to_datetime(df['recruit_date'], errors='coerce')

# The loop variable is named `drug` (not `code`) so it does not overwrite the
# `code` DataFrame loaded earlier in the notebook.
for drug in list_drugs:
    n_col = drug + '_N'
    lag_col = 'Lag_' + drug

    df[lag_col] = (pd.to_datetime(df[drug + '_DATE'], errors='coerce') - recruit_dt).dt.days/365

    # Quartiles of the count column over everyone who has a value for this drug
    quantiles = df[n_col].quantile([0.25, 0.75])
    low = quantiles[0.25]
    high = quantiles[0.75]

    # True where the drug date falls before the end of follow-up
    # (False when either side is missing)
    before_end = df[lag_col] < df['tenure']

    df['low_' + drug] = np.where(before_end & (df[n_col] <= low), 1, 0)
    df['high_' + drug] = np.where(before_end & (df[n_col] >= high), 1, 0)

    # A NaN quartile (no values for this drug) also fails this test and lands in the omit list
    if high-low > 1:
        df['med_' + drug] = np.where(before_end & ((df[n_col] > low) & (df[n_col] < high)), 1, 0)
        new_drug_list.append(drug)
    else:
        omit_drug_list.append(drug)
        print(drug)
        print(low)
        print(high)

In [None]:
# Standard run and lagged exposure flags.
# For every drug this adds:
#   Lag_<drug>   - years from recruitment to the drug date
#   QC0_<drug>   - 1 if the drug date is before the end of follow-up (lag 0)
#   QC10+_<drug> - 1 if the drug date is more than 10 years before the end of follow-up

# The recruitment dates are the same for every drug, so parse them once
# instead of once per drug.
recruit_dt = pd.to_datetime(df['recruit_date'], errors='coerce')

# The loop variable is named `drug` (not `code`) so it does not overwrite the
# `code` DataFrame loaded earlier in the notebook.
for drug in list_drugs:
    lag_col = 'Lag_' + drug
    df[lag_col] = (pd.to_datetime(df[drug + '_DATE'], errors='coerce') - recruit_dt).dt.days/365

    # Exposure counted if it happened before the person's end of follow-up -- lag 0
    df['QC0_' + drug] = np.where((df[lag_col] < df['tenure']), 1, 0)

    # Exposure counted only if it happened more than 10 years before the end of follow-up
    df['QC10+_' + drug] = np.where((df['tenure'] - df[lag_col] > 10), 1, 0)

In [None]:
# Number of drugs omitted from / kept for the high-medium-low analysis, then the kept names
for drug_group in (omit_drug_list, new_drug_list):
    print(len(drug_group))
print(new_drug_list)

In [None]:
# Sanity check: row count, then the core (non-drug) columns of the final table
print(len(df))
preview_columns = [
    'ID',
    'SEX',
    'age_at_tenure',
    f'{ndd}_DATE',
    'date_of_birth',
    'DATE_OF_DEATH',
    'recruit_date',
    'tenure',
    ndd,
]
test = df[preview_columns]
test

In [None]:
# Write the final table (with a header row, without the index) for the Cox model step
output_csv_name = f'{ndd}_with_tenure_{date}.csv'
df.to_csv(output_csv_name, index=False)

In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = df

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = f'{ndd}_with_tenure_{date}.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name; fail early if it is missing, otherwise the copy target
# below would become the literal path "None/data/"
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    raise RuntimeError('WORKSPACE_BUCKET is not set; cannot copy the file to the workspace bucket')

# copy csv file to the bucket
args = ["gsutil", "cp", f"./{destination_filename}", f"{my_bucket}/data/"]
output = subprocess.run(args, capture_output=True)

# a non-zero exit status means the upload failed: stop instead of continuing silently
if output.returncode != 0:
    raise RuntimeError(
        f"gsutil cp failed with exit status {output.returncode}: "
        f"{output.stderr.decode('utf-8', errors='replace')}"
    )

# print output from gsutil
output.stderr

In [None]:
# This snippet assumes that you run setup first

# This code lists objects in your Google Bucket

# Get the bucket name; fail early with a clear message if it is missing
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    raise RuntimeError('WORKSPACE_BUCKET is not set; cannot list the workspace bucket')

# List objects in the bucket.
# The command is passed as an argument list (no shell=True) so the bucket name
# is never interpreted by a shell; check_output raises CalledProcessError if
# gsutil exits with a non-zero status.
print(subprocess.check_output(["gsutil", "ls", "-r", my_bucket]).decode('utf-8'))