In [None]:
# installs
!pip install --upgrade pip
!pip install pandas==1.5.3
!pip install statsmodels
!pip install lifelines==0.26.4

In [None]:
# Imports here.
import numpy as np
import pandas as pd
import os
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import fdrcorrection
from lifelines import CoxPHFitter

import warnings
# Suppress every warning category for the remainder of the session.
# NOTE(review): this is a blanket filter, so it presumably also hides any
# warnings raised while the models are being fitted — confirm that is intended.
warnings.filterwarnings("ignore")

In [None]:
# Smoke test: load a single outcome file to check that it reads correctly.
# Both `ndd` and `df` are reassigned by the main analysis loop further down.
ndd = 'PD'
# NOTE(review): parse_dates=True without an index column presumably parses
# nothing — confirm whether specific date columns were meant to be listed.
df = pd.read_csv(f'{ndd}_with_tenure_MAY_05_2025.csv', parse_dates = True)
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Outcome columns to analyse; the analysis loop reads one input CSV per entry.
ndd_list = ['AD', 'PD', 'DEM']

# Load the medication table and collect the distinct cleaned medication names.
meds2 = pd.read_csv('meds_with_cleaned_name_MAY_05_2025.csv')

# dropna(): a missing name is a float NaN and would break the string
#           concatenation that later builds the 'QC<lag>_<name>' column names.
# sorted(): iterating a plain set() of strings gives a different order on every
#           kernel start, which made the run log and result order irreproducible.
codes = sorted(meds2['cleaned_med'].dropna().unique())
print(len(codes))

In [None]:
# Echo the outcome list so the run log records which outcomes this run covers.
print(ndd_list)

In [None]:
# Analysis settings; they are copied verbatim into every result row.
timeline = 'ever taken'
model = 'COX'
lag = "0"   # suffix selecting the exposure columns, which are named QC{lag}_<medication>

# A medication/outcome pair is modelled only if at least this many exposed
# participants also have the outcome.
MIN_EXPOSED_EVENTS = 10


def _exposure_counts(frame, outcome_col, exposure_col):
    """Count exposure and exposed-with-outcome totals for one pair.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding both columns.
    outcome_col : str
        Name of the outcome (event) column.
    exposure_col : str
        Name of the medication exposure column.

    Returns
    -------
    tuple
        (sum of the exposure column,
         sum of the outcome column over the rows where exposure == 1).
    """
    n_exposed = frame[exposure_col].sum()
    n_exposed_events = frame.loc[frame[exposure_col] == 1, outcome_col].sum()
    return n_exposed, n_exposed_events


results = []

for ndd in ndd_list:

    # Load the table for this outcome.
    df = pd.read_csv(f'{ndd}_with_tenure_MAY_05_2025.csv', parse_dates = True)

    # Screen medications so a model is fitted only where there is usable data.
    # The counts are kept so the modelling pass does not have to recompute them.
    codes_with_data = []
    pair_counts = {}

    for code in codes:
        exposure_col = f'QC{lag}_' + code
        n, n_pairs = _exposure_counts(df, ndd, exposure_col)

        # Skip when nobody is exposed, when too few exposed participants have
        # the outcome, or when every exposed participant has the outcome.
        if n == 0 or n_pairs < MIN_EXPOSED_EVENTS or n == n_pairs:
            continue

        print(code)
        codes_with_data.append(code)
        pair_counts[code] = (n, n_pairs)

    print(ndd)
    print(len(codes_with_data))

    for code in codes_with_data:

        exposure_col = f'QC{lag}_' + code
        n, n_pairs = pair_counts[code]

        # Model frame: duration, event, the exposure, and age/sex as the other covariates.
        m = df[['age_at_tenure', 'SEX', 'tenure', ndd, exposure_col]]

        cph = CoxPHFitter()
        # NOTE(review): step_size=0.01 is presumably there to help convergence — confirm.
        cph.fit(m, duration_col = 'tenure', event_col = ndd, show_progress=False, step_size = 0.01)

        # Select the medication's row by covariate name. The previous code took
        # row 2 of the summary and element 2 of the private _compute_p_values()
        # array, which silently depended on the column order of `m`.
        exposure_row = cph.summary.loc[exposure_col]

        HR = exposure_row['exp(coef)']
        ci_min = exposure_row['exp(coef) lower 95%']
        ci_max = exposure_row['exp(coef) upper 95%']
        p = exposure_row['p']

        print(code, ndd, HR, ci_min, ci_max, p, n_pairs, n)
        results.append((code, ndd, model, timeline, lag, HR, ci_min, ci_max, p, n_pairs, n))

cox1 = pd.DataFrame(results, columns=('PRIOR','OUTCOME', 'MODEL','TIMELINE', 'LAG', 'HR', 'ci_min', "ci_max", 'P_VAL', "N_pairs", "N"))

In [None]:
# Gather the result tables, order them by raw p-value (fdrcorrection is told
# below that its input is already sorted), then discard rows with any missing value.
output = (
    pd.concat([cox1])
    .sort_values(by="P_VAL")
    .dropna()
)

# FDR correction across all remaining tests.
rejected, p_corr = fdrcorrection(output['P_VAL'], is_sorted=True)

# Attach the adjusted p-values and the reject/keep flag as new columns.
output = output.assign(P_CORR=p_corr, SIGNIFICANT=rejected)

output


In [None]:
import os
import subprocess
import numpy as np
import pandas as pd

In [None]:
# Date tag embedded in the result file names.
date = 'MAY_05_2025'

# Write the results table next to the notebook: column names kept, row index omitted.
output.to_csv(
    path_or_buf=f'AoU_{date}_results_lag_zero.csv',
    index=False,
    header=True,
)

In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = output   

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = f'AoU_{date}_results_lag_zero.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################
# Deliberate deviations from the stock snippet below:
#   * the gsutil result is stored in `gsutil_result`, not `output`, so the
#     results DataFrame named `output` is no longer overwritten and this cell
#     (and the export cell) can be re-run;
#   * a missing bucket variable and a failed copy now raise instead of passing silently.

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    # Without this guard the copy target would be the literal string "None/data/".
    raise EnvironmentError("WORKSPACE_BUCKET is not set; cannot copy results to the workspace bucket.")

# copy csv file to the bucket
args = ["gsutil", "cp", f"./{destination_filename}", f"{my_bucket}/data/"]
gsutil_result = subprocess.run(args, capture_output=True, text=True)

# print output from gsutil, then fail loudly if the copy did not succeed
print(gsutil_result.stderr)
gsutil_result.check_returncode()


In [None]:
# This snippet assumes that you run setup first

# This code lists objects in your Google Bucket

# Get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    # Without this guard the command would try to list a bucket literally named "None".
    raise EnvironmentError("WORKSPACE_BUCKET is not set; cannot list the workspace bucket.")

# List objects in the bucket.
# The command is passed as an argument list (no shell=True), so the bucket
# value is never interpreted by a shell.
print(subprocess.check_output(["gsutil", "ls", "-r", my_bucket]).decode('utf-8'))