### Importing packages

In [None]:
%%capture
%pip install lifelines
%pip install statsmodels
%pip install rpy2


In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import re
import math
from lifelines import CoxPHFitter
from statsmodels.stats.multitest import fdrcorrection
import statsmodels.api as sm
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
# Switch on the pandas <-> R conversion for this session.
# NOTE(review): newer rpy2 releases reportedly deprecate activate() in favour of a
# local converter context — confirm against the installed rpy2 version.
pandas2ri.activate()

# Install the R package "cmprsk" from CRAN; the Fine-Gray cells load it with library(cmprsk).
# This call is unconditional, so it runs every time the cell is executed.
ro.r('install.packages("cmprsk", repos="http://cran.r-project.org")')

# You may need this once per session:
%load_ext rpy2.ipython

### Cohort creation

In [None]:
# Root of the mounted project storage and the sub-folder that holds the input tables.
PROJ_DIR = Path('/mnt/project/')
DATA_DIR = PROJ_DIR.joinpath('old_data')

In [None]:
# ICD-10 lookup table used to translate field ids into codes / descriptions
icd10_conversion = pd.read_csv(DATA_DIR / 'icd10_conversion_2.csv')


# Covariates (date of birth, sex, Townsend index); the leading CSV column is discarded
covariates = pd.read_csv(DATA_DIR / 'f_covariate.csv')
covariates = covariates.drop(columns=covariates.columns[0])


# Principal components 1-5
PC = pd.read_csv(DATA_DIR / 'PCs.csv')


# Healthy participants: only their eid is needed
H_controls = pd.read_csv(DATA_DIR / 'healthy_control_11_19_23.csv').loc[:, ["eid"]].copy()


# Assessment-centre attendance table (year of birth, age at recruitment)
attending_info = pd.read_csv(DATA_DIR / 'attennding_info.csv')
# Recruitment year = year of birth + age at recruitment
attending_info['recruit_year'] = attending_info['Year_of_birth'].add(attending_info['Age_at_recruitment'])


# Dates of death
Death = pd.read_csv(DATA_DIR / 'Death_info.csv')


# AD polygenic risk scores, with and without the APOE allele; "#FID" holds the participant id
AD_PRS = pd.read_csv(DATA_DIR / 'AD_PRS_final.sscore', sep='\t').rename(columns={"#FID": "eid"})
AD_PRS_without_APOE = pd.read_csv(
    DATA_DIR / 'AD_PRS_final_without_apoe.sscore', sep='\t'
).rename(columns={"#FID": "eid"})


# APOE e4 classification per participant; FID becomes eid and IID is dropped
APOE = pd.read_csv(DATA_DIR / 'apoe_e4_designation.csv')
APOE = APOE.rename(columns={"FID": "eid"}).drop(columns=['IID'])


# ICD-10 first-occurrence dates (158 digestive, endocrine and metabolic codes);
# the leading CSV column is discarded
df_icd10 = pd.read_csv(DATA_DIR / 'f_icd10.csv')
df_icd10 = df_icd10.drop(columns=df_icd10.columns[0])

# F00 dates (p130836) are used to fill AD dates (p131036) that are missing
df_f00 = pd.read_csv(DATA_DIR / 'F00.csv')
df_icd10 = df_icd10.merge(df_f00[["eid", "p130836"]], on="eid")
# Existing p131036 values are kept; only missing ones take the p130836 value
df_icd10['p131036'] = df_icd10['p131036'].fillna(df_icd10['p130836'])
df_icd10 = df_icd10.drop(columns='p130836')


# Ancestry: p22006 == 1 marks Caucasian participants; keep their ids
european = pd.read_csv(DATA_DIR / 'Genetic_relatedness_pairing.csv')
eid_list = european.loc[european['p22006'] == 1, 'eid'].tolist()


# Relatedness: the file lists related pairs only. Pairs with kinship above 0.0884
# are flagged and the ID1 member of each flagged pair is excluded.
# NOTE(review): the earlier comment called this a 3rd-degree cut-off — confirm the
# threshold against the kinship bands of the tool that produced ukb_rel.dat.
relatedness = pd.read_csv(PROJ_DIR / 'Bulk/Genotype Results/Genotype calls/ukb_rel.dat', sep=' ')
rel_remove = relatedness.loc[relatedness['Kinship'] > 0.0884]
rel_remove.info()
eids_to_remove = rel_remove["ID1"].tolist()


# Keep Europeans that are not on the relatedness exclusion list
df_icd10 = df_icd10.loc[
    df_icd10['eid'].isin(eid_list) & ~df_icd10['eid'].isin(eids_to_remove)
]


# Give the covariate field ids readable names
covariate_conversio = dict(
    p21022="Age_at_recruitment",
    p34="Year_of_birth",
    p52='Month_of_birth',
    p31="sex",
    p22189='Townsend_deprivation_index',
)
covariates = covariates.rename(covariate_conversio, axis='columns')

***
The recruitment year will be considered as the start date of the study for each participant.
The end of the study for a participant will be determined by either the onset of AD (Alzheimer's Disease), death, or the date '2023-01-01'
whichever comes first.
***

In [None]:
# Field ids: p131036 = Alzheimer's disease (AD), p131022 = Parkinson's disease (PD)

# Controls: keep unrelated Europeans only, then attach their ICD-10 date columns
H_controls = H_controls[H_controls['eid'].isin(eid_list)]
H_controls = H_controls[~H_controls['eid'].isin(eids_to_remove)]
H_controls = H_controls.merge(df_icd10, on='eid')

# AD cases: participants with a recorded AD date (a missing date means no AD record)
AD_merged_data = df_icd10.copy()
AD_merged_data = AD_merged_data[~AD_merged_data['p131036'].isna()]

# Stack cases and controls, then add the recruitment year and the date of death
attending_info = attending_info[['eid','recruit_year']]
df = AD_merged_data[['eid','p131036']]
H_controls = H_controls[['eid','p131036']]
df = pd.concat([df, H_controls], ignore_index=True)
df = pd.merge(df, attending_info, on='eid')
Death = Death[['eid','p40000_i0_Date_of_death']]
# Right join: participants without a death record stay in the table with a missing date
df = pd.merge(Death, df, on='eid', how='right')

# Convert date columns to datetime
df['p40000_i0_Date_of_death'] = pd.to_datetime(df['p40000_i0_Date_of_death'])
df['p131036'] = pd.to_datetime(df['p131036'])

# Set study end date to Jan 1, 2023
study_end = pd.Timestamp('2023-01-01')

# Stop date: earliest of AD and death; study end when neither is recorded
df['stop'] = df[['p131036', 'p40000_i0_Date_of_death']].min(axis=1)
df['stop'] = df['stop'].fillna(study_end)

# Keep the full stop date before it is reduced to a year
df['complete_stop_date'] = df['stop']

# The recruitment year is the start of follow-up
df.rename(columns={'recruit_year': 'start'}, inplace=True)

# From here on follow-up is measured in calendar years
df['stop'] = df['stop'].dt.year
df['p131036_year'] = df['p131036'].dt.year

# Drop invalid entries: stop must be strictly after start
df = df[df['stop'] > df['start']]

# Drop rows where AD happened before recruitment
df = df[(df['start'] <= df['p131036_year']) | df['p131036_year'].isna()]

# Drop rows whose AD year lies after the stop year (a death recorded in an earlier
# year than the AD date). NOTE: this filter used to be evaluated without assigning
# the result, so it had no effect. The copy makes the column assignments below act
# on an independent frame instead of a filtered view.
df = df[(df['p131036_year'].isna()) | (df['p131036_year'] <= df['stop'])].copy()

# Duration in years: up to the AD year for cases, up to the stop year otherwise
df['duration'] = np.where(
    df['p131036_year'].isnull(),
    df['stop'] - df['start'],
    df['p131036_year'] - df['start']
)

# Define event_type for Fine and Gray
# 0 = censored, 1 = AD, 2 = death before AD
df['event_type'] = 0
df.loc[df['p131036'].notna(), 'event_type'] = 1
df.loc[df['p131036'].isna() & df['p40000_i0_Date_of_death'].notna(), 'event_type'] = 2

# Add the PRS values. Both tables carry a SCORE1_AVG column, so the two merges
# leave SCORE1_AVG_x (PRS with APOE) and SCORE1_AVG_y (PRS without APOE).
df = pd.merge(df, AD_PRS[["eid","SCORE1_AVG"]], on = "eid")

df = pd.merge(df, AD_PRS_without_APOE[["eid","SCORE1_AVG"]], on = "eid")

# z-score of the PRS with APOE, standardised on the censored group (event_type == 0)
mean_controls = df["SCORE1_AVG_x"][df["event_type"] == 0].mean()
sd_controls = df["SCORE1_AVG_x"][df["event_type"] == 0].std()
df["zSCORE"] = (df["SCORE1_AVG_x"] - mean_controls) / sd_controls

# Same standardisation for the PRS without APOE (mean/sd variables are reused)
mean_controls = df["SCORE1_AVG_y"][df["event_type"] == 0].mean()
sd_controls = df["SCORE1_AVG_y"][df["event_type"] == 0].std()
df["zSCORE_without_apoe"] = (df["SCORE1_AVG_y"] - mean_controls) / sd_controls

# Exposure list: every ICD-10 column except the two outcome fields (PD and AD)
list_of_icd10 = df_icd10.columns.tolist()[1:]
list_of_icd10.remove("p131022")
list_of_icd10.remove("p131036")

# Dates before this cut-off are excluded by the Fine-Gray loop
cutoff_date = pd.to_datetime("1999-01-01")

# The modelling cell expects df, df_icd10, PC, APOE and covariates to be in memory.
# If they were saved earlier they can be reloaded instead, for example:
# df = pd.read_csv("df.csv")
# df_icd10 = pd.read_csv("df_icd10.csv")
# PC = pd.read_csv("PC.csv")
# APOE = pd.read_csv("APOE.csv")
# covariates = pd.read_csv("covariates.csv")

# list_of_icd10 = ['p130008', 'p130010', ...]

In [None]:
# Fine-Gray competing-risks models, one per ICD-10 exposure. AD is the event of
# interest (event_type 1), death without AD the competing event (2), 0 = censored.
# Each model is adjusted for age, sex, Townsend index and PCs 1-5; the statistics of
# the exposure term are stored in dic_fg_results under the exposure's field id.
dic_fg_results = {}
cutoff_date = pd.Timestamp("1999-01-01")

# R code run for every exposure; it reads r_df and covariate_cols from the R global
# environment and returns a named list of per-coefficient statistics.
fine_gray_r_code = '''
    library(cmprsk)
    f <- as.formula(paste("~", paste(covariate_cols, collapse = " + ")))
    X <- model.matrix(f, data = r_df)[, -1, drop = FALSE]

    # Keep a full-rank subset of the columns (QR computed once, column order kept)
    qr_X <- qr(X)
    X <- X[, sort(qr_X$pivot[seq_len(qr_X$rank)]), drop = FALSE]

    # The caller reads the first coefficient as the exposure effect, so the
    # exposure must still be the first column after the rank filter
    if (ncol(X) == 0 || colnames(X)[1] != covariate_cols[1]) {
        stop("exposure column was dropped as collinear")
    }

    fg_model <- crr(
        ftime = r_df$duration,
        fstatus = r_df$event_type,
        cov1 = X,
        failcode = 1,
        cencode = 0
    )
    se <- sqrt(diag(fg_model$var))
    z_scores <- fg_model$coef / se

    # Use the lower tail directly: 1 - pnorm(|z|) rounds to exactly 0 for large |z|,
    # which would give p = 0 and log10p = Inf
    p_vals <- 2 * pnorm(-abs(z_scores))
    log10p <- -(log(2) + pnorm(-abs(z_scores), log.p = TRUE)) / log(10)

    list(
        coef = fg_model$coef,
        se = se,
        z = z_scores,
        p = p_vals,
        HR = exp(fg_model$coef),
        CI_lower = exp(fg_model$coef - 1.96 * se),
        CI_upper = exp(fg_model$coef + 1.96 * se),
        log10p = log10p
    )
'''


for var in list_of_icd10:

    print(f"Running Fine-Gray for {var}")

    try:
        # Merge all required data (inner joins: only participants present in every table remain)
        t = pd.merge(df[["eid", "duration", "event_type", "p131036", "start"]],
                     df_icd10[["eid", var]], on="eid")
        t = pd.merge(t, covariates[["eid", "Age_at_recruitment", "Townsend_deprivation_index", "sex"]], on="eid")
        t = pd.merge(t, PC, on="eid")
        t = pd.merge(t, APOE[["eid", "e4_copies"]], on="eid")

        # Convert dates and drop participants whose exposure or AD date precedes the cut-off
        t["p131036"] = pd.to_datetime(t["p131036"])
        t[var] = pd.to_datetime(t[var])
        t = t[~(t[var] < cutoff_date)]
        # Copy so the assignments below modify an independent frame, not a filtered view
        t = t[~(t["p131036"] < cutoff_date)].copy()

        # An exposure dated on or after the AD date does not count as an exposure
        t.loc[t[var] >= t["p131036"], var] = pd.NaT

        # Binary exposure indicator: 1 if a (remaining) date exists
        t[var] = (~t[var].isna()).astype(int)

        # Skip exposures with fewer than 3 exposed participants or at most 1 exposed AD event
        if t[var].sum() < 3 or t[t[var] == 1]["event_type"].eq(1).sum() <= 1:
            print(f"Skipping {var} (too few cases)")
            continue

        # Model columns; the exposure comes first among the covariates (cols[2:])
        cols = ["duration", "event_type", var, "Age_at_recruitment", "sex", "Townsend_deprivation_index",
                "p22009_a1", "p22009_a2", "p22009_a3", "p22009_a4", "p22009_a5"]
        t_r = t[cols].dropna().copy()

        # Push to R
        ro.globalenv["r_df"] = pandas2ri.py2rpy(t_r)
        ro.globalenv["covariate_cols"] = ro.StrVector(cols[2:])

        # Run Fine-Gray in R
        res = ro.r(fine_gray_r_code)

        # Save the statistics of the first coefficient (the exposure)
        dic_fg_results[var] = {
            "coef": np.array(res.rx2('coef'))[0],
            "HR": np.array(res.rx2('HR'))[0],
            "se": np.array(res.rx2('se'))[0],
            "z": np.array(res.rx2('z'))[0],
            # stored as a scientific-notation string to keep all digits
            "p": "{:.16e}".format(res.rx2('p')[0]),
            "log10_p": float(res.rx2('log10p')[0]),
            "CI_lower": np.array(res.rx2('CI_lower'))[0],
            "CI_upper": np.array(res.rx2('CI_upper'))[0],
            "N": t_r.shape[0],
            "N_pairs": int(((t_r[var] == 1) & (t_r["event_type"] == 1)).sum())
        }

    except Exception as e:
        # Best effort: a failing exposure is reported and the loop moves on
        print(f"Error in {var}: {e}")


In [None]:
# Build the summary table for the AD Fine-Gray models, add FDR-adjusted p-values
# and write the table to fg_results.csv.
# Rows are collected per field id: keying them by description would silently
# overwrite an earlier row whenever two field ids share the same description.
pd_table_fg = {}

for var, res in dic_fg_results.items():
    field_id = var[1:]  # column label in the conversion table = field id without the leading "p"
    if field_id not in icd10_conversion:
        continue  # Skip if not in mapping

    name = icd10_conversion[field_id][1]
    code = icd10_conversion[field_id][0]

    pd_table_fg[var] = [
        var,
        code,
        "AD",
        name,
        # np.ravel(...)[0] accepts a scalar as well as a list/array and takes its first value
        np.ravel(res['HR'])[0],
        np.ravel(res['CI_lower'])[0],
        np.ravel(res['CI_upper'])[0],
        res['p'],  # scientific notation string
        res['log10_p'],
        res['N_pairs'],
        res['N']
    ]

output_fg = pd.DataFrame.from_dict(
    pd_table_fg,
    orient='index',
    columns=('code', 'ICD10_CODE', 'NDD', 'Description', 'HR', 'ci_min', 'ci_max', 'P_VAL','log10_p', 'N_pairs', 'n')
)
# Keep the description as the row label of the displayed table
output_fg.index = output_fg['Description'].tolist()

# Convert p-values to float
p_values = output_fg['P_VAL'].astype(float).to_numpy()

# Apply FDR correction to the finite p-values only: a NaN cannot be ranked and
# would otherwise spread to the adjusted values of the other tests. Rows without
# a usable p-value get NaN; an empty table skips the correction.
rejected = np.zeros(p_values.shape, dtype=bool)
pvals_corrected = np.full(p_values.shape, np.nan)
finite = np.isfinite(p_values)
if finite.any():
    rejected[finite], pvals_corrected[finite] = fdrcorrection(p_values[finite])

# Add corrected p-values to the dataframe
output_fg['P_VAL_FDR_CORRECTED'] = pvals_corrected

output_fg.to_csv("fg_results.csv", index=False)

In [None]:
# Display the AD summary table as the cell output.
output_fg

## Importing data (Parkinson's disease analysis)

In [None]:
# ICD-10 lookup table used to translate field ids into codes / descriptions
icd10_conversion = pd.read_csv("/mnt/project/aug/icd10_conversion.csv")

# Covariates (date of birth, sex, Townsend index); the leading CSV column is discarded
covariates = pd.read_csv("/mnt/project/aug/f_covariate.csv")
covariates = covariates.drop(columns=covariates.columns[0])

# Principal components 1-5
PC = pd.read_csv('/mnt/project/PCs.csv')

# Healthy participants: only their eid is needed
H_controls = pd.read_csv('/mnt/project/healthy_control_11_19_23.csv').loc[:, ["eid"]].copy()

# Assessment-centre attendance table (year of birth, age at recruitment)
attending_info = pd.read_csv("/mnt/project/attennding_info.csv")

# Dates of death
Death = pd.read_csv("/mnt/project/Death_info.csv")

# PD polygenic risk scores
PD_PRS = pd.read_csv("/mnt/project/PD_PRS/PD_PRS_final.sscore", sep="\t")

# LRRK2 carrier table: FID becomes eid and the two variant columns get identifier-style names
lrkk2 = pd.read_csv("/mnt/project/lrkk2_carriers.csv").rename(
    columns={'FID':'eid','12:40220632_C': 'C_12_40220632', '12:40340400_G':'G_12_40340400'}
)

# GBA1 carrier table: same treatment as above
GBA1 = pd.read_csv("/mnt/project/GBA1_carriers.csv").rename(
    columns={'FID':'eid','1:155162560_G': 'G_1_155162560', '1:155235843_T': 'T_1_155235843'}
)

# ICD-10 first-occurrence dates (158 digestive, endocrine and metabolic codes);
# the leading CSV column is discarded
df_icd10 = pd.read_csv("/mnt/project/aug/f_icd10.csv")
df_icd10 = df_icd10.drop(columns=df_icd10.columns[0])

In [None]:
# Load the second ICD-10 conversion table from old_data and preview its first rows.
# NOTE(review): none of the visible cells reads icd10_conversion2 afterwards —
# confirm whether it is still needed or was meant to replace icd10_conversion.
icd10_conversion2 = pd.read_csv("/mnt/project/old_data/icd10_conversion_2.csv")

icd10_conversion2.head()

In [None]:
# Display the ICD-10 table as loaded, before the ancestry/relatedness filters of the next cell.
df_icd10

In [None]:
# Ancestry: p22006 == 1 marks Caucasian participants; collect their ids
european = pd.read_csv('/mnt/project/Genetic relatedness pairing.csv')
eid_list = european.loc[european['p22006'] == 1, 'eid'].tolist()

# Relatedness: the file lists related pairs only. Pairs with kinship above 0.0884
# are flagged and the ID1 member of each flagged pair goes on the exclusion list.
relatedness = pd.read_csv('/mnt/project/Bulk/Genotype Results/Genotype calls/ukb_rel.dat', sep=' ')
rel_remove = relatedness.loc[relatedness['Kinship'] > 0.0884]
rel_remove.info()
eids_to_remove = rel_remove["ID1"].tolist()

# Keep Europeans that are not on the exclusion list
df_icd10 = df_icd10.loc[
    df_icd10['eid'].isin(eid_list) & ~df_icd10['eid'].isin(eids_to_remove)
]

# "#FID" holds the participant id in the PRS table
PD_PRS = PD_PRS.rename(columns={"#FID": "eid"})

# Recruitment year = year of birth + age at recruitment
attending_info['recruit_year'] = attending_info['Year_of_birth'].add(attending_info['Age_at_recruitment'])

***
The recruitment year will be considered as the start date of the study for each participant.
The end of the study for a participant will be determined by either the onset of PD (Parkinson's Disease), death, or the date '2023-01-01',
whichever comes first.
***

In [None]:
# Map the covariate field ids to readable column names
covariate_conversio = dict(
    p21022="Age_at_recruitment",
    p34="Year_of_birth",
    p52='Month_of_birth',
    p31="sex",
    p22189='Townsend_deprivation_index',
)
covariates = covariates.rename(covariate_conversio, axis='columns')

In [None]:
# Field ids: 131036 = Alzheimer's disease, 131022 = Parkinson's disease

# Controls: keep Europeans that are not on the relatedness exclusion list,
# then attach their ICD-10 columns (inner join on eid)
H_controls = H_controls.loc[
    H_controls['eid'].isin(eid_list) & ~H_controls['eid'].isin(eids_to_remove)
]
H_controls = pd.merge(H_controls, df_icd10, on='eid')

In [None]:
# PD cases: rows of the ICD-10 table that carry a Parkinson's disease date
# (p131022 not missing), taken as an independent copy
PD_merged_data = df_icd10.loc[df_icd10['p131022'].notna()].copy()


In [None]:

# Assemble one table with, per participant, the PD date (p131022),
# the recruitment year and the date of death.
attending_info = attending_info.loc[:, ['eid', 'recruit_year']]
H_controls = H_controls.loc[:, ['eid', 'p131022']]
# Cases first, controls underneath, with a fresh row index
df = pd.concat([PD_merged_data.loc[:, ['eid', 'p131022']], H_controls], ignore_index=True)
df = df.merge(attending_info, on='eid')
Death = Death.loc[:, ['eid', 'p40000_i0_Date_of_death']]
# Right join: participants without a death record are kept with a missing date
df = Death.merge(df, on='eid', how='right')

In [None]:
# Display the merged case/control table as the cell output.
df

In [None]:
# Parse the death date and the PD date into datetime values
for date_col in ('p40000_i0_Date_of_death', 'p131022'):
    df[date_col] = pd.to_datetime(df[date_col])

In [None]:
# Administrative end of follow-up: 1 January 2023
study_end = pd.to_datetime('2023-01-01')

In [None]:
# Stop date: the earlier of PD date and death date; study end when neither is recorded
df['stop'] = df[['p131022', 'p40000_i0_Date_of_death']].min(axis=1).fillna(study_end)

In [None]:
# Preserve the full stop date before 'stop' is reduced to a calendar year
df = df.assign(complete_stop_date=df['stop'])

In [None]:
# Follow-up starts in the recruitment year
df = df.rename(columns={'recruit_year': 'start'})

In [None]:

# Reduce the stop date to its calendar year and add the PD year as a new column
df = df.assign(
    stop=df['stop'].dt.year,
    p131022_year=df['p131022'].dt.year,
)

In [None]:
# Display the table after the year columns were added.
df

In [None]:

# Keep only participants whose stop year lies strictly after their start year
df = df.loc[df['start'].lt(df['stop'])]

In [None]:
# Remove prevalent cases: keep participants without a PD year, or with a PD year
# that is not earlier than the start year
df = df.loc[df['p131022_year'].isna() | df['p131022_year'].ge(df['start'])]

In [None]:
# Inspection only: shows the rows whose PD year is missing or not later than the stop year.
# The result is not assigned, so df itself is left unchanged by this cell.
df[(df['p131022_year'].isna()) | (df['p131022_year'] <= df['stop'])]


In [None]:

# Follow-up length in years: counted up to the PD year when there is one,
# otherwise up to the stop year
df['duration'] = df['p131022_year'].fillna(df['stop']) - df['start']


In [None]:
# Event coding for the Fine-Gray model, checked in this order:
#   1 = PD date present, 2 = no PD date but a death date, 0 = neither (censored)
df['event_type'] = np.select(
    [df['p131022'].notna(), df['p40000_i0_Date_of_death'].notna()],
    [1, 2],
    default=0,
)

In [None]:
# Attach the PD polygenic risk score; participants without a score drop out (inner join)
df = df.merge(PD_PRS.loc[:, ["eid", "SCORE1_AVG"]], on="eid")



In [None]:
# Reference distribution: PRS of the censored group (event_type == 0)
mean_controls = df.loc[df["event_type"] == 0, "SCORE1_AVG"].mean()
sd_controls = df.loc[df["event_type"] == 0, "SCORE1_AVG"].std()

# Standardise every participant's PRS against that reference
df["zSCORE"] = df["SCORE1_AVG"].sub(mean_controls).div(sd_controls)



In [None]:
# Exposure list: every column after the first one, minus the two outcome fields (PD and AD)
list_of_icd10 = list(df_icd10.columns[1:])
for outcome_field in ("p131022", "p131036"):
    list_of_icd10.remove(outcome_field)


In [None]:
# Dates before this cut-off are excluded by the Fine-Gray loop
cutoff_date = pd.Timestamp("1999-01-01")

# The modelling cell expects the prepared tables to be in memory.
# If they were saved earlier they can be reloaded instead, for example:
# df = pd.read_csv("df.csv")
# df_icd10 = pd.read_csv("df_icd10.csv")
# PC = pd.read_csv("PC.csv")
# APOE = pd.read_csv("APOE.csv")
# covariates = pd.read_csv("covariates.csv")

# list_of_icd10 = ['p130008', 'p130010', ...]

In [None]:
# Fine-Gray competing-risks models, one per ICD-10 exposure. PD is the event of
# interest (event_type 1), death without PD the competing event (2), 0 = censored.
# Each model is adjusted for age, sex, Townsend index and PCs 1-5; the statistics of
# the exposure term are stored in dic_fg_results under the exposure's field id.
dic_fg_results = {}
cutoff_date = pd.Timestamp("1999-01-01")

# R code run for every exposure; it reads r_df and covariate_cols from the R global
# environment and returns a named list of per-coefficient statistics.
fine_gray_r_code = '''
    library(cmprsk)
    f <- as.formula(paste("~", paste(covariate_cols, collapse = " + ")))
    X <- model.matrix(f, data = r_df)[, -1, drop = FALSE]

    # Keep a full-rank subset of the columns (QR computed once, column order kept)
    qr_X <- qr(X)
    X <- X[, sort(qr_X$pivot[seq_len(qr_X$rank)]), drop = FALSE]

    # The caller reads the first coefficient as the exposure effect, so the
    # exposure must still be the first column after the rank filter
    if (ncol(X) == 0 || colnames(X)[1] != covariate_cols[1]) {
        stop("exposure column was dropped as collinear")
    }

    fg_model <- crr(
        ftime = r_df$duration,
        fstatus = r_df$event_type,
        cov1 = X,
        failcode = 1,
        cencode = 0
    )
    se <- sqrt(diag(fg_model$var))
    z_scores <- fg_model$coef / se

    # Use the lower tail directly: 1 - pnorm(|z|) rounds to exactly 0 for large |z|,
    # which would give p = 0 and log10p = Inf
    p_vals <- 2 * pnorm(-abs(z_scores))
    log10p <- -(log(2) + pnorm(-abs(z_scores), log.p = TRUE)) / log(10)

    list(
        coef = fg_model$coef,
        se = se,
        z = z_scores,
        p = p_vals,
        HR = exp(fg_model$coef),
        CI_lower = exp(fg_model$coef - 1.96 * se),
        CI_upper = exp(fg_model$coef + 1.96 * se),
        log10p = log10p
    )
'''


for var in list_of_icd10:

    print(f"Running Fine-Gray for {var}")

    try:
        # Merge all required data (inner joins: only participants present in every table remain)
        t = pd.merge(df[["eid", "duration", "event_type", "p131022", "start"]],
                     df_icd10[["eid", var]], on="eid")
        t = pd.merge(t, covariates[["eid", "Age_at_recruitment", "Townsend_deprivation_index", "sex"]], on="eid")
        t = pd.merge(t, PC, on="eid")

        # Convert dates and drop participants whose exposure or PD date precedes the cut-off
        t["p131022"] = pd.to_datetime(t["p131022"])
        t[var] = pd.to_datetime(t[var])
        t = t[~(t[var] < cutoff_date)]
        # Copy so the assignments below modify an independent frame, not a filtered view
        t = t[~(t["p131022"] < cutoff_date)].copy()

        # An exposure dated on or after the PD date does not count as an exposure
        t.loc[t[var] >= t["p131022"], var] = pd.NaT

        # Binary exposure indicator: 1 if a (remaining) date exists
        t[var] = (~t[var].isna()).astype(int)

        # Skip exposures with fewer than 3 exposed participants or at most 1 exposed PD event
        if t[var].sum() < 3 or t[t[var] == 1]["event_type"].eq(1).sum() <= 1:
            print(f"Skipping {var} (too few cases)")
            continue

        # Model columns; the exposure comes first among the covariates (cols[2:])
        cols = ["duration", "event_type", var, "Age_at_recruitment", "sex", "Townsend_deprivation_index",
                "p22009_a1", "p22009_a2", "p22009_a3", "p22009_a4", "p22009_a5"]
        t_r = t[cols].dropna().copy()

        # Push to R
        ro.globalenv["r_df"] = pandas2ri.py2rpy(t_r)
        ro.globalenv["covariate_cols"] = ro.StrVector(cols[2:])

        # Run Fine-Gray in R
        res = ro.r(fine_gray_r_code)

        # Save the statistics of the first coefficient (the exposure)
        dic_fg_results[var] = {
            "coef": np.array(res.rx2('coef'))[0],
            "HR": np.array(res.rx2('HR'))[0],
            "se": np.array(res.rx2('se'))[0],
            "z": np.array(res.rx2('z'))[0],
            # stored as a scientific-notation string to keep all digits
            "p": "{:.16e}".format(res.rx2('p')[0]),
            "log10_p": float(res.rx2('log10p')[0]),
            "CI_lower": np.array(res.rx2('CI_lower'))[0],
            "CI_upper": np.array(res.rx2('CI_upper'))[0],
            "N": t_r.shape[0],
            "N_pairs": int(((t_r[var] == 1) & (t_r["event_type"] == 1)).sum())
        }

    except Exception as e:
        # Best effort: a failing exposure is reported and the loop moves on
        print(f"Error in {var}: {e}")


In [None]:
# Build the summary table for the PD Fine-Gray models, add FDR-adjusted p-values
# and write the table to fg_PD_results.csv.
# Rows are collected per field id: keying them by description would silently
# overwrite an earlier row whenever two field ids share the same description.
pd_table_fg = {}

for var, res in dic_fg_results.items():
    field_id = var[1:]  # column label in the conversion table = field id without the leading "p"
    if field_id not in icd10_conversion:
        continue  # Skip if not in mapping

    name = icd10_conversion[field_id][1]
    code = icd10_conversion[field_id][0]

    pd_table_fg[var] = [
        var,
        code,
        "PD",
        name,
        # np.ravel(...)[0] accepts a scalar as well as a list/array and takes its first value
        np.ravel(res['HR'])[0],
        np.ravel(res['CI_lower'])[0],
        np.ravel(res['CI_upper'])[0],
        res['p'],  # scientific notation string
        res['log10_p'],
        res['N_pairs'],
        res['N']
    ]


output_fg = pd.DataFrame.from_dict(
    pd_table_fg,
    orient='index',
    columns=('code', 'ICD10_CODE', 'NDD', 'Description', 'HR', 'ci_min', 'ci_max', 'P_VAL','log10_p', 'N_pairs', 'n')
)
# Keep the description as the row label of the displayed table
output_fg.index = output_fg['Description'].tolist()

# Convert p-values to float
p_values = output_fg['P_VAL'].astype(float).to_numpy()

# Apply FDR correction to the finite p-values only: a NaN cannot be ranked and
# would otherwise spread to the adjusted values of the other tests. Rows without
# a usable p-value get NaN; an empty table skips the correction.
rejected = np.zeros(p_values.shape, dtype=bool)
pvals_corrected = np.full(p_values.shape, np.nan)
finite = np.isfinite(p_values)
if finite.any():
    rejected[finite], pvals_corrected[finite] = fdrcorrection(p_values[finite])

# Add corrected p-values to the dataframe
output_fg['P_VAL_FDR_CORRECTED'] = pvals_corrected

output_fg.to_csv("fg_PD_results.csv", index=False)