# UKB MODELS

In [None]:
# Install the modelling dependencies into the notebook kernel; lifelines is
# pinned to version 0.26.4.
!pip install statsmodels
!pip install lifelines==0.26.4

In [None]:
# Imports here.
import numpy as np
import pandas as pd
import os
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import fdrcorrection
from lifelines import CoxPHFitter

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Fetch 'data/files_for_cox/' with the dx CLI; the cells below read their
# inputs from the local 'files_for_cox/' directory.
!dx download -r 'data/files_for_cox/'

In [None]:
# Study window boundaries, kept as 'YYYY-MM-DD' strings.
STUDY_START, STUDY_ENDS = "1999-01-01", "2023-09-30"

In [None]:
# Participants with Welsh data (data-field 26426), read from the downloaded
# CSV and kept as a plain list of the values in its 'eid' column.
w = pd.read_csv("files_for_cox/Wales_dep_n21182_participant.csv")
wales_list = [*w["eid"]]

## Timeline 1999-2023

In [None]:
# Outcome labels: each one names both an input file prefix and the event
# column used by the models below.
ndd_list = "AD DEM PD VAS MS ALS".split()

# Sleep-related ICD-10 codes (taken from the prep notebook); every code gets
# the '_DATE' suffix used in the prepared column names.
codes = [
    f"{icd10}_DATE"
    for icd10 in (
        "F51.4", "F51.8", "F51.3", "F51.5", "F51.9", "F51.0", "F51",
        "G47.9", "G47.8", "G47.1", "G47.4", "G47.0", "G47.3", "G47.2", "G47",
    )
]

In [None]:
# Fit one Cox proportional-hazards model per (outcome, sleep code) pair and
# collect the hazard ratio, 95% CI and p-value of the exposure term in `cox1`.
#
# For every outcome in `ndd_list` the matching '<ndd>_with_tenure_lags_45.csv'
# file is loaded. Each model uses 'tenure' as the duration, the outcome column
# as the event indicator, and adjusts for the columns in ADJUSTMENT_COLS. The
# exposure is the 'QC<lag>_<code>' column.
timeline = '1999-2023'
model = 'COX'
lag = "0"

# Covariates included in every model next to the exposure indicator.
ADJUSTMENT_COLS = ['TOWNSEND', 'AGE_OF_RECRUIT', 'GENETIC_SEX']

# Set to True to drop the participants in `wales_list` before modelling.
# NOTE(review): the filter assumes the participant identifier column in these
# files is 'ID' (the Welsh list itself is read from an 'eid' column) - confirm
# before enabling.
EXCLUDE_WALES = False

results = []

for ndd in ndd_list:

    #Load df
    df = pd.read_csv(f'files_for_cox/{ndd}_with_tenure_lags_45.csv', parse_dates = True)

    if EXCLUDE_WALES:
        df = df[~df['ID'].isin(wales_list)]

    # First pass: decide which codes can be modelled and cache their counts so
    # they are not recomputed when fitting.
    #   n       - sum of the exposure column
    #   n_pairs - sum of the outcome column among rows where the exposure == 1
    # (presumably both columns are 0/1 indicators, making these head counts;
    # Series.sum() skips missing values.)
    codes_with_data = []
    pair_counts = {}
    for code in codes:
        exposure_col = f'QC{lag}_{code}'
        exposed = df[exposure_col]
        n = exposed.sum()
        n_pairs = df.loc[exposed == 1, ndd].sum()

        # No exposure at all, or too few exposed events: skip without a record.
        if n == 0 or n_pairs < 5:
            continue

        # Exposure sum equals the exposed-event sum: no model is fitted for
        # this pair, so it is recorded with NaN estimates instead.
        if n == n_pairs:
            print(code, ndd, np.nan, np.nan, np.nan, np.nan, n_pairs, n)
            results.append((code, ndd, model, timeline, lag, np.nan, np.nan, np.nan, np.nan, n_pairs, n))
            continue

        codes_with_data.append(code)
        pair_counts[code] = (n_pairs, n)

    # Second pass: fit the models for the codes that passed the checks above.
    for code in codes_with_data:
        exposure_col = f'QC{lag}_{code}'
        n_pairs, n = pair_counts[code]
        m = df[ADJUSTMENT_COLS + ['tenure', ndd, exposure_col]]

        cph = CoxPHFitter()
        cph.fit(m, duration_col = 'tenure', event_col = ndd, show_progress=False, step_size = 0.1)

        # Select the exposure row by covariate name rather than by position, so
        # the estimate stays correct if the adjustment set changes. The 'p'
        # column of the public summary table replaces the private
        # _compute_p_values() call.
        test = cph.summary.loc[exposure_col]

        HR = test['exp(coef)']
        ci_min = test['exp(coef) lower 95%']
        ci_max = test['exp(coef) upper 95%']
        p = test['p']

        print(code, ndd, HR, ci_min, ci_max, p, n_pairs, n)
        results.append((code, ndd, model, timeline, lag, HR, ci_min, ci_max, p, n_pairs, n))

cox1 = pd.DataFrame(results, columns=('PRIOR','OUTCOME', 'MODEL','TIMELINE', 'LAG', 'HR', 'ci_min', "ci_max", 'P_VAL', "N_pairs", "N"))

In [None]:
# Stack the per-model result tables (currently only `cox1`).
output = pd.concat([cox1])

# Order by ascending p-value and drop every row containing a missing value,
# which removes the pairs that were recorded with NaN estimates.
output = output.sort_values(by="P_VAL").dropna()

# FDR correction with statsmodels' default settings; the p-values are already
# sorted, hence is_sorted=True. Adjusted p-values and the reject flags are
# stored alongside the raw results.
rejected, p_corr = fdrcorrection(output["P_VAL"], is_sorted=True)
output = output.assign(P_CORR=p_corr, REJECTED=rejected)

output

In [None]:
# Load the ICD-10 code descriptions and add a 'DATE' key ('<ICD10>_DATE') that
# matches the code names stored in the results table.
# NOTE: this rebinds `codes`, which earlier held the list of date-column names
# iterated by the Cox loop; re-run that earlier cell before re-running the
# models.
codes = pd.read_csv("files_for_cox/sleep_ICD10_codes_with_description.csv").assign(
    DATE=lambda table: table["ICD10"] + "_DATE"
)
codes

In [None]:
# Attach the code descriptions to the corrected results (left join on the
# '<ICD10>_DATE' key), tag the cohort, keep the reporting columns in their
# final order, and relabel 'Description' -> 'PRIOR', 'REJECTED' -> 'SIGNIFICANT'.
final_results = (
    output.merge(codes, how="left", left_on="PRIOR", right_on="DATE")
    .assign(COHORT="UKB")
    .loc[
        :,
        [
            "ICD10", "Description", "OUTCOME", "COHORT",
            "HR", "ci_min", "ci_max", "P_VAL",
            "N_pairs", "N", "P_CORR", "REJECTED",
        ],
    ]
    .rename(columns={"Description": "PRIOR", "REJECTED": "SIGNIFICANT"})
)
final_results

In [None]:
# Write the final table to CSV with a header row and without the index.
# NOTE(review): the filename says 'no_wales', but the Welsh-participant
# exclusion in the Cox cell is not active as written - confirm which cohort
# this table reflects.
final_results.to_csv("SLEEP_Supplementary_Table_2_45_no_wales.csv", index=False, header=True)

In [None]:
# Upload the CSV written above to /results/ with the dx CLI.
!dx upload SLEEP_Supplementary_Table_2_45_no_wales.csv --path /results/SLEEP_Supplementary_Table_2_45_no_wales.csv

## Pull out significant and replicated results

In [None]:
# ICD-10 codes treated as replicated; keep only the rows that are flagged
# significant and belong to one of these codes, ordered by outcome.
replicated_list = ["G47", "F51", "G47.3"]
fr = final_results.loc[
    (final_results["SIGNIFICANT"] == True)
    & final_results["ICD10"].isin(replicated_list)
].sort_values(by="OUTCOME")
fr