<div class="alert alert-block alert-info">
<ul>
<li>Main node: 16 CPUs, 60 GB RAM, 120 GB disk</li>
<li>Time and cost: \$0.77/h, ~4 h, \$3.08</li>
</ul>
</div>

In [1]:
import tqdm
from datetime import datetime, timedelta, timezone
import numpy as np
import pandas as pd
import matplotlib as mpl
import seaborn as sb
import matplotlib.pyplot as plt
from datetime import datetime
import math
from IPython.display import display, HTML
from datetime import date
import multiprocessing

import os
import statsmodels.api as sm

import pickle

In [3]:
# Update to sex_specific phecodes 

class PheWAS_Pool:
    """
    Holds the merged data and settings needed to run a PheWAS.
    ======================================================================================================
    phecode_counts: Pandas DataFrame of phecodes; must have a "person_id" column and, when
                    phecode_process is 'all', a "cid" column
    covariates: Pandas DataFrame of covariates, joined to phecode_counts on "person_id"
    indep_var_of_interest: String naming the covariate column that is the independent variable
                           of interest; it is placed first in both regressor lists
    CDR_version: String indicating CDR version (stored only)
    phecode_process: 'all' to use every unique value of phecode_counts["cid"], otherwise the
                     list of phecodes to process
    min_cases: minimum number of cases for an individual phenotype to be analyzed
    independent_var_names: covariate columns used as regressors
    genderspec_independent_var_names: covariate columns stored for sex-specific phecodes
    show_res: flag stored on the instance as show_res
    cores: number of cores to use, or "" if unspecified (stored only)
    """
    def __init__(self, phecode_counts,
                 covariates,
                 indep_var_of_interest="",
                 CDR_version='R2019Q4R3',
                 phecode_process = 'all',
                 min_cases = 100,
                 independent_var_names=["AF","white","Asian","male","age_at_last_event",
                                        "ehr_length","code_cnt", "unk_sex", "race_unk", "hisp_lat","unk_eth"],
                 genderspec_independent_var_names=["AF","white","Asian","age_at_last_event","unk_sex","male",
                                                   "ehr_length","code_cnt","race_unk","hisp_lat","unk_eth"],
                 show_res = False,
                 cores=""):
        print("~~~~~~~~~~~~~~~        Creating PheWAS AOU Object           ~~~~~~~~~~~~~~~~~~~~~")
        # Plain settings, stored as given.
        self.indep_var_of_interest = indep_var_of_interest
        self.CDR_version = CDR_version
        self.cores = cores
        self.show_res = show_res
        self.min_cases = min_cases

        # Only process phecodes that are present in the counts table unless an
        # explicit list was supplied.
        if phecode_process == 'all':
            requested_phecodes = phecode_counts["cid"].unique().tolist()
        else:
            requested_phecodes = phecode_process
        self.phecode_list = requested_phecodes

        print("~~~~~~~~~~~~~~~       Merging Phecodes and Covariates       ~~~~~~~~~~~~~~~~~~~~~")
        self.demo_patients_phecodes = pd.merge(covariates, phecode_counts, on=["person_id"])

        # The variable of interest always leads each regressor list.
        leading_var = np.array([self.indep_var_of_interest])
        self.independent_var_names = list(np.append(leading_var, independent_var_names))
        self.genderspec_independent_var_names = list(
            np.append(leading_var, genderspec_independent_var_names)
        )
        # Columns used to collapse the merged table to one row per person.
        self.remove_dup = list(np.append(np.array(["person_id"]), self.independent_var_names))

    def runPheLogit(self, phecodes):
        """Placeholder for the regression routine; does nothing and returns None."""
        return None

In [5]:
def runPheLogit(self, phecodes):
    """
    Fit one logistic regression per phecode and store the results on ``self``.

    For every phecode with at least ``self.min_cases`` cases, case status (1 = case,
    0 = control) is regressed on the columns in ``self.independent_var_names`` plus an
    intercept, and the estimate for ``self.indep_var_of_interest`` is recorded.

    Parameters
    ----------
    self : object exposing ``demo_patients_phecodes`` (DataFrame), ``independent_var_names``,
        ``remove_dup``, ``min_cases``, ``show_res`` and ``indep_var_of_interest``.
    phecodes : iterable of phecodes to analyse.

    Side effects
    ------------
    self.return_dict : reset, then filled with ``phecode -> [phecode, n_cases, n_controls,
        n_cases with celiac == 1, n_controls with celiac == 1, p_value, beta, ci_lower,
        ci_upper, OR, OR_ci_lower, OR_ci_upper, converged]``. ``converged`` is the string
        "True"/"False".
    self.skipped_phecodes : reset, then filled with ``phecode -> message`` for every
        phecode that had fewer than ``self.min_cases`` cases.

    Prints the last phecode visited (nothing if ``phecodes`` is empty).
    """
    self.return_dict = {}
    self.skipped_phecodes = {}

    merged = self.demo_patients_phecodes
    regressor_cols = self.independent_var_names
    exposure = self.indep_var_of_interest
    visited_any = False

    for phecode in phecodes:
        visited_any = True

        # Cases: one row per person having this phecode. No minimum code count is
        # applied here.
        # NOTE(review): cases are selected on the "phecode" column while exclusions
        # below use "cid" -- confirm both columns exist and carry the same codes.
        cases = merged[merged["phecode"] == phecode]
        cases = cases[self.remove_dup].drop_duplicates()[regressor_cols]

        # Skip early so no control set is built for under-powered phecodes.
        if cases.shape[0] < self.min_cases:
            self.skipped_phecodes[phecode] = (
                "Error in Phecode: " + str(phecode)
                + ": Number of cases less than minimum of " + str(self.min_cases)
            )
            continue

        # Controls: everyone without any row for this phecode, one row per person.
        exclude = merged[merged["cid"] == phecode]
        control = merged[~merged.person_id.isin(exclude.person_id)]
        control = control[self.remove_dup].drop_duplicates()[regressor_cols]

        # NOTE(review): 'celiac' is hard-coded; presumably it equals
        # self.indep_var_of_interest in this analysis -- confirm before reuse.
        celiac_cases = cases[cases['celiac'] == 1]
        celiac_control = control[control['celiac'] == 1]

        ############################################################################################
        ## Perform Logistic regression with statsmodels
        ############################################################################################
        y = [1] * cases.shape[0] + [0] * control.shape[0]
        regressors = sm.tools.add_constant(pd.concat([cases, control]))
        result = sm.Logit(y, regressors, missing='drop').fit(disp=False, method='bfgs')

        # choose to see results on the fly
        if self.show_res:
            print(result.summary())

        # Read estimates straight from the results object rather than parsing the
        # HTML summary, which rounds to three decimals and depends on table layout.
        converged = str(result.mle_retvals["converged"])
        p_value = result.pvalues[exposure]
        beta_ind = result.params[exposure]
        conf_int_1, conf_int_2 = result.conf_int().loc[exposure]

        self.return_dict[phecode] = [
            phecode, cases.shape[0], control.shape[0],
            celiac_cases.shape[0], celiac_control.shape[0],
            p_value, beta_ind, conf_int_1, conf_int_2,
            np.exp(beta_ind), np.exp(conf_int_1), np.exp(conf_int_2),
            converged,
        ]

    # Guarded: `phecode` is unbound when `phecodes` is empty.
    if visited_any:
        print(phecode)

In [6]:
# Restore the previously pickled PheWAS_Pool object. pickle executes arbitrary code
# while loading, so only load files produced by a trusted run of this workflow.
with open('phewas_pool_v2.pkl', 'rb') as pickled_pool:
    test_pool = pickle.load(pickled_pool)


In [None]:
# Define the logit function on the unpickled object by replacing its runPheLogit
# attribute with the notebook-level function. A plain function stored on an instance
# is not bound to it, so the object must be passed explicitly as the first argument
# when it is called.
test_pool.runPheLogit = runPheLogit
# Do not print the regression summary for each phecode.
test_pool.show_res = False

In [8]:
#Phewas for 1763 conditions; takes 4 hours
# test_pool is passed explicitly as `self` because runPheLogit was assigned as an
# instance attribute (an unbound function). Results are stored in test_pool.return_dict.
test_pool.runPheLogit(test_pool,test_pool.phecode_list)

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))

KeyboardInterrupt



In [None]:
#annotate phewas results
# Each value in test_pool.return_dict is the 13-element list written by runPheLogit:
# counts, p-value, coefficient with its 95% CI, odds ratio with its 95% CI, and the
# convergence flag. All 13 must be named; assigning fewer names raises a
# length-mismatch error.
# conf_int_1 is the 2.5% bound and conf_int_2 the 97.5% bound, so the odds-ratio
# bounds are labelled lower then upper in that order.
result_columns = ["phecode", "cases", "control", "cases with CeD", "controls with CeD",
                  "p_value", "beta_ind", "conf_int_1", "conf_int_2",
                  "OR", "OR 95% lower", "OR 95% upper", "converged"]
logit_Phecode_results = pd.DataFrame(list(test_pool.return_dict.values()),
                                     columns=result_columns)
logit_Phecode_results["code_val"] = logit_Phecode_results["phecode"]
logit_Phecode_results["neg_p_log_10"] = -np.log10(logit_Phecode_results["p_value"])
# NOTE(review): my_bucket is not defined in the cells shown here -- confirm it is set
# earlier in the notebook before this cell runs.
logit_Phecode_results.to_csv(f'{my_bucket}/data/phewas/logit_phecode_results.csv',index=False)