In [0]:
##########################################################
## Import necessary packages
######################################################
import numpy as np
import pandas as pd
from tableone import TableOne

#########################################################################################################
# Please define the target Timestamp for the end date for the generation of COVID patients dataset
# Notice: Format: YYYY-MM-DD
# Current set to 2021-12-10
############################################################################################################
# check to make sure they are consistent
# Previous dates used:
# 1. "2021-12-31", "20211231"
# 2. "2022-02-18", "20220218"
# 3. end_date, file_date = "2021-12-25", "20211225"

start_date, end_date, file_date = "2020-03-01", "2021-12-25", "20211225_Lancet_bmi"
# register it so can be directly used in the following SQL statement
spark.conf.set("enddate.var", end_date)

In [0]:
# imids_training_df = spark.sql("""SELECT * FROM rdp_phi_sandbox.qw_IMID_COVID_trainset_vax_cond_med_geo_r5_{}""".format(file_date))

imids_training_df = spark.sql("""SELECT * FROM rdp_phi_sandbox.qw_IMID_COVID_trainset_cond_med_vax_{}""".format(file_date))
imids_training_df = imids_training_df.dropDuplicates()

# imids_training_df.limit(5).toPandas()

In [0]:
imids_training_df = imids_training_df.drop("pat_id").withColumnRenamed('race1', 'race')

imids_training_df = imids_training_df.select('patient_id', 'results', 'hospitalized_after_positive','IMV_after_positive','death_after_positive',
  'age', 'age_range', 'BMI', 'ethnicity', 'race', 'CVX_name', 'sex', 'Vaccination_status',
  'hypertension', 'diabetes_type1and2', 'atrial_fibrillation', 'coronary_artery_disease', 'heart_failure', 'chronic_kidney_disease', 'copd', 'obesity', 'chronic_liver_disease', 'malignant_neoplastic_disease', 'asthma', 'HIV', 'history_transplant', 'stroke', 'opioid_dependence',
 'ibd', 'rheumatoid_arthritis', 'multiple_sclerosis','psoriatic_arthritis', 'psoriasis', 'systemic_sclerosis', 'spondyloarthritis', 'systemic_lupus', 'vasculitis', 'sarcoidosis', 'APS', 'sjogren_syndrome'
 )

In [0]:
imids_training_pd_df = imids_training_df.toPandas()

## change the patient id col to be the index of the training data
imids_training_pd_df = imids_training_pd_df.set_index("patient_id")
display(imids_training_pd_df)

results,hospitalized_after_positive,IMV_after_positive,death_after_positive,age,age_range,BMI,ethnicity,race,CVX_name,sex,Vaccination_status,hypertension,diabetes_type1and2,atrial_fibrillation,coronary_artery_disease,heart_failure,chronic_kidney_disease,copd,obesity,chronic_liver_disease,malignant_neoplastic_disease,asthma,HIV,history_transplant,stroke,opioid_dependence,ibd,rheumatoid_arthritis,multiple_sclerosis,psoriatic_arthritis,psoriasis,systemic_sclerosis,spondyloarthritis,systemic_lupus,vasculitis,sarcoidosis,APS,sjogren_syndrome
Negative,0,0,0,60.0,50-74,38.60639276308138,Not Hispanic,AIAN,,Female,,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Negative,0,0,0,15.0,0-17,22.38172339066572,Hispanic,Other,,Female,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Negative,0,0,0,62.0,50-74,28.7594659571033,Not Hispanic,Other,Moderna COVID-19 Vaccine,Female,Fully,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Negative,0,0,0,70.0,50-74,30.95167695660835,Not Hispanic,White,Pfizer-BioNTech COVID-19 Vaccine,Male,Booster,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Negative,0,0,0,75.0,75+,48.18129442762514,Unknown,Unknown,Janssen COVID-19 Vaccine,Male,Booster,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Negative,0,0,0,36.0,18-49,17.799215345266138,Not Hispanic,White,,Male,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Positive,0,0,0,17.0,0-17,27.284075066537667,Hispanic,Other,Pfizer-BioNTech COVID-19 Vaccine,Female,Fully,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Negative,0,0,0,3.0,0-17,12.19385307520615,Not Hispanic,Black,,Male,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Negative,0,0,0,33.0,18-49,23.411246822493645,Not Hispanic,Asian,,Female,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Negative,0,0,0,31.0,18-49,29.694543915541384,Not Hispanic,White,,Female,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
## Find the non_IMID_df
non_IMID_df = imids_training_pd_df.loc[( (imids_training_pd_df["rheumatoid_arthritis"] == 0)&(imids_training_pd_df["spondyloarthritis"] == 0)&(imids_training_pd_df["systemic_lupus"] == 0)\
                                                  &(imids_training_pd_df["psoriatic_arthritis"] == 0)&(imids_training_pd_df["systemic_sclerosis"] == 0)\
                                      &(imids_training_pd_df["psoriasis"] == 0)&(imids_training_pd_df["ibd"] == 0)&(imids_training_pd_df["multiple_sclerosis"] == 0)\
                                                  &(imids_training_pd_df["vasculitis"] == 0)&(imids_training_pd_df["sarcoidosis"] == 0)&(imids_training_pd_df["APS"] == 0)\
                                       &(imids_training_pd_df["sjogren_syndrome"] == 0)\
                                      )]
## Find the IMID_df
IMID_df = imids_training_pd_df.loc[( (imids_training_pd_df["rheumatoid_arthritis"] == 1)|(imids_training_pd_df["spondyloarthritis"] == 1)|(imids_training_pd_df["systemic_lupus"] == 1)\
                                                  |(imids_training_pd_df["psoriatic_arthritis"] == 1)|(imids_training_pd_df["systemic_sclerosis"] == 1)\
                                      |(imids_training_pd_df["psoriasis"] == 1)|(imids_training_pd_df["ibd"] == 1)|(imids_training_pd_df["multiple_sclerosis"] == 1)\
                                                  |(imids_training_pd_df["vasculitis"] == 1)|(imids_training_pd_df["sarcoidosis"] == 1)|(imids_training_pd_df["APS"] == 1)\
                                       |(imids_training_pd_df["sjogren_syndrome"] == 1)\
                                      )]

In [0]:
# Create the empty IMID count table
pos_table_df = pd.DataFrame(columns=["Outcome", "IMID", "Non_IMID"], index=range(2))
pos_table_df['Outcome'] = ["Positive", "Negative"]

pos_table_df = pos_table_df.astype(str)
# display(pos_table_df)

pos_IMID_df = IMID_df.loc[(IMID_df["results"] == "Positive")]
pos_nonIMID_df = non_IMID_df.loc[(non_IMID_df["results"] == "Positive")]

## Calculate the pospitalized part
row_num = 0
pos_table_df.iloc[row_num, pos_table_df.columns.get_loc("IMID")] = str(len(pos_IMID_df.index))
pos_table_df.iloc[row_num, pos_table_df.columns.get_loc("Non_IMID")] = str(len(pos_nonIMID_df.index))

## Calculate the non pos part
row_num = 1
pos_table_df.iloc[row_num, pos_table_df.columns.get_loc("IMID")] = str(len(IMID_df.index) - len(pos_IMID_df.index))
pos_table_df.iloc[row_num, pos_table_df.columns.get_loc("Non_IMID")] = str(len(non_IMID_df.index) - len(pos_nonIMID_df.index))

display(pos_table_df)

# make column index
pos_table_df.set_index('Outcome', inplace=True)

Outcome,IMID,Non_IMID
Positive,8070,161923
Negative,75427,1271875


In [0]:
import scipy.stats as stats

## Conver the table to a numpy arrary
pos_np_array = pos_table_df.to_numpy(dtype='uint64')

# performing fishers exact test on the data
odd_ratio, p_value = stats.fisher_exact(pos_np_array)

print('odd ratio is : ' + str(odd_ratio))
print('p_value is : ' + str(p_value))

In [0]:
imids_pos_df = imids_training_pd_df.loc[(imids_training_pd_df["results"] == "Positive")]
display(imids_pos_df)

results,hospitalized_after_positive,IMV_after_positive,death_after_positive,age,age_range,BMI,ethnicity,race,CVX_name,sex,Vaccination_status,hypertension,diabetes_type1and2,atrial_fibrillation,coronary_artery_disease,heart_failure,chronic_kidney_disease,copd,obesity,chronic_liver_disease,malignant_neoplastic_disease,asthma,HIV,history_transplant,stroke,opioid_dependence,ibd,rheumatoid_arthritis,multiple_sclerosis,psoriatic_arthritis,psoriasis,systemic_sclerosis,spondyloarthritis,systemic_lupus,vasculitis,sarcoidosis,APS,sjogren_syndrome
Positive,0,0,0,17.0,0-17,27.284075066537667,Hispanic,Other,Pfizer-BioNTech COVID-19 Vaccine,Female,Fully,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Positive,0,0,0,36.0,18-49,34.0892345261253,Not Hispanic,White,Pfizer-BioNTech COVID-19 Vaccine,Female,Booster,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Positive,0,0,0,29.0,18-49,24.902644681653808,Not Hispanic,White,Moderna COVID-19 Vaccine,Female,Fully,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Positive,0,0,0,56.0,50-74,24.96102980371641,Not Hispanic,Black,Moderna COVID-19 Vaccine,Female,Fully,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Positive,0,0,0,20.0,18-49,28.50080197114816,Hispanic,Other,Pfizer-BioNTech COVID-19 Vaccine,Male,Fully,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Positive,1,0,0,66.0,50-74,27.435054870109735,Not Hispanic,AIAN,,Female,,1,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
Positive,0,0,0,30.0,18-49,25.79306413050696,Not Hispanic,White,,Male,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Positive,1,0,0,36.0,18-49,32.76493765597761,Hispanic,Other,,Female,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Positive,1,0,0,66.0,50-74,29.345224520898874,Not Hispanic,White,Moderna COVID-19 Vaccine,Female,Fully,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Positive,1,0,0,76.0,75+,30.865702475349597,Not Hispanic,White,,Male,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
## Find the non_IMID_df
non_IMID_df = imids_pos_df.loc[( (imids_pos_df["rheumatoid_arthritis"] == 0)&(imids_pos_df["spondyloarthritis"] == 0)&(imids_pos_df["systemic_lupus"] == 0)\
                                                  &(imids_pos_df["psoriatic_arthritis"] == 0)&(imids_pos_df["systemic_sclerosis"] == 0)\
                                      &(imids_pos_df["psoriasis"] == 0)&(imids_pos_df["ibd"] == 0)&(imids_pos_df["multiple_sclerosis"] == 0)\
                                                  &(imids_pos_df["vasculitis"] == 0)&(imids_pos_df["sarcoidosis"] == 0)&(imids_pos_df["APS"] == 0)\
                                       &(imids_pos_df["sjogren_syndrome"] == 0)\
                                      )]
## Find the IMID_df
IMID_df = imids_pos_df.loc[( (imids_pos_df["rheumatoid_arthritis"] == 1)|(imids_pos_df["spondyloarthritis"] == 1)|(imids_pos_df["systemic_lupus"] == 1)\
                                                  |(imids_pos_df["psoriatic_arthritis"] == 1)|(imids_pos_df["systemic_sclerosis"] == 1)\
                                      |(imids_pos_df["psoriasis"] == 1)|(imids_pos_df["ibd"] == 1)|(imids_pos_df["multiple_sclerosis"] == 1)\
                                                  |(imids_pos_df["vasculitis"] == 1)|(imids_pos_df["sarcoidosis"] == 1)|(imids_pos_df["APS"] == 1)\
                                       |(imids_pos_df["sjogren_syndrome"] == 1)\
                                      )]

## P1: Hospitalization

In [0]:
# Create the empty IMID count table
hos_table_df = pd.DataFrame(columns=["Outcome", "IMID", "Non_IMID"], index=range(2))
hos_table_df['Outcome'] = ["Hospitalized", "Not_hospitalized"]

hos_table_df = hos_table_df.astype(str)
display(hos_table_df)

Outcome,IMID,Non_IMID
Hospitalized,,
Not_hospitalized,,


In [0]:
hospitalized_IMID_df = IMID_df.loc[(IMID_df["hospitalized_after_positive"] == 1)]
hospitalized_nonIMID_df = non_IMID_df.loc[(non_IMID_df["hospitalized_after_positive"] == 1)]

## Calculate the hospitalized part
row_num = 0
hos_table_df.iloc[row_num, hos_table_df.columns.get_loc("IMID")] = str(len(hospitalized_IMID_df.index))
hos_table_df.iloc[row_num, hos_table_df.columns.get_loc("Non_IMID")] = str(len(hospitalized_nonIMID_df.index))

## Calculate the non hos part
row_num = 1
hos_table_df.iloc[row_num, hos_table_df.columns.get_loc("IMID")] = str(len(IMID_df.index) - len(hospitalized_IMID_df.index))
hos_table_df.iloc[row_num, hos_table_df.columns.get_loc("Non_IMID")] = str(len(non_IMID_df.index) - len(hospitalized_nonIMID_df.index))

display(hos_table_df)

# make column index
hos_table_df.set_index('Outcome', inplace=True)

Outcome,IMID,Non_IMID
Hospitalized,1176,22154
Not_hospitalized,6894,139769


In [0]:
import scipy.stats as stats

## Conver the table to a numpy arrary
hos_np_array = hos_table_df.to_numpy(dtype='uint64')

# performing fishers exact test on the data
odd_ratio, p_value = stats.fisher_exact(hos_np_array)

print('odd ratio is : ' + str(odd_ratio))
print('p_value is : ' + str(p_value))

## P2: IMV

In [0]:
# Create the empty IMID count table
IMV_table_df = pd.DataFrame(columns=["Outcome", "IMID", "Non_IMID"], index=range(2))
IMV_table_df['Outcome'] = ["Hospitalized", "Not_hospitalized"]

IMV_table_df = IMV_table_df.astype(str)
# display(hos_table_df)

IMV_IMID_df = IMID_df.loc[(IMID_df["IMV_after_positive"] == 1)]
IMV_nonIMID_df = non_IMID_df.loc[(non_IMID_df["IMV_after_positive"] == 1)]

## Calculate the hospitalized part
row_num = 0
IMV_table_df.iloc[row_num, IMV_table_df.columns.get_loc("IMID")] = str(len(IMV_IMID_df.index))
IMV_table_df.iloc[row_num, IMV_table_df.columns.get_loc("Non_IMID")] = str(len(IMV_nonIMID_df.index))

## Calculate the non hos part
row_num = 1
IMV_table_df.iloc[row_num, IMV_table_df.columns.get_loc("IMID")] = str(len(IMID_df.index) - len(IMV_IMID_df.index))
IMV_table_df.iloc[row_num, IMV_table_df.columns.get_loc("Non_IMID")] = str(len(non_IMID_df.index) - len(IMV_nonIMID_df.index))

display(IMV_table_df)

# make column index
IMV_table_df.set_index('Outcome', inplace=True)

Outcome,IMID,Non_IMID
Hospitalized,51,1021
Not_hospitalized,8019,160902


In [0]:
import scipy.stats as stats

## Conver the table to a numpy arrary
IMV_np_array = IMV_table_df.to_numpy(dtype='uint64')

# performing fishers exact test on the data
odd_ratio, p_value = stats.fisher_exact(IMV_np_array)

print('odd ratio is : ' + str(odd_ratio))
print('p_value is : ' + str(p_value))

## P3: death

In [0]:
# Create the empty IMID count table
death_table_df = pd.DataFrame(columns=["Outcome", "IMID", "Non_IMID"], index=range(2))
death_table_df['Outcome'] = ["Hospitalized", "Not_hospitalized"]

death_table_df = death_table_df.astype(str)
# display(hos_table_df)

death_IMID_df = IMID_df.loc[(IMID_df["death_after_positive"] == 1)]
death_nonIMID_df = non_IMID_df.loc[(non_IMID_df["death_after_positive"] == 1)]

## Calculate the hospitalized part
row_num = 0
death_table_df.iloc[row_num, death_table_df.columns.get_loc("IMID")] = str(len(death_IMID_df.index))
death_table_df.iloc[row_num, death_table_df.columns.get_loc("Non_IMID")] = str(len(death_nonIMID_df.index))

## Calculate the non hos part
row_num = 1
death_table_df.iloc[row_num, death_table_df.columns.get_loc("IMID")] = str(len(IMID_df.index) - len(death_IMID_df.index))
death_table_df.iloc[row_num, death_table_df.columns.get_loc("Non_IMID")] = str(len(non_IMID_df.index) - len(death_nonIMID_df.index))

display(death_table_df)

# make column index
death_table_df.set_index('Outcome', inplace=True)

Outcome,IMID,Non_IMID
Hospitalized,314,4980
Not_hospitalized,7756,156943


In [0]:
import scipy.stats as stats

## Conver the table to a numpy arrary
death_np_array = death_table_df.to_numpy(dtype='uint64')

# performing fishers exact test on the data
odd_ratio, p_value = stats.fisher_exact(death_np_array)

print('odd ratio is : ' + str(odd_ratio))
print('p_value is : ' + str(p_value))