In [0]:
# pip install tableone

In [0]:
##########################################################
## Import necessary packages
######################################################
import numpy as np
import pandas as pd
from tableone import TableOne

#########################################################################################################
# Analysis window for the COVID patients dataset.
# Notice: dates use the format YYYY-MM-DD
# The end date is currently set to 2021-12-25
############################################################################################################
# file_date is the suffix of the source table name; check that it is consistent with end_date.
# Previously used (end_date, file_date) pairs:
# 1. "2021-12-31", "20211231"
# 2. "2022-02-18", "20220218"
# 3. "2021-12-25", "20211225"

start_date = "2020-03-01"
end_date = "2021-12-25"
file_date = "20211225_Lancet_bmi"

# Register the end date as a Spark conf value so it can be referenced from SQL statements
spark.conf.set("enddate.var", end_date)

In [0]:
# Alternative source table (disabled):
#   rdp_phi_sandbox.qw_IMID_COVID_trainset_vax_cond_med_geo_r5_<file_date>

## Load the training set from the table whose name ends with the configured file_date suffix
training_set_query = """SELECT * FROM rdp_phi_sandbox.qw_IMID_COVID_trainset_cond_med_vax_{}""".format(file_date)
imids_training_df = spark.sql(training_set_query)

# imids_training_df.limit(5).toPandas()

In [0]:
## Remove duplicated rows (no column subset is given, so all columns are compared)
imids_training_df = imids_training_df.dropDuplicates()
# imids_training_df.limit(5).toPandas()

In [0]:
## Drop the columns that are not used
col_to_drop = ('race_v2', 'ethnicity_race')
imids_training_df = imids_training_df.drop(*col_to_drop)

## IMID drug flags stored as 'prior_91_days_<name>_logic' are renamed to plain '<name>'
prior_91_days_drugs = [
    'hydroxychloroquine', 'methotrexate', 'leflunomide_teriflunomide',
    'azathioprine', 'mercaptopurine', 'mitoxantrone',
    'mycophenolate', 'calcineurin_inhibitor', 'TNF_alpha_inhibitor',
    'fumarates', 'interferons', 'alkylating_agent',
    'hydroxyurea', 'dapsone', 'cladribine',
    'IL1_inhibitor', 'IL6_inhibitor', 'IL12_23_inhibitor',
    'IL17_inhibitor', 'IL23_inhibitor', 'abatacept',
    'anti_BLyS', 'S1P_receptor_modulator', 'JAK_inhibitor',
    'integrin_inhibitor', 'PDE4i_targeted_synthetic', 'anti_CD20',
    'anti_CD52', 'budesonide', 'systemic_glucocorticoids',
]
column_renames = {'prior_91_days_{}_logic'.format(drug): drug for drug in prior_91_days_drugs}

## Columns whose new name does not follow the pattern above
## (the source column for 5-ASA is spelled '5_ASAa')
column_renames.update({
    'prior_91_days_5_ASAa_logic': '5_ASA',
    'after_10_days_monoclonal_antibody_covid_19_logic': 'monoclonal_antibody_covid_19',
    'race1': 'race',
})

## The renames touch distinct columns, so they can be applied one after another in any order
for source_name, target_name in column_renames.items():
    imids_training_df = imids_training_df.withColumnRenamed(source_name, target_name)

In [0]:
## pat_id is not kept; patient_id is the identifier selected below
imids_training_df = imids_training_df.drop("pat_id")

## Columns kept for the analysis, listed in the order they appear in the output frame
id_and_outcome_cols = ['patient_id', 'results', 'hospitalized_after_positive', 'IMV_after_positive', 'death_after_positive']

demographic_cols = ['age', 'age_range', 'BMI', 'ethnicity', 'race', 'CVX_name', 'sex', 'Vaccination_status']

comorbidity_cols = [
    'hypertension', 'diabetes_type1and2', 'atrial_fibrillation', 'coronary_artery_disease', 'heart_failure',
    'chronic_kidney_disease', 'copd', 'obesity', 'chronic_liver_disease', 'malignant_neoplastic_disease',
    'asthma', 'HIV', 'history_transplant', 'stroke', 'opioid_dependence',
]

imid_cols = [
    'ibd', 'rheumatoid_arthritis', 'multiple_sclerosis', 'psoriatic_arthritis', 'psoriasis', 'systemic_sclerosis',
    'spondyloarthritis', 'systemic_lupus', 'vasculitis', 'sarcoidosis', 'APS', 'sjogren_syndrome',
]

medication_cols = [
    'hydroxychloroquine', 'methotrexate', 'leflunomide_teriflunomide', '5_ASA',
    'azathioprine', 'mercaptopurine', 'mitoxantrone', 'mycophenolate',
    'calcineurin_inhibitor', 'TNF_alpha_inhibitor', 'fumarates', 'interferons',
    'alkylating_agent', 'hydroxyurea', 'dapsone', 'cladribine',
    'IL1_inhibitor', 'IL6_inhibitor', 'IL12_23_inhibitor', 'IL17_inhibitor',
    'IL23_inhibitor', 'abatacept', 'anti_BLyS', 'S1P_receptor_modulator',
    'JAK_inhibitor', 'integrin_inhibitor', 'PDE4i_targeted_synthetic', 'anti_CD20',
    'anti_CD52', 'budesonide', 'systemic_glucocorticoids', 'monoclonal_antibody_covid_19',
]

selected_cols = id_and_outcome_cols + demographic_cols + comorbidity_cols + imid_cols + medication_cols
imids_training_df = imids_training_df.select(*selected_cols)

In [0]:
## Collect the Spark frame into pandas and use the patient id column as the index
imids_training_pd_df = imids_training_df.toPandas().set_index("patient_id")

display(imids_training_pd_df)

results,hospitalized_after_positive,IMV_after_positive,death_after_positive,age,age_range,BMI,ethnicity,race,CVX_name,sex,Vaccination_status,hypertension,diabetes_type1and2,atrial_fibrillation,coronary_artery_disease,heart_failure,chronic_kidney_disease,copd,obesity,chronic_liver_disease,malignant_neoplastic_disease,asthma,HIV,history_transplant,stroke,opioid_dependence,ibd,rheumatoid_arthritis,multiple_sclerosis,psoriatic_arthritis,psoriasis,systemic_sclerosis,spondyloarthritis,systemic_lupus,vasculitis,sarcoidosis,APS,sjogren_syndrome,hydroxychloroquine,methotrexate,leflunomide_teriflunomide,5_ASA,azathioprine,mercaptopurine,mitoxantrone,mycophenolate,calcineurin_inhibitor,TNF_alpha_inhibitor,fumarates,interferons,alkylating_agent,hydroxyurea,dapsone,cladribine,IL1_inhibitor,IL6_inhibitor,IL12_23_inhibitor,IL17_inhibitor,IL23_inhibitor,abatacept,anti_BLyS,S1P_receptor_modulator,JAK_inhibitor,integrin_inhibitor,PDE4i_targeted_synthetic,anti_CD20,anti_CD52,budesonide,systemic_glucocorticoids,monoclonal_antibody_covid_19
Negative,0,0,0,60.0,50-74,38.60639276308138,Not Hispanic,AIAN,,Female,,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Negative,0,0,0,15.0,0-17,22.38172339066572,Hispanic,Other,,Female,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Negative,0,0,0,62.0,50-74,28.7594659571033,Not Hispanic,Other,Moderna COVID-19 Vaccine,Female,Fully,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Negative,0,0,0,70.0,50-74,30.95167695660835,Not Hispanic,White,Pfizer-BioNTech COVID-19 Vaccine,Male,Booster,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Negative,0,0,0,75.0,75+,48.18129442762514,Unknown,Unknown,Janssen COVID-19 Vaccine,Male,Booster,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Negative,0,0,0,36.0,18-49,17.799215345266138,Not Hispanic,White,,Male,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Positive,0,0,0,17.0,0-17,27.284075066537667,Hispanic,Other,Pfizer-BioNTech COVID-19 Vaccine,Female,Fully,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Negative,0,0,0,3.0,0-17,12.19385307520615,Not Hispanic,Black,,Male,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Negative,0,0,0,33.0,18-49,23.411246822493645,Not Hispanic,Asian,,Female,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Negative,0,0,0,31.0,18-49,29.694543915541384,Not Hispanic,White,,Female,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Code for Table 1 in the poster

In [0]:
# Layout of the (initially empty) IMID count table: a label column followed by count / percentage columns
table1_columns = [
    "Characteristics",
    "Tested for COVID, n", "(% of total)",
    "COVID+, n", "(% of total tested)",
    "Hospitalized, n", "Hospitalized (% of COVID+)",
    "IMV, n", "IMV (% of COVID+)",
    "death, n", "death (% of COVID+)",
]
table1_row_labels = ["Total patients", "Without IMIDs", "With IMIDs", "Rheumatologic IMIDs", "Other IMIDs"]

# One row per cohort label
table1_df = pd.DataFrame(columns=table1_columns, index=range(len(table1_row_labels)))
table1_df['Characteristics'] = table1_row_labels

# Every cell is stored as text; the cells are filled with formatted strings later on
table1_df = table1_df.astype(str)
display(table1_df)

Characteristics,"Tested for COVID, n",(% of total),"COVID+, n",(% of total tested),"Hospitalized, n",Hospitalized (% of COVID+),"IMV, n",IMV (% of COVID+),"death, n",death (% of COVID+)
Total patients,,,,,,,,,,
Without IMIDs,,,,,,,,,,
With IMIDs,,,,,,,,,,
Rheumatologic IMIDs,,,,,,,,,,
Other IMIDs,,,,,,,,,,


In [0]:
## IMID indicator columns, grouped the way Table 1 reports them
## APS: antiphospholipid_syndrome
rheumatologic_imid_cols = ["rheumatoid_arthritis", "spondyloarthritis", "systemic_lupus",
                           "psoriatic_arthritis", "systemic_sclerosis",
                           "vasculitis", "sarcoidosis", "APS", "sjogren_syndrome"]
other_imid_cols = ["ibd", "multiple_sclerosis", "psoriasis"]
all_imid_cols = rheumatologic_imid_cols + other_imid_cols

## (outcome flag column, Table 1 count column, Table 1 percentage column)
table1_outcomes = [
    ("hospitalized_after_positive", "Hospitalized, n", "Hospitalized (% of COVID+)"),
    ("IMV_after_positive", "IMV, n", "IMV (% of COVID+)"),
    ("death_after_positive", "death, n", "death (% of COVID+)"),
]


def format_share(numerator, denominator):
    """Return numerator/denominator as a percentage with one decimal, or "N/A" if the denominator is 0."""
    if denominator == 0:
        # An empty cohort (or a cohort without positive patients) has no defined rate
        return "N/A"
    return "{:.1%}".format(numerator / denominator)


def summarize_cohort(cohort_df, n_total):
    """Compute the Table 1 cells for one cohort.

    Parameters
    ----------
    cohort_df : pandas.DataFrame
        Patients of the cohort; must contain the "results" column and the
        outcome flag columns listed in ``table1_outcomes``.
    n_total : int
        Number of patients in the full dataset (denominator of "(% of total)").

    Returns
    -------
    dict
        Maps each Table 1 column name (all except "Characteristics") to its cell text.
    """
    n_tested = len(cohort_df.index)
    positive_df = cohort_df.loc[cohort_df["results"] == "Positive"]
    n_positive = len(positive_df.index)

    cells = {
        "Tested for COVID, n": str(n_tested),
        "(% of total)": format_share(n_tested, n_total),
        "COVID+, n": str(n_positive),
        "(% of total tested)": format_share(n_positive, n_tested),
    }
    # Outcomes are counted among the positive patients only
    for outcome_col, count_label, share_label in table1_outcomes:
        n_outcome = int((positive_df[outcome_col] == 1).sum())
        cells[count_label] = str(n_outcome)
        cells[share_label] = format_share(n_outcome, n_positive)
    return cells


## One selector per Table 1 row, in the same order as the "Characteristics" labels of table1_df
imid_flags = imids_training_pd_df[all_imid_cols]
cohort_selectors = [
    slice(None),                                               # Total patients
    (imid_flags == 0).all(axis=1),                             # Without IMIDs: every IMID flag is 0
    (imid_flags == 1).any(axis=1),                             # With IMIDs: at least one IMID flag is 1
    (imid_flags[rheumatologic_imid_cols] == 1).any(axis=1),    # Rheumatologic IMIDs
    (imid_flags[other_imid_cols] == 1).any(axis=1),            # Other IMIDs
]

n_total_patients = len(imids_training_pd_df.index)
for row_num, cohort_selector in enumerate(cohort_selectors):
    ## Subset suited for the current row definition
    current_df = imids_training_pd_df.loc[cohort_selector]
    for column_name, cell_text in summarize_cohort(current_df, n_total_patients).items():
        table1_df.iloc[row_num, table1_df.columns.get_loc(column_name)] = cell_text

In [0]:
## Show Table 1 with the filled-in counts and percentages
display(table1_df)

Characteristics,"Tested for COVID, n",(% of total),"COVID+, n",(% of total tested),"Hospitalized, n",Hospitalized (% of COVID+),"IMV, n",IMV (% of COVID+),"death, n",death (% of COVID+)
Total patients,1517295,100.0%,169993,11.2%,23330,13.7%,1072,0.6%,5294,3.1%
Without IMIDs,1433798,94.5%,161923,11.3%,22154,13.7%,1021,0.6%,4980,3.1%
With IMIDs,83497,5.5%,8070,9.7%,1176,14.6%,51,0.6%,314,3.9%
Rheumatologic IMIDs,47979,3.2%,4781,10.0%,758,15.9%,29,0.6%,226,4.7%
Other IMIDs,41231,2.7%,3868,9.4%,504,13.0%,27,0.7%,120,3.1%


In [0]:
## Run above to regenerate Table 1's results

## Using Tableone package to acquire numbers for Table 2 in the poster

In [0]:
## List all column names of the pandas training frame
list(imids_training_pd_df.columns)

In [0]:
## Display labels for the columns whose name changes; columns not listed keep their name.
## Relabeling by name (instead of assigning one long positional list) keeps every label attached
## to the right column even if the upstream column order changes, and makes re-running this cell a no-op.
display_names = {
    'age_range': 'age group',
    # comorbidities
    'diabetes_type1and2': 'diabetes (type 1+2)',
    'atrial_fibrillation': 'atrial fibrillation',
    'coronary_artery_disease': 'coronary artery disease',
    'heart_failure': 'heart failure',
    'chronic_kidney_disease': 'chronic kidney disease',
    'copd': 'COPD',
    'chronic_liver_disease': 'chronic liver disease',
    'malignant_neoplastic_disease': 'malignant neoplastic disease',
    'history_transplant': 'history of transplant',
    'opioid_dependence': 'opioid dependence',
    # IMIDs
    'ibd': 'inflammatory bowel disease',
    'rheumatoid_arthritis': 'rheumatoid arthritis',
    'multiple_sclerosis': 'multiple sclerosis',
    'psoriatic_arthritis': 'psoriatic arthritis',
    'systemic_sclerosis': 'systemic sclerosis',
    'systemic_lupus': 'systemic lupus',
    'APS': 'antiphospholipid syndrome',
    'sjogren_syndrome': 'Sjögren syndrome',
    # medications
    'leflunomide_teriflunomide': 'leflunomide teriflunomide',
    '5_ASA': '5-ASA',
    'calcineurin_inhibitor': 'calcineurin inhibitor',
    'TNF_alpha_inhibitor': 'TNF-α inhibitor',
    'alkylating_agent': 'alkylating agent',
    'IL1_inhibitor': 'IL-1 inhibitor',
    'IL6_inhibitor': 'IL-6 inhibitor',
    'IL12_23_inhibitor': 'IL-12/23 inhibitor',
    'IL17_inhibitor': 'IL-17 inhibitor',
    'IL23_inhibitor': 'IL-23 inhibitor',
    'anti_BLyS': 'anti-BLyS',
    'S1P_receptor_modulator': 'S1P receptor modulator',
    'JAK_inhibitor': 'JAK inhibitor',
    'integrin_inhibitor': 'integrin inhibitor',
    'PDE4i_targeted_synthetic': 'PDE4i targeted synthetic',
    'anti_CD20': 'anti-CD20',
    'anti_CD52': 'anti-CD52',
    'systemic_glucocorticoids': 'systemic glucocorticoids',
    'monoclonal_antibody_covid_19': 'monoclonal antibody covid-19',
}

## Relabel in place (no copy of the data is made)
imids_training_pd_df.columns = [display_names.get(column_name, column_name) for column_name in imids_training_pd_df.columns]

In [0]:
## Label missing vaccine name / vaccination status as "No vaccinated".
## The filled column is assigned back to the frame: calling fillna(inplace=True) on a column
## selection is chained assignment, which is deprecated in pandas 2.x and does not update the
## frame when Copy-on-Write is enabled.
for vaccination_col in ("CVX_name", "Vaccination_status"):
    imids_training_pd_df[vaccination_col] = imids_training_pd_df[vaccination_col].fillna("No vaccinated")

In [0]:
## Give the two vaccination columns their display labels
vaccination_labels = {
    'CVX_name': 'Vaccination name',
    'Vaccination_status': 'Vaccination status',
}
imids_training_pd_df.rename(columns=vaccination_labels, inplace=True)

In [0]:
## Columns to be included in the summary table, grouped by theme
demographic_columns = ['age', 'age group', 'BMI', 'ethnicity', 'race', 'Vaccination name', 'sex', 'Vaccination status']

comorbidity_columns = [
    'hypertension', 'diabetes (type 1+2)', 'atrial fibrillation', 'coronary artery disease', 'heart failure',
    'chronic kidney disease', 'COPD', 'obesity', 'chronic liver disease', 'malignant neoplastic disease',
    'asthma', 'HIV', 'history of transplant', 'stroke', 'opioid dependence',
]

imid_columns = [
    'inflammatory bowel disease', 'rheumatoid arthritis', 'multiple sclerosis', 'psoriatic arthritis',
    'psoriasis', 'systemic sclerosis', 'spondyloarthritis', 'systemic lupus', 'vasculitis', 'sarcoidosis',
    'antiphospholipid syndrome', 'Sjögren syndrome',
]

medication_columns = [
    'hydroxychloroquine', 'methotrexate', 'leflunomide teriflunomide', '5-ASA',
    'azathioprine', 'mercaptopurine', 'mitoxantrone', 'mycophenolate',
    'calcineurin inhibitor', 'TNF-α inhibitor', 'fumarates', 'interferons',
    'alkylating agent', 'hydroxyurea', 'dapsone', 'cladribine',
    'IL-1 inhibitor', 'IL-6 inhibitor', 'IL-12/23 inhibitor', 'IL-17 inhibitor',
    'IL-23 inhibitor', 'abatacept', 'anti-BLyS', 'S1P receptor modulator',
    'JAK inhibitor', 'integrin inhibitor', 'PDE4i targeted synthetic', 'anti-CD20',
    'anti-CD52', 'budesonide', 'systemic glucocorticoids', 'monoclonal antibody covid-19',
]

columns = demographic_columns + comorbidity_columns + imid_columns + medication_columns

## Continuous features to be treated as non-normally distributed
nonnormal = ['age', 'BMI']

## Every other column is a categorical variable (same order as in `columns`)
categorical = [column_name for column_name in columns if column_name not in nonnormal]

## Column used for group by
groupby = 'results'

In [0]:
# imids_training_pd_df = imids_training_pd_df[imids_training_pd_df['results'] == 'Positive']
# ## Define the column used for group by
# groupby = None

In [0]:
## Build the TableOne summary from the column definitions above
tableone_options = dict(
    columns=columns,
    categorical=categorical,
    nonnormal=nonnormal,
    groupby=groupby,
    pval=False,
    pval_adjust="fdr_by",
    decimals=1,
)
mytable = TableOne(imids_training_pd_df, **tableone_options)

## Render the table as text. The tablefmt argument allows the table to be displayed in multiple formats, including “github”, “grid”, “fancy_grid”, “rst”, “html”, and “latex”.
github_table_text = mytable.tabulate(tablefmt="github")
print(github_table_text)

In [0]:
## Flatten the TableOne result into a plain frame.
## reset_index() returns a new frame, so mytable.tableone itself is left untouched and this
## cell can be re-run (an in-place reset would add one more index column on every run and
## break the column renaming below).
table_df = mytable.tableone.reset_index()

# Convert entire DataFrame to string
table_df = table_df.applymap(str)

## Rename columns
table_df.columns = ['Feature', 'Values', 'Missing_num', 'Total_num', 'Negative_num', 'Positive_num']

display(table_df)

Feature,Values,Missing_num,Total_num,Negative_num,Positive_num
n,,,1517295,1347302,169993
"age, median [Q1,Q3]",,0.0,"52.0 [32.0,68.0]","53.0 [33.0,69.0]","46.0 [29.0,63.0]"
"age group, n (%)",0-17,0.0,138738 (9.1),123080 (9.1),15658 (9.2)
"age group, n (%)",18-49,,563633 (37.1),486579 (36.1),77054 (45.3)
"age group, n (%)",50-74,,589961 (38.9),531543 (39.5),58418 (34.4)
"age group, n (%)",75+,,224963 (14.8),206100 (15.3),18863 (11.1)
"BMI, median [Q1,Q3]",,0.0,"28.5 [24.2,34.2]","28.5 [24.1,34.0]","29.1 [25.1,35.3]"
"ethnicity, n (%)",Hispanic,0.0,229746 (15.1),185859 (13.8),43887 (25.8)
"ethnicity, n (%)",Not Hispanic,,1239131 (81.7),1118529 (83.0),120602 (70.9)
"ethnicity, n (%)",Unknown,,48418 (3.2),42914 (3.2),5504 (3.2)


In [0]:
## Check that there are no missing values before dropping this column
# table_df = table_df.drop(['Missing_num'], axis=1)

## Leave out the rows whose 'Values' entry is '0', '0.0', 'Female', 'Negative' or 'Positive'
hidden_values = ['0', '0.0', 'Female', 'Negative', 'Positive']
keep_row = ~table_df['Values'].isin(hidden_values)
table_filtered_df = table_df.loc[keep_row]

display(table_filtered_df)

Feature,Values,Missing_num,Total_num,Negative_num,Positive_num
n,,,1517295,1347302,169993
"age, median [Q1,Q3]",,0.0,"52.0 [32.0,68.0]","53.0 [33.0,69.0]","46.0 [29.0,63.0]"
"age group, n (%)",0-17,0.0,138738 (9.1),123080 (9.1),15658 (9.2)
"age group, n (%)",18-49,,563633 (37.1),486579 (36.1),77054 (45.3)
"age group, n (%)",50-74,,589961 (38.9),531543 (39.5),58418 (34.4)
"age group, n (%)",75+,,224963 (14.8),206100 (15.3),18863 (11.1)
"BMI, median [Q1,Q3]",,0.0,"28.5 [24.2,34.2]","28.5 [24.1,34.0]","29.1 [25.1,35.3]"
"ethnicity, n (%)",Hispanic,0.0,229746 (15.1),185859 (13.8),43887 (25.8)
"ethnicity, n (%)",Not Hispanic,,1239131 (81.7),1118529 (83.0),120602 (70.9)
"ethnicity, n (%)",Unknown,,48418 (3.2),42914 (3.2),5504 (3.2)


### Get the patient counts for the composite responses

In [0]:
## Individual response columns
Y_cols = ["hospitalized_after_positive", 'IMV_after_positive', 'death_after_positive']

## Start the response frame from the individual response columns
train_df_Y_new = pd.DataFrame(imids_training_pd_df, columns = Y_cols)

## Per-patient sums of the component responses
hospitalized_flag = imids_training_pd_df['hospitalized_after_positive']
imv_flag = imids_training_pd_df['IMV_after_positive']
death_flag = imids_training_pd_df['death_after_positive']
hospitalized_imv_death_sum = hospitalized_flag + imv_flag + death_flag
imv_death_sum = imv_flag + death_flag

## Composite responses: sums of 1 or more are capped at 1, other values are kept as they are
train_df_Y_new['hospitalized_or_IMV_or_death'] = hospitalized_imv_death_sum.mask(hospitalized_imv_death_sum >= 1, 1)
train_df_Y_new['IMV_or_death'] = imv_death_sum.mask(imv_death_sum >= 1, 1)

In [0]:
## Column totals of the two composite responses
Total_hospitalized_or_IMV_or_death = train_df_Y_new['hospitalized_or_IMV_or_death'].sum()
Total_IMV_or_death = train_df_Y_new['IMV_or_death'].sum()

## Print them, hospitalized-or-IMV-or-death first
print(Total_hospitalized_or_IMV_or_death)
print(Total_IMV_or_death)