### Notice!!! Please use the search function to find every cell that contains the keyword "notice"; those cells hold sanity checks and variables that may need to be changed manually

In [0]:
from pyspark.sql.functions import lower, col, lit, when, unix_timestamp, months_between, expr, countDistinct, count
from datetime import datetime
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, FloatType

In [0]:
#########################################################################################################
## Study window used for the generation of the COVID patients dataset
## Notice: Format: YYYY-MM-DD
## Currently set to: start 2020-03-01, end 2021-12-25
## file_date is the suffix of the sandbox tables read and written in this notebook
############################################################################################################
## check to make sure they are consistent
## Previous dates used:
## 1. "2021-12-31", "20211231"
## 2. "2022-02-18", "20220218"
## 3. end_date, file_date = "2021-12-25", "20211225"

start_date, end_date, file_date = "2020-03-01", "2021-12-25", "20211225_Lancet_bmi"
## register them so they can be directly used in the following SQL statements
## (e.g. ${enddate.var} in the %sql vaccination view)
## Fix: startdate.var previously received end_date, so both variables held the end date
spark.conf.set("startdate.var", start_date)
spark.conf.set("enddate.var", end_date)

## Define the instance number
## NOTE(review): "intsance" is misspelled, but the variable and conf key are kept as-is
## because other notebooks/cells may already reference them
intsance_num = 1000
spark.conf.set("intsance.var", intsance_num)

In [0]:
## Load the COVID cohort table whose name carries the configured file_date suffix
cohort_table = "rdp_phi_sandbox.qw_all_age_covid_patients_{}_wEncounter_wRaceEthnicity_nOneTimeCovidPatient_wFirstPosDate".format(file_date)
all_covid = spark.sql("SELECT * FROM " + cohort_table)

## Expose the loaded cohort to SQL cells under the name "all_covid"
## NOTE(review): the view is registered BEFORE the instance column is added below,
## so the view itself does not carry that added column - confirm this is intended
all_covid.createOrReplaceTempView("all_covid")
# print("Total number of records in df:", all_covid.count())

## Add back instance column, if you already included instance then skip this part
instance_value = lit(1000)
all_covid = all_covid.withColumn('instance', instance_value)

## Print the top 5 rows
# adults_covid.limit(5).toPandas()

## ADD CONDITIONS

In [0]:
%run
"/Users/qi.wei1@providence.org/working_folder/concept_proof_codes/add_feature_functions_test/add_cond_features_at_decided_index_date"

In [0]:
## Flag forwarded to the feature helper (presumably restricts lookups to instance 1000 - confirm in helper)
only_instance_1k = True
## OMOP table version forwarded to the feature helper
omop_table_version = "2022_06_27"

## Comorbidities
## Previously removed "DVT","Myocardial_infarction","Ch_liver_disease","PVD"
comorbidity_risk_factors = [
    'hypertension',
    'diabetes_type1and2',
    'atrial_fibrillation',
    'coronary_artery_disease',
    'heart_failure',
    'chronic_kidney_disease',
    'copd',
    'obesity',
    'chronic_liver_disease',
    'malignant_neoplastic_disease',
]

## Added based on CDC website
cdc_risk_factors = [
    'asthma',
    'HIV',
    'history_transplant',
    'stroke',
    'opioid_dependence',
]

## IMIDs
## Removed list: 'uveitis'
imid_risk_factors = [
    'ibd',
    'rheumatoid_arthritis',
    'multiple_sclerosis',
    'psoriatic_arthritis',
    'psoriasis',
    'systemic_sclerosis',
    'spondyloarthritis',
    'systemic_lupus',
    'vasculitis',
    'sarcoidosis',
    'APS',
    'sjogren_syndrome',
]

## Same order as before: comorbidities, then CDC additions, then IMIDs
list_of_risk_factors = comorbidity_risk_factors + cdc_risk_factors + imid_risk_factors

##, conditions_cc_registry, patches_cc_registry
## The helper comes from the notebook pulled in by the %run cell above
condition_df = add_risk_factors_active_at_decided_index_date(
    all_covid,
    list_of_risk_factors,
    only_instance_1k,
    omop_table_version,
)

In [0]:
### Save table
### Old naming convention, just for record: qw_df_imidproject_16types_nOneTimeCovidPatient_wVax
###

## Remove exact duplicate rows before persisting, to save time and resources
condition_df = condition_df.dropDuplicates()

## Build the target table name once and reuse it for drop / write / read
table_name = "rdp_phi_sandbox.qw_IMID_COVID_trainset_cond_med_{}".format(file_date)
spark.sql("""DROP TABLE IF EXISTS {}""".format(table_name))
(
    condition_df.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable(table_name)
)

## Read the table back; this is the checkpoint the medication section starts from
imids_cohort = spark.sql("SELECT * FROM {}".format(table_name))

## Cast the age column to float
age_as_float = F.col('age').cast(FloatType())
imids_cohort = imids_cohort.withColumn('age', age_as_float)

## cleanup, no longer needed
# imids_cohort = imids_cohort.withColumn('decided_index_date', F.col('covidPositive_first_contact_date'))

In [0]:
# imids_cohort = all_covid.dropDuplicates()

## Add IMID Medications
- can resume from this checkpoint

In [0]:
%run
"/Users/qi.wei1@providence.org/working_folder/concept_proof_codes/add_feature_functions_test/add_meds_features_prior_and_after"

### All XX types of medications
- As confirmed in final design email
- Check this table for details of which medication in which type: 
- https://docs.google.com/document/d/1ABA0DmZObNNUDNJd2TKS0nuY0F8J7zU6GWrPIF2PeHc/edit

In [0]:
## Administration routes forwarded to the medication feature helpers
possible_routes = [
    'Oral',
    'Intramuscular',
    'Intravenous',
    'Subcutaneous Infusion',
    'Subcutaneous',
    'Intravenous (Continuous Infusion)',
    'Rectal',
]

## Number of days prior, forwarded to the prior-usage helper
number_days_prior = 91

## Whether to filter only instance = 1000
## (redefined here so the notebook can resume from this checkpoint)
only_instance_1k = True

## OMOP table version (redefined here for the same reason)
omop_table_version = "2022_06_27"

In [0]:
## 0. Load the medication orders from the qw_med_join_table temp view
##    (presumably registered by the notebook pulled in via %run above - confirm).
##    Fix: rxnormcode used to be listed twice in the SELECT, which produced two
##    identically named columns; it is now selected once.
tmp = spark.sql("""select pat_id, instance, rxnormcode, medication_name, orderclass, ordermode, route, medord_startdate, medord_endate from qw_med_join_table""")
## Rename the join keys so they do not clash with the cohort's pat_id / instance
tmp = tmp.withColumnRenamed('pat_id', 'pat_id2').withColumnRenamed('instance', 'instance2')

### Directly join with the previous condition dataframe
## Only one row per (pat_id, instance, decided_index_date) is kept from the cohort
cond = [imids_cohort.pat_id == tmp.pat_id2, imids_cohort.instance == tmp.instance2]
med_pts_df = imids_cohort.select("pat_id", "instance", "decided_index_date").dropDuplicates().join(tmp,
                       cond, how = "inner").drop(tmp.pat_id2).drop(tmp.instance2)

In [0]:
## Prior-usage medication features.
## Each entry is (med_list, med_name): med_list is forwarded to the helper as the
## list of drug names / drug-class keys, med_name is forwarded as the feature name
## (exact semantics live in the helper notebook pulled in via %run).
## This replaces 31 copy-pasted cells that differed only in these two values;
## the order of the calls is unchanged.
prior_usage_medications = [
    ## hydroxychloroquine: hydroxychloroquine
    (['hydroxychloroquine'], 'hydroxychloroquine'),
    ## methotrexate: methotrexate
    (['methotrexate'], 'methotrexate'),
    ## leflunomide+teriflunomide: leflunomide, teriflunomide
    (['leflunomide', 'teriflunomide'], 'leflunomide_teriflunomide'),
    ## Five_ASA: balsalazide, mesalamine, sulfasalazine
    (['Five_ASA'], '5_ASAa'),
    (['azathioprine'], 'azathioprine'),
    (['mercaptopurine'], 'mercaptopurine'),
    (['mitoxantrone'], 'mitoxantrone'),
    (['mycophenolate'], 'mycophenolate'),
    (['Calcineurin_inhibitor'], 'calcineurin_inhibitor'),
    (['TNF_inhibitor'], 'TNF_alpha_inhibitor'),
    (['fumarates'], 'fumarates'),
    (['interferons'], 'interferons'),
    (['alkylating'], 'alkylating_agent'),
    (['hydroxyurea'], 'hydroxyurea'),
    (['dapsone'], 'dapsone'),
    (['cladribine'], 'cladribine'),
    (['IL1_inhibitor'], 'IL1_inhibitor'),
    (['IL6_inhibitor'], 'IL6_inhibitor'),
    (['IL12_23_inhibitor'], 'IL12_23_inhibitor'),
    (['IL17_inhibitor'], 'IL17_inhibitor'),
    (['IL23_inhibitor'], 'IL23_inhibitor'),
    (['abatacept'], 'abatacept'),
    (['belimumab'], 'anti_BLyS'),
    (['S1P_receptor_modulators'], 'S1P_receptor_modulator'),
    (['JAKi'], 'JAK_inhibitor'),
    (['Integrin_modulator'], 'integrin_inhibitor'),
    (['PDE4i_targeted_synthetic'], 'PDE4i_targeted_synthetic'),
    (['cd20'], 'anti_CD20'),
    (['cd52'], 'anti_CD52'),
    (['budesonide'], 'budesonide'),
    (['steroids'], 'systemic_glucocorticoids'),
]

## med_list / med_name stay module-level names, so after the loop they hold the
## last entry exactly as they did after the last of the original cells.
for med_list, med_name in prior_usage_medications:
    ### Run it using function
    imids_cohort = add_med_prior_usage_with_drugNames_possibleRoutes_medName(
        imids_cohort,
        med_pts_df,
        med_list,
        possible_routes,
        med_name,
        number_days_prior,
        only_instance_1k,
        omop_table_version,
    )
In [0]:
## COVID-19 monoclonal antibody feature, added with the "after usage" helper
## (the other medications above use the "prior usage" helper).
## Routes and day window are kept in dedicated variables so they can be tuned
## independently of the prior-usage settings.
monoclonal_antibody_possible_routes = ['Oral','Intramuscular', 'Intravenous', 'Subcutaneous Infusion', 'Subcutaneous', 'Intravenous (Continuous Infusion)', 'Rectal']
monoclonal_antibody_number_days_after = 10

med_list, med_name = ['monoclonal_antibody_covid_19'], 'monoclonal_antibody_covid_19'
## Fix: pass the dedicated routes list. The generic possible_routes was passed before,
## leaving monoclonal_antibody_possible_routes unused (the two lists are currently
## identical, so results do not change).
imids_cohort = add_med_after_usage_with_drugNames_possibleRoutes_medName(imids_cohort, med_pts_df, med_list, monoclonal_antibody_possible_routes, med_name, monoclonal_antibody_number_days_after, only_instance_1k, omop_table_version)

## add vaccination information

In [0]:
%sql 
-- Rebuild the sandbox view of COVID-19 immunization records used by the vaccination cells below.
-- Each immunization row is joined to its code row on (immunzatn_id, instance).
DROP VIEW IF EXISTS rdp_phi_sandbox.qw_Vax_Immunization;
CREATE VIEW rdp_phi_sandbox.qw_Vax_Immunization AS
SELECT
  immunization.pat_id,
  immunization.instance,
  immunization.immunizationdate,
  immunization.immunzatn_id,
  immunizationcode.cvxcode,
  immunizationstatus
FROM
  rdp_phi.immunization
  INNER JOIN rdp_phi.immunizationcode ON immunization.immunzatn_id=immunizationcode.immunzatn_id AND immunization.instance=immunizationcode.instance
WHERE
  -- CVX codes as named later in this notebook: 207 = Moderna, 208 = Pfizer-BioNTech, 212 = Janssen
  immunizationcode.cvxcode in ('207','208','212')
  -- enddate.var is registered with spark.conf.set in the configuration cell
  AND immunizationdate <= TIMESTAMP('${enddate.var}')
AND 
  immunization.instance = 1000
-- Grouping by every selected column (no aggregates) removes exact duplicate rows
GROUP BY
  immunization.pat_id,
  immunization.instance,
  immunization.immunizationdate,
  immunization.immunzatn_id,
  immunizationcode.cvxcode,
  immunizationstatus

In [0]:
#from pyspark.sql.types import *
#from pyspark.sql.functions import *

import re
import numpy as np
import pandas as pd
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql import Column
from pyspark.sql import DataFrameNaFunctions
from pyspark.sql import functions as F  
from pyspark.sql import GroupedData
from pyspark.sql import Row

from pyspark.sql.functions import col, lower, mean, bround, when, unix_timestamp
from pyspark.sql.window import Window
from matplotlib import pyplot as plt
from pyspark.ml.feature import Imputer

# pd.set_option('max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [0]:
### Read the stored data frame
vax = spark.sql("""SELECT pat_id,instance,immunizationdate,immunzatn_id,cvxcode,immunizationstatus FROM rdp_phi_sandbox.qw_Vax_Immunization""")

### Keep only the vaccines recorded as "Given", one record per patient per immunization date
vax = vax.filter(vax.immunizationstatus == "Given")
vax = vax.dropDuplicates(['pat_id','instance','immunizationdate'])

###################################################################
### Notice: currently the required interval between doses >= 21
### Could be changed based on future research
### Remove the ones with less than 21 days difference from the previous dose
w = Window.partitionBy(['pat_id','instance']).orderBy("immunizationdate")
vax = vax.withColumn("prev_noteddate", F.lag(vax.immunizationdate).over(w))
vax = vax.withColumn('dayssince_lastdose', F.datediff(F.col('immunizationdate'), F.col('prev_noteddate')))
### A null gap marks the patient's first record; any other record must be >= 21 days after the previous one
vax = vax.filter((vax.dayssince_lastdose.isNull()) | (vax.dayssince_lastdose >= 21))

### Count the number of doses (the whole-partition frame gives every row the same per-patient total)
w = Window.partitionBy(['pat_id','instance']).orderBy("immunizationdate").rowsBetween(Window.unboundedPreceding,Window.unboundedFollowing)
vax = vax.withColumn("Administered_dose_count",count(col("immunizationdate")).over(w))

### Find the last dose date and the boosters date
### dose_dates is the patient's list of dose dates over the date-ordered window.
### For cvxcode 212 the first date is used as lastdose_date and the second as LTF_date;
### for the other codes the second and third dates are used.
dose_dates = F.collect_list(F.col("immunizationdate")).over(w)
vax = vax.withColumn("lastdose_date", when(vax.cvxcode == 212, dose_dates[0]).otherwise(dose_dates[1]))
vax = vax.withColumn("LTF_date", when(vax.cvxcode == 212, dose_dates[1]).otherwise(dose_dates[2]))

### Remove the patients who got vaccinated within 21 days of the end date
### Fix: datediff / to_date are not imported by name in this notebook, so they are called through F
### NOTE(review): rows whose lastdose_date is null (presumably patients lacking the required dose) also fail this filter
vax = vax.filter(F.datediff(F.to_date(lit(end_date)), vax.lastdose_date) >= 21)

### Keep one row per patient
### NOTE(review): dropDuplicates keeps an arbitrary row, so for patients with mixed vaccine
### codes the retained cvxcode (and the values derived from it) is not deterministic
vax = vax.dropDuplicates(['pat_id','instance'])

### Drop unneccesary duplicated columns
drop_cols = ['immunizationdate', 'immunizationstatus', 'immunzatn_id']
vax = vax.drop(*drop_cols)

In [0]:
### Classify each patient's vaccination status from the vaccine code and the dose count
cvx_code = col('cvxcode')
dose_count = col('Administered_dose_count')

is_janssen = cvx_code == "212"
is_moderna_or_pfizer = (cvx_code == "207") | (cvx_code == "208")

### "Fully": exactly 1 dose for code 212, exactly 2 doses for codes 207 / 208
fully_vaccinated = (is_janssen & (dose_count == 1)) | (is_moderna_or_pfizer & (dose_count == 2))
### "Booster": more doses than the "Fully" count for the same code
boosted = (is_janssen & (dose_count > 1)) | (is_moderna_or_pfizer & (dose_count > 2))

vax = vax.withColumn(
    "Vaccination_status",
    when(fully_vaccinated, "Fully").when(boosted, "Booster").otherwise("NotFully"),
)

### Name the vaccines
cvx_labels = [
    ("212", "Janssen COVID-19 Vaccine"),
    ("207", "Moderna COVID-19 Vaccine"),
    ("208", "Pfizer-BioNTech COVID-19 Vaccine"),
]
cvx_name_expr = None
for code_value, label in cvx_labels:
    matches_code = cvx_code == code_value
    if cvx_name_expr is None:
        cvx_name_expr = when(matches_code, label)
    else:
        cvx_name_expr = cvx_name_expr.when(matches_code, label)
vax = vax.withColumn("CVX_name", cvx_name_expr.otherwise("Unknown"))

### Keep only the patients classified as "Fully" or "Booster"
vax = vax.filter(col("Vaccination_status").isin("Fully", "Booster"))

### Expose the result to SQL as qw_vaccinated_population
vax.createOrReplaceTempView('qw_vaccinated_population')

In [0]:
## Load the COVID test table for the configured file_date
covid_tests_query = """SELECT * FROM rdp_phi_sandbox.qw_{}_covid_tests""".format(file_date)
covid_tests_df = spark.sql(covid_tests_query)

## Keep the rows whose results value is "positive" and expose them to SQL as qw_Pos_Covid
Pos_Covid_df = covid_tests_df.where(col("results") == "positive")
Pos_Covid_df.createOrReplaceTempView('qw_Pos_Covid')

In [0]:
## Vaccinated patients whose covidPositive_first_contact_date is earlier than their lastdose_date.
## Joins the qw_vaccinated_population and qw_Pos_Covid temp views on (pat_id, instance).
## The GROUP BY has no aggregates, so it only collapses duplicate rows.
## patient_id is the concatenation of instance and pat_id.
## NOTE(review): results / covidPositive_first_contact_date are unqualified - presumably they come from qw_Pos_Covid; confirm.
Previous_Pos = spark.sql("""
SELECT 
  CONCAT(qw_vaccinated_population.instance, qw_vaccinated_population.pat_id) as patient_id, 
  qw_vaccinated_population.pat_id,
  qw_vaccinated_population.instance,
  results,
  lastdose_date,
  covidPositive_first_contact_date
FROM 
  (qw_vaccinated_population
INNER JOIN 
  qw_Pos_Covid
ON qw_vaccinated_population.pat_id = qw_Pos_Covid.pat_id AND  qw_vaccinated_population.instance = qw_Pos_Covid.instance)
WHERE 
  (covidPositive_first_contact_date < lastdose_date)
GROUP BY 
  qw_vaccinated_population.pat_id,
  qw_vaccinated_population.instance,
  results,
  lastdose_date,
  covidPositive_first_contact_date""")

In [0]:
## Anti-join: keep the vaccinated patients that have NO row in Previous_Pos
patient_keys = ["pat_id", 'instance']
vax_pos_afterLastDose = vax.join(Previous_Pos, on=patient_keys, how='leftanti')

In [0]:
## Use left join to keep only those patients we found with disease + condition info;
## the vaccine columns are null for cohort patients without a matching vaccine row.
## Fix: join on the full patient key (pat_id, instance), as the other joins in this
## notebook do. Joining on pat_id alone left two columns named "instance" in the result.
## A name-based join already yields a single copy of each key column, so the former
## .drop(vax_pos_afterLastDose.pat_id) is no longer needed.
IMID_cond_vax_df = imids_cohort.join(vax_pos_afterLastDose, on = ["pat_id", "instance"], how = "left")
# IMID_cond_vax_df = IMID_cond_vax_df.drop(col("instance"))

In [0]:
## Columns removed from the final training set before it is saved
cols_to_drop = [
    'instance',
    'who_score_max',
    'days_to_max_who_score',
    'days_hospitalized',
    'encounter_duration',
    'encountertype',
    'visittype',
    'contact_date',
    'contactdate_before_covid19date',
    'admissiondatetime',
    'first_contact_date',
    'covidPositive_first_contact_date',
    'cvxcode',
    'prev_noteddate',
    'dayssince_lastdose',
    'Administered_dose_count',
    'lastdose_date',
    'LTF_date',
]

IMID_cond_vax_df = IMID_cond_vax_df.drop(*cols_to_drop)

In [0]:
# ### Save table
# ### Old naming convention, just for record: qw_df_imidproject_16types_nOneTimeCovidPatient_wVax
# ###

## Remove exact duplicate rows before persisting, to save time and resources
IMID_cond_vax_df = IMID_cond_vax_df.dropDuplicates()

## Build the target table name once and reuse it for the drop and the write
table_name = "rdp_phi_sandbox.qw_IMID_COVID_trainset_cond_med_vax_{}".format(file_date)
spark.sql("""DROP TABLE IF EXISTS {}""".format(table_name))
(
    IMID_cond_vax_df.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable(table_name)
)

## Read the table
# cond_med_vax_geo_df = spark.sql("SELECT * FROM rdp_phi_sandbox.qw_IMID_COVID_trainset_cond_med_vax_R6_{}".format(file_date))

## The following is not used in the current COVID-IMID project

### Add Geo-address features

##### 7 geo features available
* 'ruca2010revised'
* 'countytypologycodes2015'
* 'farcodeszip2010'
* 'ruralurbancontinuumcodes2013'
* 'urbaninfluencecodes2013'
* 'svi2018_us'
* 'svi2018_us_county'

### newer version from Yeon Mi

#### original files located
* https://drive.google.com/drive/u/0/folders/1DyNiZ2dvQ0Fg4NxGdWeW0rLfYP_8Hl-t
#### documentation located
* https://docs.google.com/document/d/12G3Cs_OM1xuGRgejoOAMzEQEpTEssIHbKUwxm4ej2l8/edit
#### rural and urban data
* https://www.ers.usda.gov/data-products/
#### SVI documentation
* https://svi.cdc.gov/Documents/Data/2018_SVI_Data/SVI2018Documentation.pdf

In [0]:
# "/Users/jennifer.hadlock2@providence.org/Jenn - Hadlock Lab Shared/CEDA Tools v1.0/load_ceda_etl_tools"

In [0]:
# def add_geo_features(cohort_df, geo_df_name, join_cols = ['pat_id', 'instance']):
#   geodf_list = ['ruca2010revised', 'countytypologycodes2015', 'farcodeszip2010', 'ruralurbancontinuumcodes2013', 'urbaninfluencecodes2013', 'svi2018_us', 'svi2018_us_county']
#   master_patient = spark.sql("SELECT * FROM rdp_phi.dim_patient_master").select('pat_id','instance', 'PATIENT_STATE_CD', 'PATIENT_ADDR_CENSUS_BLOCKGROUP_DERIVED', 'ZIP')
  
#   if geo_df_name not in geodf_list:
#     print ('incorrect geo df name')
#   else:
#     geo_df = spark.sql("SELECT * from rdp_phi_sandbox.{0}".format(geo_df_name))
#     if geo_df_name == 'ruca2010revised':
#       geo_df = geo_df.withColumn('FIPS', F.col('State_County_Tract_FIPS_Code').cast(StringType())).drop('State_County_Tract_FIPS_Code')
#       master_patient = master_patient.withColumn("FIPS", F.expr("CASE WHEN PATIENT_STATE_CD = 'CA' THEN substring(PATIENT_ADDR_CENSUS_BLOCKGROUP_DERIVED, 2, length(PATIENT_ADDR_CENSUS_BLOCKGROUP_DERIVED)-2) ELSE substring(PATIENT_ADDR_CENSUS_BLOCKGROUP_DERIVED, 0, length(PATIENT_ADDR_CENSUS_BLOCKGROUP_DERIVED)-1) END"))
#       joined_df = master_patient.join(geo_df, 'FIPS', 'inner')
#     elif geo_df_name == 'svi2018_us':
#       ###################################################################
#       ## Only take the summary theme ranking variables, detailed below:
#       ## • Socioeconomic - RPL_THEME1
#       ## • Household Composition & Disability - RPL_THEME2
#       ## • Minority Status & Language - RPL_THEME3
#       ## • Housing Type & Transportation - RPL_THEME4
#       ## The overall tract summary ranking variable is RPL_THEMES
#       ## Notice!!! Should be left join!!
#       ###################################################################
#       geo_df = geo_df.select('FIPS', 'RPL_THEME1', 'RPL_THEME2', 'RPL_THEME3', 'RPL_THEME4', 'RPL_THEMES')
#       master_patient = master_patient.withColumn("FIPS", F.expr("substring(PATIENT_ADDR_CENSUS_BLOCKGROUP_DERIVED, 0, length(PATIENT_ADDR_CENSUS_BLOCKGROUP_DERIVED)-1)"))
#       joined_df = master_patient.join(geo_df, 'FIPS', 'inner')
      
#     elif ((geo_df_name == 'countytypologycodes2015')|(geo_df_name == 'urbaninfluencecodes2013')):
#       #######################################################################
#       ## Works for countytypologycodes2015
#       ## Only take the Metro-nonmetro status feature collected in 2013 
#       ## 0=Nonmetro 1=Metro
#       ########################################################################
#       geo_df = geo_df.select('FIPStxt', 'Metro_nonmetrostatus2013', 'Low_Education_2015_Update', 'Low_Employment_Cnty_2008_2012_25_64')
#       geo_df = geo_df.withColumn('FIPS4', F.col('FIPStxt').cast(StringType())).drop('FIPStxt')
#       master_patient = master_patient.withColumn("FIPS4", F.expr("CASE WHEN PATIENT_STATE_CD = 'CA' THEN substring(PATIENT_ADDR_CENSUS_BLOCKGROUP_DERIVED, 2, 4) ELSE substring(PATIENT_ADDR_CENSUS_BLOCKGROUP_DERIVED, 0, 5) END"))
#       joined_df = master_patient.join(geo_df, 'FIPS4', 'inner')
      
#     elif ((geo_df_name == 'svi2018_us_county')|(geo_df_name == 'ruralurbancontinuumcodes2013')):
#       geo_df = geo_df.withColumn('FIPS5', F.col('FIPS').cast(StringType()))
#       master_patient = master_patient.withColumn("FIPS5", F.expr("substring(PATIENT_ADDR_CENSUS_BLOCKGROUP_DERIVED, 0, 5)"))
#       joined_df = master_patient.join(geo_df, 'FIPS5', 'inner')    
#     elif geo_df_name == 'farcodeszip2010':
#       geo_df = geo_df.withColumn('ZIP5', F.col('ZIP').cast(StringType())).drop('ZIP')
#       master_patient = master_patient.withColumn("ZIP5", F.expr("substring(ZIP, 0, 5)")).drop('ZIP')
#       joined_df = master_patient.join(geo_df, 'ZIP5', 'inner')
#     return_df = cohort_df.join(joined_df, join_cols, 'left')
#   return return_df

In [0]:
# ## Load the imported geo-address features table
# geo_df_temp = spark.sql("SELECT * from rdp_phi_sandbox.svi2018_us")

# ## make sure can return to this point
# # cond_med_vax_geo_df = IMID_cond_vax_df.select("*")

In [0]:
# ## load cohort dataframe: Here should be the df3
# ## run function 
# ## Notice: dont filter out the null rows after the merge, otherwise too many records will be dropped
# ## Here is the filter code from Yeon Mi, .filter(F.col("FIPS").isNotNull())

# cond_med_vax_geo_df = add_geo_features(cond_med_vax_geo_df, 'svi2018_us')

# ## Loop and fix those negative numbers in SVI
# SVI_score = ['RPL_THEMES', #overall tract summary ranking variable 
#              'RPL_THEME1', #socioeconomic ranking variable 
#              'RPL_THEME2', #household composition and disability 
#              'RPL_THEME3', #minority status and language 
#              'RPL_THEME4']  #housing type and transportation 

# for svi in SVI_score:
#   cond_med_vax_geo_df = cond_med_vax_geo_df.withColumn(svi, F.col(svi).cast(FloatType())) \
#                          .withColumn(svi, F.when(F.col(svi)<0, None).otherwise(F.col(svi)))

# ## Drop not needed cols
# cond_med_vax_geo_df = cond_med_vax_geo_df.drop('FIPS','PATIENT_STATE_CD', 'PATIENT_ADDR_CENSUS_BLOCKGROUP_DERIVED', 'ZIP')
# # df4.limit(10).toPandas()

In [0]:
# # ## load cohort dataframe: Here should be the df3
# # ## run function
# ## Notice: dont filter out the null rows after the merge, otherwise too many records will be dropped
# ## Here is the filter code from Yeon Mi, .filter(F.col("FIPS4").isNotNull())
# cond_med_vax_geo_df = add_geo_features(cond_med_vax_geo_df, 'countytypologycodes2015')

# ## Drop not needed cols
# cond_med_vax_geo_df = cond_med_vax_geo_df.drop('instance', 'FIPS4','PATIENT_STATE_CD', 'PATIENT_ADDR_CENSUS_BLOCKGROUP_DERIVED', 'ZIP')
# # # df5.limit(5).toPandas()

In [0]:
# ## Convert those added features into numeric ones
# # cond_med_vax_geo_df = cond_med_vax_geo_df.withColumn('RPL_THEME1', cond_med_vax_geo_df['RPL_THEME1'].cast( FloatType() ) )
# # cond_med_vax_geo_df = cond_med_vax_geo_df.withColumn('RPL_THEME2', cond_med_vax_geo_df['RPL_THEME2'].cast( FloatType() ) )
# # cond_med_vax_geo_df = cond_med_vax_geo_df.withColumn('RPL_THEME3', cond_med_vax_geo_df['RPL_THEME3'].cast( FloatType() ) )
# # cond_med_vax_geo_df = cond_med_vax_geo_df.withColumn('RPL_THEME4', cond_med_vax_geo_df['RPL_THEME4'].cast( FloatType() ) )
# # cond_med_vax_geo_df = cond_med_vax_geo_df.withColumn('RPL_THEMES', cond_med_vax_geo_df['RPL_THEMES'].cast( FloatType() ) )

# cond_med_vax_geo_df = cond_med_vax_geo_df.withColumn('Metro_nonmetrostatus2013', cond_med_vax_geo_df['Metro_nonmetrostatus2013'].cast( IntegerType() ) )
# cond_med_vax_geo_df = cond_med_vax_geo_df.withColumn('Low_Education_2015_Update', cond_med_vax_geo_df['Low_Education_2015_Update'].cast( IntegerType() ) )
# cond_med_vax_geo_df = cond_med_vax_geo_df.withColumn('Low_Employment_Cnty_2008_2012_25_64', cond_med_vax_geo_df['Low_Employment_Cnty_2008_2012_25_64'].cast( IntegerType() ) )

# # Rename those feature names into a more interpretable version
# cond_med_vax_geo_df = cond_med_vax_geo_df.withColumnRenamed('RPL_THEME1', 'SVI_Socioeconomic').withColumnRenamed('RPL_THEME2', 'SVI_Household_Composition_Disability')\
#                                         .withColumnRenamed('RPL_THEME3', 'SVI_Minority_Status_Language').withColumnRenamed('RPL_THEME4', 'SVI_Housing_Type_Transportation')\
#                                         .withColumnRenamed('RPL_THEMES', 'SVI')\
#                                         .withColumnRenamed('Metro_nonmetrostatus2013', 'Metro_area').withColumnRenamed('Low_Education_2015_Update', 'Low_education')\
#                                         .withColumnRenamed('Low_Employment_Cnty_2008_2012_25_64', 'Low_employment')

# # cond_med_vax_geo_df = cond_med_vax_geo_df.withColumnRenamed('RPL_THEME1', 'SVI_Socioeconomic').withColumnRenamed('RPL_THEME2', 'SVI_Household_Composition_Disability')\
# #                                         .withColumnRenamed('RPL_THEME3', 'SVI_Minority_Status_Language').withColumnRenamed('RPL_THEME4', 'SVI_Housing_Type_Transportation')\
# #                                         .withColumnRenamed('RPL_THEMES', 'SVI')

In [0]:
### Print out top 20 rows or print out list of columns
# IMID_cond_vax_df.limit(20).toPandas()

# cond_med_vax_geo_df.columns

In [0]:
# cols_to_drop = ['instance', 'who_score_max', 'days_to_max_who_score', 'days_hospitalized', 'encounter_duration',
#  'encountertype',
#  'visittype',
#  'contact_date',
#  'contactdate_before_covid19date',
#  'admissiondatetime',
#  'first_contact_date',
#  'covidPositive_first_contact_date','cvxcode',
#  'prev_noteddate',
#  'dayssince_lastdose',
#  'Administered_dose_count',
#  'lastdose_date',
#  'LTF_date']

# cond_med_vax_geo_df = cond_med_vax_geo_df.drop(*cols_to_drop)

In [0]:
# ## Save table
# ## Old naming convention, just for record: qw_df_imidproject_16types_nOneTimeCovidPatient_wVax
# ##

# ## Drop duplicates before saving to save more time and resources
# cond_med_vax_geo_df = cond_med_vax_geo_df.dropDuplicates()

# spark.sql("""DROP TABLE IF EXISTS rdp_phi_sandbox.qw_IMID_COVID_trainset_vax_cond_med_geo_r5_{}""".format(file_date))
# table_name = "rdp_phi_sandbox.qw_IMID_COVID_trainset_vax_cond_med_geo_r5_{}".format(file_date)
# cond_med_vax_geo_df.write.mode("overwrite").format("delta").saveAsTable(table_name)

## Task completed, continue to step 5
### step 5.1: Logistic regression model
### step 5.2: XGBoost model
- /Users/jennifer.hadlock2@providence.org/GreenerGrass/Qi/Transfer-Qi/COVID-IBD-IMIDs-paper/EULAR_Poster_Abstract_submitted/Step5.1_ALL_IMID_Logistic_regression_analysis
- /Users/jennifer.hadlock2@providence.org/GreenerGrass/Qi/Transfer-Qi/COVID-IBD-IMIDs-paper/EULAR_Poster_Abstract_submitted/Step5.2_ALL_IMID_XGB_analysis

In [0]:
######################################################
# End of this step; code placed after this cell is for additional analysis
######################################################
completion_message = "Task completed, continue to step 5"
print(completion_message)