### Notice!!! Please use the search function to find every cell containing the keyword "notice"; those cells hold sanity checks and variables that may need to be changed manually.

In [0]:
###################################
## Import all necessary Packages
##################################
import re
import numpy as np
import pandas as pd
import os
import sys
from pyspark.sql import SparkSession, Column, DataFrameNaFunctions, DataFrameStatFunctions, GroupedData, Row
from pyspark.sql import functions as F  
from pyspark.sql.functions import concat, col, lower, mean, bround, when, unix_timestamp, split
from pyspark.sql.types import *
from pyspark.sql.window import Window
from matplotlib import pyplot as plt
from pyspark.ml.feature import Imputer

## Pandas display options: list every row and never truncate wide cell values
# pd.set_option('max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

#########################################################################################################
## Cohort window and output naming for the generation of the COVID patients dataset
## Notice: dates use the format YYYY-MM-DD
## Currently set to 2020-03-01 through 2021-12-25
## Keep end_date and file_date consistent with each other. file_date values used previously:
## 1. 2021-12-31, 20211231
## 2. 2022-01-14, 20220114
#########################################################################################################
start_date = "2020-03-01"
end_date = "2021-12-25"
file_date = "20211225_Lancet_bmi"
who_score_table_version = "2022_08_30"

## Publish the window bounds as Spark conf values; the SQL cells read them back as
## ${startdate.var} and ${enddate.var}
spark.conf.set("startdate.var", start_date)
spark.conf.set("enddate.var", end_date)

## Instance number (the variable name and the conf key keep their existing spelling)
intsance_num = 1000
spark.conf.set("intsance.var", intsance_num)

In [0]:
#####################################
## Function to get cleaned string
## Input: String
## Output: String
####################################
def clean_string(input_string):
  """Normalize a free-text string so it can be matched against lookup lists

  Parameters:
  input_string (str): An arbitrary input string, or None

  Returns:
  str: Lower-cased copy of the input with the characters . * : - ( ) _ deleted,
       leading/trailing whitespace stripped and runs of spaces collapsed to a
       single space; '' when the input is None

  """
  if input_string is None:
    return ''

  ## Delete every unwanted punctuation character in a single pass
  without_punctuation = input_string.translate(str.maketrans('', '', '.*:-()_'))

  ## Trim and lowercase first, then squeeze repeated spaces inside the string
  normalized = without_punctuation.strip().lower()
  return re.sub(' +', ' ', normalized)

## Create a UDF
## Column-API wrapper around clean_string, declared to return a string
clean_string_udf = F.udf(clean_string, StringType())

## Register UDFs so they can be used in SQL statements
## The SQL cells below call it as clean_string(lr.resultvalue)
spark.udf.register("clean_string", clean_string)

###########################################################################################################################
## Function to map result value strings for COVID-19 tests to 'positive', 'negative', 'see report or comment' or 'unknown'
## Input: resultvalue column
## Output: mapped positiv or negative results
## Notice: Please check the positive_strings, negative_strings, report_comment_strings, to include all new edge cases
##
###########################################################################################################################
def covid19_resultvalue_map(resultvalue):
  """Map a raw COVID-19 lab result string to a short result label

  The raw value is normalized with clean_string (lower-cased, punctuation
  removed, spaces collapsed) and then looked up in the lists below, so every
  list entry must be written in that normalized, lower-case form.

  Parameters:
  resultvalue (str): Raw result value from the lab result table, or None

  Returns:
  str: 'positive', 'negative', 'see report or comment' or 'unknown'

  """
  positive_strings = ['positive', 'detected', 'presumptive pos', 'detected see scanned report', 'detcted', 'presumptive positive', 'postive', 'positve','inst positive','presumptive positive see scanned report','added detected', 'detected h', 'detect', 'detected p', 'pos 2019 ncov', 'postiive', 'pos', 'dectected', 'detectd', 'dtected', 'deteccted', 'detectred', 'dected', 'posative', 'ddetected', 'covid 19', 'deteceted', 'detecte', 'detectedd']
  
  negative_strings = ['none detected', 'undetected', 'not dectected', 'negative', 'not detected', 'not deteced', 'not detected see scanned result', 'not detectd', 'not detected see scanned report', 'negatiev', 'not detected see media', 'non detected', 'not dtected', 'not detected see scanned results', 'not detecte', 'negtive', 'not detected see scanned result', 'presumptive negative', 'negatvie', 'not detecteed',
                     'prsmptve neg covid','normal', 'not detect','not detected see', 'neg', 'negitive', 'revised not detected', 'not detected see scaneed report', 'added none detected', 'revised none detected', 'not detectedd', 'not dected', 'not deteccted', 'not deteceted', 'no detected', 'inst negative', 'notdetected', 'not detected test performed at sacred heart', 'noy detected', 'no detected', 'negatiive', 
                      'negaqtive', 'not reported', 'not detectred', 'neagtive', 'neggative', 'negaive', 'not ddetected', 'ng', 'nf', 'ngeative']
  
  ## 'see resp pcr' used to be spelled with a capital S, which could never match
  ## because the cleaned result value is always lower-case
  report_comment_strings = ['see scanned report', 'comment', 'see scanned result', 'abnormal see scanned report', 'see report', 'see scanned results', 'see comments', 'see scanned report covid19', 'see comment', 'please see scanned report', 'see note', 'separate report to follow', 'see attached report', 'refer to separate reference lab report for results', 'other see scanned report', 'refer to separate reference lab report for results', 'see resp pcr']
  
  ## Clean the resultvalue string (None becomes '', which falls through to 'unknown')
  resultvalue_cleaned = clean_string(resultvalue)
  
  ## Return mapped string; the lists are checked in order positive, negative, report/comment
  if resultvalue_cleaned in positive_strings:
    return 'positive'
  if resultvalue_cleaned in negative_strings:
    return 'negative'
  if resultvalue_cleaned in report_comment_strings:
    return 'see report or comment'
  return 'unknown'

## Register UDFs so they can be used in SQL statements
## The SQL cells below call it as covid19_resultvalue_map(lr.resultvalue)
spark.udf.register("covid19_resultvalue_map", covid19_resultvalue_map)

#################################################################################
## Function to assign result value strings for COVID-19 tests into categories
## Input: list
## Output: list
##############################################################################
def test_results_category(result_list):
  # Categorize patient based on one or more tests
  if 'positive' in result_list:
    return '>= 1 positive'
  elif ('negative' in result_list) and ('positive' not in result_list):
    return '>= 1 negative (no positive)'
  else:
    return 'inconclusive or unknown'

## Register UDFs so they can be used in SQL statements
## The aggregate query below calls it as test_results_category(collect_list(result_short))
spark.udf.register("test_results_category", test_results_category)

In [0]:
## One row per COVID-19 lab result: encounters joined to their NAAT/PCR procedure orders and to
## the lab results of those orders, restricted to non-null results of encounters whose contact
## date lies between ${startdate.var} and ${enddate.var}.
## The contact date is aliased explicitly as contact_date because the later queries on the
## covid_tests view read that column by name; previously the name depended on how Spark
## auto-names an un-aliased date(...) expression.
covid_tests_df = spark.sql(
"""
SELECT
  e.pat_id,
  e.instance,
  e.pat_enc_csn_id,
  date(e.contact_date) AS contact_date,
  e.admissiondatetime,
  e.encountertype,
  e.patientclass,
  e.visittype,
  po.orderingdatetime,
  lr.observationdatetime,
  lr.basename,
  lr.commonname,  
  lr.resultname,
  lr.resultvalue,
  clean_string(lr.resultvalue) as result_cleaned,
  covid19_resultvalue_map(lr.resultvalue) as result_short,
  lr.flaggedas
FROM rdp_phi.encounter as e
JOIN (
  -- Join procedure orders for NAAT and PCR tests
  SELECT * FROM rdp_phi.procedureorders
  WHERE
    ordername LIKE '%NAAT%' OR
    ordername LIKE '%PCR%') as po
ON
  e.instance = po.instance AND
  e.pat_enc_csn_id = po.pat_enc_csn_id
JOIN (
  -- Join lab results to get results of COVID-19 tests
  SELECT * FROM rdp_phi.labresult
  WHERE commonname in ('SARS CORONAVIRUS 2 RNA ORD', 'POC SARS CORONAVIRUS 2 RNA ORD', 'SARS-COV-2 (COVID-19) QUAL PCR RESULT', 'COV19EX','NAA (COVID-19) (REF)')
or resultname in ('SARS CORONAVIRUS 2 RNA ORD', 'POC SARS CORONAVIRUS 2 RNA ORD', 'SARS-COV-2 (COVID-19) QUAL PCR RESULT', 'COV19EX','NAA (COVID-19) (REF)') ) as lr
ON
  po.instance = lr.instance AND
  po.order_proc_id = lr.order_proc_id
WHERE resultvalue IS NOT NULL
and e.CONTACT_DATE >= TIMESTAMP('${startdate.var}')
and e.CONTACT_DATE <= TIMESTAMP('${enddate.var}')
""")

## Create a table that can be queried using SQL-style syntax
covid_tests_df.createOrReplaceTempView('covid_tests')

## Print total number of records
# print('Total records: %s' % covid_tests_df.count())

## Use only for checking on the data
## covid_tests_df.filter("patientstatus == 'Hospital Inpatient Visit'").limit(100).toPandas()

In [0]:
## For every (instance, pat_id) look up the pat_enc_csn_id of the earliest-dated row in the
## covid_tests view (FIRST_VALUE ordered by contact_date over the whole partition).
## NOTE(review): the GROUP BY also contains encountertype and patientclass, which come from the
## individual test rows, so a patient whose tests span several encounter types / patient classes
## produces several rows here and would multiply rows in a join on (instance, pat_id) -- confirm
## this is intended.
## NOTE(review): rows sharing the same contact_date are not ordered further, so which encounter
## counts as "first" on a tie is presumably arbitrary -- verify.
first_encounter_df = spark.sql(
"""
SELECT
  pat_id,
  instance,
  first_pat_enc_csn_id,
  encountertype,
  patientclass
FROM (
  SELECT *,
    FIRST_VALUE(pat_enc_csn_id) OVER (
      PARTITION BY instance, pat_id
      ORDER BY contact_date
      RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as first_pat_enc_csn_id
  FROM covid_tests)
GROUP BY instance, pat_id, first_pat_enc_csn_id, encountertype, patientclass
ORDER BY pat_id, instance
""")

## Create a table that can be queried using SQL-style syntax
first_encounter_df.createOrReplaceTempView('first_encounter')


## Print total number of records
# print('Total records: %s' % first_encounter_df.count())

## Use only for checking on the data
## first_encounter_df.orderBy('pat_id', 'instance').limit(5).toPandas()

In [0]:
## Aggregate the covid_tests view to one row per (instance, pat_id) in the `agg` CTE:
##  - per-test values (encounter ids, observation times, mapped results, encounter types,
##    patient classes, flags) are concatenated into ';'-separated strings
##  - first/last contact date and first/last admission date
##  - first contact date with a positive result and first contact date with a negative result
##  - results_category from the test_results_category UDF over all mapped results
##  - num_tests = number of test rows
## The outer query then left-joins the first_encounter view to attach first_pat_enc_csn_id.
## NOTE(review): the ORDER BY inside the sub-query is presumably meant to make the collect_list
## output chronological; whether that ordering survives the GROUP BY should be verified.
patient_instance_aggregate_df = spark.sql(
"""
WITH agg as (
SELECT
  pat_id,
  instance,
  concat_ws(";", collect_list(pat_enc_csn_id)) as pat_enc_csn_id,
  MIN(contact_date) as first_contact_date,
  MIN(date_of_positive_test) as covidPositive_first_contact_date,
  MIN(date_of_negative_test) as covidNegative_first_contact_date,
  DATE(MIN(admissiondatetime)) as first_admit_date,
  MAX(contact_date) as last_contact_date,
  DATE(MAX(admissiondatetime)) as last_admit_date,
  concat_ws(";", collect_list(observationdatetime)) as observationdatetime,
  concat_ws(";", collect_list(result_short)) as result_short,
  concat_ws(";", collect_list(encountertype)) as encountertype,
  concat_ws(";", collect_list(patientclass)) as patientclass,
  test_results_category(collect_list(result_short)) as results_category,
  concat_ws(";", collect_list(flaggedas)) as flaggedas,
  COUNT(*) as num_tests
FROM (
  SELECT *,
    (CASE WHEN result_short = 'positive' THEN contact_date ELSE NULL END) as date_of_positive_test,
    (CASE WHEN result_short = 'negative' THEN contact_date ELSE NULL END) as date_of_negative_test
  FROM covid_tests
  ORDER BY instance, pat_id, contact_date)
GROUP BY instance, pat_id)

SELECT
  agg.pat_id,
  agg.instance,
  agg.pat_enc_csn_id,
  fe.first_pat_enc_csn_id,
  agg.first_contact_date,
  agg.covidPositive_first_contact_date,
  agg.covidNegative_first_contact_date,
  agg.first_admit_date,
  agg.last_contact_date,
  agg.last_admit_date,
  agg.observationdatetime,
  agg.result_short,
  agg.results_category,
  agg.encountertype,
  agg.patientclass,
  agg.flaggedas,
  agg.num_tests
FROM agg
LEFT JOIN first_encounter as fe
ON
  agg.instance = fe.instance AND
  agg.pat_id = fe.pat_id
""")
## Create a table that can be queried using SQL-style syntax
patient_instance_aggregate_df.createOrReplaceTempView('patient_instance_aggregate')

## Print total number of records
# print('Total records: %s' % patient_instance_aggregate_df.count())

# patient_instance_aggregate_df.limit(5).toPandas()
## Notice: DEFINE PositiveCOVID_contact_dt: The contact date of the encounter(s) when they got their first COVID-19 test positive

In [0]:
## Rebind covid_tests_df to the patient-level view: keep only the columns needed downstream
## and collapse identical rows in the same expression
covid_tests_df = spark.sql(
"""\
SELECT
pat_id, 
instance, 
first_contact_date, 
covidPositive_first_contact_date,
covidNegative_first_contact_date,
encountertype,
patientclass,
results_category
FROM patient_instance_aggregate
""").dropDuplicates()

## The 'covid_tests' temp view is deliberately NOT replaced with this patient-level frame
# covid_tests_df.createOrReplaceTempView('covid_tests')

## Print total number of records
# print('Total records: %s' % covid_tests_df.count())

## print the first 20 records
# covid_tests_df.limit(20).toPandas()

In [0]:
## Translate results_category into a short "results" label, then drop the source column
from pyspark.sql.functions import col, when

## The two category strings are mutually exclusive, so the order of the branches is irrelevant
results_label = when(col("results_category") == ">= 1 positive", "Positive") \
  .when(col("results_category") == ">= 1 negative (no positive)", "Negative") \
  .otherwise("Unknown")
covid_tests_df = covid_tests_df.withColumn("results", results_label)

# covid_tests_df.groupBy("results").count().show(truncate=False)
# covid_tests_df.groupBy("results_category").count().show(truncate=False)

covid_tests_df = covid_tests_df.drop("results_category")

In [0]:
## Persist the patient-level covid test results, replacing any table left by an earlier run
save_table_name = 'rdp_phi_sandbox.qw_{}_covid_tests'.format(file_date)
spark.sql("DROP TABLE IF EXISTS {}".format(save_table_name))
covid_tests_df.write.saveAsTable(save_table_name)

## Merge covid tests results table and who scores table

In [0]:
##########################################################################
# Name of covid test file depends on the previous end date selected
################################################################
## Table written by the save cell above; its name is derived from file_date
read_table_name = 'rdp_phi_sandbox.qw_{}_covid_tests'.format(file_date)

## keep both SARS-CoV-2 Positive tests and negative tests
covid_tests_df = spark.sql("""SELECT * FROM {}""".format(read_table_name))

## merge pat_id + instance
## patient_id is the concatenation instance followed by pat_id.
## NOTE(review): duplicates are dropped on pat_id alone (instance is not part of the key) and
## dropDuplicates keeps one row per key without a defined order -- confirm pat_id is unique
## across instances, otherwise rows of other instances are discarded here.
covid_tests_df = covid_tests_df.dropDuplicates(["pat_id"]) \
  .withColumn('patient_id', F.concat(F.col('instance'), F.col('pat_id')))

In [0]:
#######################################################################################
## Notice: Ask Jenn or Yeon Mi to confirm whether should use the 2008 or 2016 version
## follow the ceda pipeline to upgrade those tables
#######################################################################################
## Load the WHO scores table whose version suffix is set by who_score_table_version
pat_who_scores_df = spark.sql("""SELECT * FROM rdp_phi_sandbox.hadlock_who_scores_{}""".format(who_score_table_version))

In [0]:
## Keep only patients present in both the WHO scores and the covid tests frames
covid_wos = pat_who_scores_df \
  .join(covid_tests_df, on=["patient_id"], how="inner") \
  .drop(covid_tests_df.patient_id)

## save the merged COVID tests and WHO scores table, replacing any earlier copy
covid_wos_table_name = "rdp_phi_sandbox.qw_{}_covidtests_wos2".format(file_date)
spark.sql("DROP TABLE IF EXISTS {}".format(covid_wos_table_name))
covid_wos.write.saveAsTable(covid_wos_table_name)

In [0]:
## Per-patient first positive / first negative contact dates; merged back onto the WHO score
## summary further down
covidPositive_first_contact_date_df = covid_wos.select("patient_id", "covidPositive_first_contact_date", "covidNegative_first_contact_date").dropDuplicates()
# covidPositive_first_contact_date_df.limit(5).toPandas()

In [0]:
########################################################
## Define utility for getting max or min date from array
## Input: list
########################################################
def get_list_value(l, val='max'):
  """Return the largest or the smallest element of a list

  Parameters:
  l (list): Values to reduce; anything that is not a list (including None) is rejected
  val (str): 'max' for the largest element, 'min' for the smallest

  Returns:
  The selected element, or None when `l` is None, is not a list, is empty,
  or `val` is neither 'max' nor 'min'

  """
  ## Resolve min/max through the builtins module: a later cell of this notebook runs
  ## `from pyspark.sql.functions import min`, which rebinds the global name `min` to the
  ## Spark column function and would break this helper if the cell were re-run afterwards
  import builtins

  if not isinstance(l, list) or not l:
    return None
  if val == 'max':
    return builtins.max(l)
  if val == 'min':
    return builtins.min(l)
  return None
## Timestamp UDFs built on get_list_value: smallest / largest element of a list column.
## max_date_udf is not used in the visible part of this notebook.
min_date_udf = F.udf(lambda x: get_list_value(x, val='min'), TimestampType())
max_date_udf = F.udf(lambda x: get_list_value(x, val='max'), TimestampType())

## Load the merged covid test results and WHO table
## Keeps records dated no earlier than 2 days before first_contact_date, reduces admission_dt and
## discharge_dt to their smallest element (presumably both are array columns -- the UDF returns
## None for non-list input; verify), and derives encounter_duration in days
## (seconds / 86400, rounded to 2 decimals).
## NOTE(review): discharge_dt also uses the *min* UDF -- confirm that the earliest discharge is intended.
table_name = "rdp_phi_sandbox.qw_{}_covidtests_wos2".format(file_date)
select_cols = ['patient_id', 'record_dt', 'who_score', 'first_contact_date', 'encountertype', 'patientclass', 'results']
who_data_df = spark.sql("SELECT * FROM {}".format(table_name)) \
  .where(F.col('record_dt') >= F.date_sub('first_contact_date', 2)) \
  .select(*select_cols, min_date_udf(F.col('admission_dt')).alias('admission_dt'),
          min_date_udf(F.col('discharge_dt')).alias('discharge_dt')) \
.withColumn('encounter_duration',
            F.round((F.unix_timestamp('discharge_dt') - F.unix_timestamp('admission_dt'))/86400, 2)) \
.dropDuplicates()

## Get first encounter after contact date for each patient
## first_admission_dt / first_discharge_dt are the values of the earliest record_dt within each
## (patient_id, first_contact_date) partition; only rows whose admission_dt equals that first
## admission are kept.
## NOTE(review): the where() refers to admission_dt, which is not in the select list, so it relies
## on Spark resolving the column from the underlying frame -- verify on the cluster's Spark version.
## This rebinds first_encounter_df, which earlier held the result of the first_encounter SQL query.
partition_by_cols = ['patient_id', 'first_contact_date']
w = Window.partitionBy(*partition_by_cols).orderBy('record_dt') \
  .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
first_encounter_df = who_data_df \
  .select(*partition_by_cols, F.first('admission_dt').over(w).alias('first_admission_dt'),
          F.first('discharge_dt').over(w).alias('first_discharge_dt')) \
  .where(F.col('admission_dt') == F.col('first_admission_dt')).dropDuplicates()

## Get first, max and last WHO score for each patient
## The first encounter is joined back to all of its WHO records; first/last follow record_dt order
## within the (patient_id, first_contact_date, admission_dt, discharge_dt) partition.
partition_by_cols = ['patient_id', 'first_contact_date', 'admission_dt', 'discharge_dt']
w = Window.partitionBy(*partition_by_cols).orderBy('record_dt') \
  .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
patient_who_data_df = first_encounter_df \
  .withColumnRenamed('first_admission_dt', 'admission_dt') \
  .withColumnRenamed('first_discharge_dt', 'discharge_dt') \
  .join(who_data_df, partition_by_cols, how='left') \
  .select(*partition_by_cols, 'encountertype', 'patientclass', 'results', 'encounter_duration',
          F.first('who_score').over(w).alias('who_score_first'),
          F.max('who_score').over(w).alias('who_score_max'),
          F.last('who_score').over(w).alias('who_score_last')) \
  .dropDuplicates()

## Get datetime where max WHO score was first reached
## who_score_max is renamed to who_score so it can be matched against the individual records;
## the earliest record_dt carrying that score becomes max_who_score_record_dt.
join_cols = ['patient_id', 'who_score', 'admission_dt', 'discharge_dt']
w = Window.partitionBy(join_cols).orderBy('record_dt') \
  .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
max_who_scores_df = patient_who_data_df \
  .select('patient_id', 'who_score_max', 'admission_dt', 'discharge_dt') \
  .withColumnRenamed('who_score_max', 'who_score') \
  .join(who_data_df.select(*join_cols, 'record_dt'), join_cols, how='inner') \
  .select(*join_cols, F.first('record_dt').over(w).alias('max_who_score_record_dt')) \
  .dropDuplicates()

## Merge to get final dataframe
## days_to_max_who_score: days from admission to the first record with the max score.
## days_hospitalized: copy of encounter_duration.
## days_to_min_who_score: days_hospitalized minus days_to_max_who_score, i.e. the time remaining
## after the max score was reached. NOTE(review): confirm the column name matches that meaning.
## Finally only encounters admitted on or after start_date are kept.
join_cols = ['patient_id', 'admission_dt', 'discharge_dt']
who_score_summary_df = patient_who_data_df \
  .join(max_who_scores_df.select(*join_cols, 'max_who_score_record_dt'), join_cols, how='left') \
  .withColumn('days_to_max_who_score',
              F.round((F.unix_timestamp('max_who_score_record_dt') - F.unix_timestamp('admission_dt'))/86400, 2)) \
  .withColumn('days_hospitalized', F.col('encounter_duration')) \
  .withColumn('days_to_min_who_score', F.col('days_hospitalized') - F.col('days_to_max_who_score')) \
  .where(F.col('admission_dt') >= start_date )

## Merge back the covidPositive_first_contact_date column
## (the joined frame also carries covidNegative_first_contact_date)
who_score_summary_df = who_score_summary_df.join(covidPositive_first_contact_date_df, on = "patient_id", how = "left").drop(covidPositive_first_contact_date_df.patient_id)

# # Save the table
# spark.sql("DROP TABLE IF EXISTS rdp_phi_sandbox.qw_{}_covid_wos_tests".format(file_date))
# who_score_summary_df.write.saveAsTable("rdp_phi_sandbox.qw_{}_covid_wos_tests".format(file_date))

# ## refresh the table
# spark.sql("REFRESH TABLE rdp_phi_sandbox.qw_{}_covid_wos_tests".format(file_date))

# ## Print out the first 3 entries
# who_score_summary_df.limit(3).toPandas()

In [0]:
## Persist the WHO score summary, replacing any table left by an earlier run
wos_tests_table_name = "rdp_phi_sandbox.qw_{}_covid_wos_tests".format(file_date)
spark.sql("DROP TABLE IF EXISTS {}".format(wos_tests_table_name))
who_score_summary_df.write.saveAsTable(wos_tests_table_name)

## refresh the table
spark.sql("REFRESH TABLE {}".format(wos_tests_table_name))

In [0]:
# who_score_summary_df.limit(10).toPandas()

## New definition of hospitalization
* Patient status is Admission or Discharged
* Patient class is Inpatient (supplemented with information from the adtevent table)
* COVID test is positive
* Hospitalization admission date >= first positive observation date - 3 days
* Hospitalization admission date <= first positive observation date + 14 days (both bounds make sure the hospitalization is plausibly an outcome of COVID)

In [0]:
## Same encounter / procedure order / lab result join as the covid_tests query, this time also
## carrying e.patientstatus, which the hospitalization logic below needs.
## The contact date is aliased explicitly as contact_date so the column name does not depend on
## how Spark auto-names an un-aliased date(...) expression.
temp_df = spark.sql(
"""
SELECT
  e.pat_id,
  e.instance,
  e.pat_enc_csn_id,
  date(e.contact_date) AS contact_date,
  e.admissiondatetime,
  e.encountertype,
  e.patientclass,
  e.patientstatus,
  lr.observationdatetime,
  lr.resultvalue,
  clean_string(lr.resultvalue) as result_cleaned,
  covid19_resultvalue_map(lr.resultvalue) as result_short,
  lr.flaggedas
FROM rdp_phi.encounter as e
JOIN (
  -- Join procedure orders for NAAT and PCR tests
  SELECT * FROM rdp_phi.procedureorders
  WHERE
    ordername LIKE '%NAAT%' OR
    ordername LIKE '%PCR%') as po
ON
  e.instance = po.instance AND
  e.pat_enc_csn_id = po.pat_enc_csn_id
JOIN (
  -- Join lab results to get results of COVID-19 tests
  SELECT * FROM rdp_phi.labresult
  WHERE commonname in ('SARS CORONAVIRUS 2 RNA ORD', 'POC SARS CORONAVIRUS 2 RNA ORD', 'SARS-COV-2 (COVID-19) QUAL PCR RESULT', 'COV19EX','NAA (COVID-19) (REF)')
or resultname in ('SARS CORONAVIRUS 2 RNA ORD', 'POC SARS CORONAVIRUS 2 RNA ORD', 'SARS-COV-2 (COVID-19) QUAL PCR RESULT', 'COV19EX','NAA (COVID-19) (REF)') ) as lr
ON
  po.instance = lr.instance AND
  po.order_proc_id = lr.order_proc_id
WHERE resultvalue IS NOT NULL
and e.CONTACT_DATE >= TIMESTAMP('${startdate.var}')
and e.CONTACT_DATE <= TIMESTAMP('${enddate.var}')
""")

In [0]:
## Build adtevent_new3_df: for every encounter that can be classified as Inpatient from the
## adtevent table, the earliest inpatient event (one row per PAT_ENC_CSN_ID).

## Get necessary columns from the adtevent table
adtevent_full_df = spark.sql("""select distinct PAT_ID, PAT_ENC_CSN_ID, INSTANCE, BASEPATIENTCLASS, PATIENTCLASS, EVENTTYPE, FIRSTINPATIENT, EVENTTIMESTAMP from rdp_phi.adtevent""")


## Based on checking actual table, there are many nulls can be fill using the Patientclass column (inpatient or outpatient)
from pyspark.sql import functions as F
 
## Create a new column with only inpatient and outpatient info
adtevent_new_df = adtevent_full_df.withColumn("patientclass_only_inpatients_outpatients", F.when(F.col("PATIENTCLASS").isin("Inpatient", "Outpatient"), F.col("PATIENTCLASS") )\
                                             .otherwise(F.lit(None)) )
 
## Use the new column to fill as many null as possible in the BASEPATIENTCLASS column
adtevent_new_df = adtevent_new_df.withColumn("New_patientclass", F.when( F.col("BASEPATIENTCLASS").isNull(),  F.col("patientclass_only_inpatients_outpatients") )\
                                            .otherwise(F.col("BASEPATIENTCLASS") ) )

## Use the logic of eventtype (admission or transfer in) and firstinpatient (yes) to fill as many as nulls again
## Create a new column with only eventtype.isin("Admission", "Transfer In") AND firstinpatient.isin("Y") are set to "Inpatient"
adtevent_new2_df = adtevent_new_df.withColumn("Admission_or_TransferIn_and_firstinpatientYes", F.when( (F.col("eventtype").isin("Admission", "Transfer In") ) & ( F.col("firstinpatient").isin("Y") ), "Inpatient" )\
                                             .otherwise(F.lit(None)) )

## Use the new column to fill as many null as possible in the New_patientclass column
adtevent_new2_df = adtevent_new2_df.withColumn("New_patientclass", F.when( F.col("New_patientclass").isNull(),  F.col("Admission_or_TransferIn_and_firstinpatientYes") )\
                                            .otherwise(F.col("New_patientclass") ) )


#####################################################
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number
######################################################

## Only filter those patients are more confident to be inpatient 
## relax the selection to no need of limit to firstinpatient == Y
adtevent_inpatient_df = adtevent_new2_df.where( F.col("New_patientclass") == "Inpatient" ).dropDuplicates()

## Drop those "helping" columns
## NOTE(review): PAT_ID and INSTANCE are dropped too, so everything below is keyed on
## PAT_ENC_CSN_ID alone -- confirm encounter ids are unique across instances.
cols_to_drop = ("patientclass_only_inpatients_outpatients", "Admission_or_TransferIn_and_firstinpatientYes", "BASEPATIENTCLASS", "PATIENTCLASS",
                "PAT_ID", "INSTANCE")
adtevent_inpatient_df = adtevent_inpatient_df.drop(*cols_to_drop).dropDuplicates()

## Create a window to only take the first row of event time stamp as the first hospitalization date
w = Window.partitionBy("PAT_ENC_CSN_ID").orderBy(col("EVENTTIMESTAMP").asc())

adtevent_inpatient_Y_df = adtevent_inpatient_df.withColumn("row",row_number().over(w)) \
  .filter(col("row") == 1).drop("row")

## Drop those additional columns not needed for later left join
## The trailing comma matters: ("EVENTTYPE") is just a string, and unpacking it passed the
## single letters "E", "V", ... to drop(), which silently left EVENTTYPE in place.
cols_to_drop = ("EVENTTYPE",)
adtevent_new3_df = adtevent_inpatient_Y_df.drop(*cols_to_drop).dropDuplicates()

In [0]:
## Left join with the pat_enc_csn_id column
## Every test row of temp_df is kept; the adtevent columns are NULL when the encounter has no
## inpatient event.
# hos_df = temp_df.join(adtevent_new3_df, (temp_df.pat_enc_csn_id == adtevent_new3_df.PAT_ENC_CSN_ID) & (temp_df.admissiondatetime == adtevent_new3_df.EVENTTIMESTAMP), how = "left").drop(adtevent_new3_df.PAT_ENC_CSN_ID)

## Checked, the row counts == distinct pat_enc code which means each id now only have one time record.
## NOTE(review): the join key is the encounter id only (instance is not compared) -- confirm
## encounter ids cannot collide between instances.
hos_df = temp_df.join(adtevent_new3_df, (temp_df.pat_enc_csn_id == adtevent_new3_df.PAT_ENC_CSN_ID), how = "left").drop(adtevent_new3_df.PAT_ENC_CSN_ID)

# temp2_df.limit(50).toPandas()

In [0]:
## An encounter counts as Inpatient when either
##  (a) the encounter table says patientclass Inpatient with an Admission/Discharged status, or
##  (b) the class derived from the adtevent table is Inpatient
patient_status_list = ['Admission', 'Discharged']

inpatient_by_encounter = (F.col("patientclass") == "Inpatient") & F.col("patientstatus").isin(patient_status_list)
inpatient_by_adtevent = F.col("new_patientclass") == "Inpatient"

hos2_df = hos_df.withColumn(
  "hos_patient_class",
  F.when(inpatient_by_encounter | inpatient_by_adtevent, "Inpatient").otherwise(F.lit(None)))

In [0]:
## Earliest known hospitalization time of an Inpatient encounter: the adtevent timestamp when the
## admission time is missing or later than it, otherwise the admission time; NULL for
## encounters that are not Inpatient
is_inpatient = F.col("hos_patient_class") == "Inpatient"
prefer_event_timestamp = F.col("admissiondatetime").isNull() | (F.col("admissiondatetime") > F.col("EVENTTIMESTAMP"))

hos2_df = hos2_df.withColumn(
  "hos_earliest_time",
  F.when(is_inpatient & prefer_event_timestamp, F.col("EVENTTIMESTAMP"))
   .when(is_inpatient, F.col("admissiondatetime"))
   .otherwise(F.lit(None)))

In [0]:
# display(hos2_df)

In [0]:
## Overwrite the previous hos dataframe
hos_df = hos2_df.select("*")

## Flag test rows that are positive AND belong to an Inpatient encounter.
## The derived hos_patient_class column is used instead of the raw patientclass;
## there is no restriction to firstinpatient == "Y".
positive_and_inpatient = (F.col("result_short") == 'positive') & (F.col("hos_patient_class") == 'Inpatient')
hos_df = hos_df.withColumn("hospitalized_pos", F.when(positive_and_inpatient, 'yes').otherwise('no'))

In [0]:
## NOTE(review): this import rebinds the notebook-level names `min` and `first` to the Spark
## column functions, hiding Python's builtin min from here on; only `first` is used in this cell.
from pyspark.sql.functions import min, first

###Find the minimum positive test time
## pos_observationdatetime is the observation time of positive rows and NULL for all other rows
hos_df = hos_df.withColumn("pos_observationdatetime",\
                       when(hos_df.result_short == 'positive', hos_df.observationdatetime)\
                       .otherwise(F.lit(None)))

## Earliest positive observation per (pat_id, instance): the first non-NULL value of the window
## ordered by pos_observationdatetime, spanning the whole partition.
## NOTE(review): the original remark here said Spark's min does not ignore NULL values; as far as
## I know the Spark min aggregate skips NULLs like SQL MIN does, so F.min over the partition would
## presumably give the same result -- confirm before simplifying.
w = Window.partitionBy(['pat_id','instance']).orderBy("pos_observationdatetime").rowsBetween(Window.unboundedPreceding,Window.unboundedFollowing)
hos_df = hos_df.withColumn("min_positive_obs", first('pos_observationdatetime', ignorenulls=True).over(w))

## Our definition of hospitalization is that admission date is within [min_positive - 3 , min_positive + 14]
## Date addition use function: date_add
## Date subtraction use function: date_sub

## Check if the earliest hospitalization date is within window or not
## "In" requires hospitalized_pos == "yes" and hos_earliest_time inside the window, else "Out".
## NOTE(review): date_sub/date_add yield dates while hos_earliest_time looks like a timestamp, so
## the upper bound is presumably midnight at the start of day +14 -- confirm whether admissions
## later on that day should count.
hos_df = hos_df.withColumn("hos_before", when((hos_df.hospitalized_pos == "yes") &\
                                              (F.date_sub(hos_df.min_positive_obs, 3) <= hos_df.hos_earliest_time) &\
                                              (hos_df.hos_earliest_time <= F.date_add(hos_df.min_positive_obs, 14)),"In")\
                       .otherwise("Out"))


## Numeric 1/0 version of the flag, evaluated per test row
hos_df = hos_df.withColumn("hospitalized_after_positive", when((hos_df.hospitalized_pos == "yes") & (hos_df.hos_before == "In"), F.lit(1))\
                       .otherwise(F.lit(0)))

In [0]:
## merge pat_id + instance
hos_df = hos_df.withColumn('patient_id', F.concat(F.col('instance'), F.col('pat_id')))

## hospitalized_after_positive is computed per test row, so one patient can carry both 0 and 1.
## Selecting distinct (patient_id, flag) pairs therefore left up to two rows per patient, which
## multiplied rows in the later join and made the flag depend on which duplicate survived
## de-duplication. Collapse to exactly one row per patient: 1 if any row qualified, otherwise 0.
keep_cols = ["patient_id", "hospitalized_after_positive"]
hospitalization_pat_id_df = hos_df.select(*keep_cols) \
  .groupBy("patient_id") \
  .agg(F.max("hospitalized_after_positive").alias("hospitalized_after_positive"))

In [0]:
# hospitalization_pat_id_df.limit(10).toPandas()

In [0]:
# hospitalization_pat_id_df.count()

In [0]:
# hospitalization_pat_id_df.select('patient_id', 'hospitalized_after_positive').dropDuplicates().groupBy('hospitalized_after_positive').count().toPandas()

In [0]:
# who_score_summary_df.count()

In [0]:
## Merge back the hospitalized_after_positive column
## Earlier left-join variant, kept for reference:
# who_score_summary_df = who_score_summary_df.join(hospitalization_pat_id_df, on = "patient_id", how = "left").drop(hospitalization_pat_id_df.patient_id)

## Inner join: patients without a row in hospitalization_pat_id_df are removed from the summary
who_score_summary_df_new = who_score_summary_df.join(hospitalization_pat_id_df, on = "patient_id", how = "inner").drop(hospitalization_pat_id_df.patient_id)

In [0]:
## Persist the summary that now carries hospitalized_after_positive, replacing any earlier copy
hos_table_name = "rdp_phi_sandbox.qw_{}_covid_wos_tests_hos".format(file_date)
spark.sql("DROP TABLE IF EXISTS {}".format(hos_table_name))
who_score_summary_df_new.write.saveAsTable(hos_table_name)

## refresh the table
spark.sql("REFRESH TABLE {}".format(hos_table_name))

## Print out the first 3 entries
# who_score_summary_df.limit(5).toPandas()

In [0]:
## Reload the table written by the save cell above (table_name is rebound here; it previously
## pointed at the covidtests_wos2 table)
table_name = "qw_{}_covid_wos_tests_hos".format(file_date)
covid_wos_tests_df = spark.sql("SELECT * FROM rdp_phi_sandbox.{}".format(table_name))

## maybe need cleanup?
# covid_wos_tests_df.groupBy("results").count().show()

In [0]:
from pyspark.sql.functions import substring, length, col, expr
## Recover pat_id from patient_id by taking 20 characters starting at position 5.
## patient_id was built as concat(instance, pat_id), so this assumes the instance prefix is
## exactly 4 characters long and pat_id has at most 20 characters -- NOTE(review): confirm for
## every instance value.
covid_wos_tests_df = covid_wos_tests_df.withColumn("pat_id", expr("substring(patient_id, 5, 20)"))
## sanity check
# covid_wos_tests_df.limit(3).toPandas()

In [0]:
## Index date of a patient: the first positive contact date when there is one, otherwise the
## first negative contact date
covid_wos_tests_df = covid_wos_tests_df.withColumn(
  "decided_index_date",
  F.coalesce(F.col("covidPositive_first_contact_date"), F.col("covidNegative_first_contact_date")))
## sanity check
# covid_wos_tests_df.limit(3).toPandas()

### COVID19 + WHO Scores Task completed :)
-> Next: get patient and patient race table

In [0]:
%run
"/Users/qi.wei1@providence.org/working_folder/IMIDs-COVID19-projects/related_libraries/qw_mainEHR(Patient_Patientrace)"

In [0]:
## Join pos_outcomes and patient tables
## `pats` is not defined in this notebook; it presumably comes from the %run of the patient library above.
df1 = pats.join(covid_wos_tests_df, on=["pat_id"], how = 'left').drop(pats.instance)

## remove duplicates that were added after joining patient tables
df1 = df1.drop_duplicates(['pat_id'])

## Derived columns: 'age' in years from birthdate
## Using the "decided_index_date" to decide the patients' age at first positive or negative date
## NOTE(review): bround(days / 365) rounds to the nearest year rather than truncating, so a
## patient aged 17.6 is stored as 18 -- confirm this is the intended definition of age.
from pyspark.sql.functions import datediff, to_date, lit
covid_pats  = df1.withColumn("age", F.bround(datediff(col("decided_index_date"), col("birthdate"))/365))
## Rows whose results value is NULL (patients without a matching test row) are removed here as
## well, because a comparison with NULL is never true
covid_pats = covid_pats.filter(col("results") != "Unknown")

## age_ranges from age
from pyspark.sql.functions import udf

def _age_to_range(age):
  """Return the age bracket label for an age in years ('' for a missing or negative age)"""
  ## age is NULL when birthdate or decided_index_date is missing; comparing None with a number
  ## raises TypeError in Python 3 and would fail the whole Spark job
  if age is None:
    return ''
  if 0 <= age < 18:
    return '0-17'
  if 18 <= age < 50:
    return '18-49'
  if 50 <= age < 75:
    return '50-74'
  if age >= 75:
    return '75+'
  return ''

age_range = udf(_age_to_range)
covid_pats  = covid_pats.withColumn('age_range', age_range(covid_pats.age))
# drop_cols = ['birthdate', 'date_of_death']
drop_cols = ['birthdate']
covid_pats = covid_pats.drop(*drop_cols)
# print('covid_pats with all ages: {}'.format(covid_pats.count()))

######################################################
## After comment we decided to use all age patients
######################################################
## get only adult patients
# covid_adult_pats = covid_pats.filter(col("age") >= "18")
# print('covid_pats >=18 years: {}'.format(covid_adult_pats.count()))
# covid_adult_pats.limit(5).toPandas()

In [0]:
## Persist the all-age patient table, replacing any table left by an earlier run
all_age_table_name = "rdp_phi_sandbox.qw_{}_all_age_patients_covid_wos_tests".format(file_date)
spark.sql("DROP TABLE IF EXISTS {}".format(all_age_table_name))
covid_pats.write.saveAsTable(all_age_table_name)

In [0]:
# covid_pats.groupBy(col("results")).count().show()

In [0]:
# covid_pats.groupBy(col("results"), col("who_score_max")).count().orderBy("who_score_max", "results").show()

### Cleanup covid_adult_tests in R
- create csv

In [0]:
## Reload the all-age patient table and remove the two encounter-level columns
covid_pats = spark.sql("""SELECT * FROM rdp_phi_sandbox.qw_{}_all_age_patients_covid_wos_tests""".format(file_date))
# cols = ['encountertype', 'patientclass','max_who_score_record_dt']
cols = ['encountertype', 'patientclass']
covid_pats = covid_pats.drop(*cols)

from pyspark.sql.functions import countDistinct
## Print out the number of patient counts for each results
## (distinct pat_id per results value; being the last expression of the cell, the pandas frame
## is shown as the cell output)
covid_pats.groupBy('results').agg(countDistinct('pat_id')).toPandas()

Unnamed: 0,results,count(pat_id)
0,Positive,187934
1,Negative,1458900


In [0]:
## Persist the trimmed dataframe; any previous version of the table is dropped first
patients2_target = "rdp_phi_sandbox.qw_{}_all_age_covid_patients2".format(file_date)
spark.sql("DROP TABLE IF EXISTS {}".format(patients2_target))
covid_pats.write.saveAsTable(patients2_target)

In [0]:
# covid_pats.limit(5).toPandas()

In [0]:
# import pyspark.sql.functions as func
# covid_pats.groupBy('pat_id').agg(func.countDistinct('race')).orderBy('count(race)', ascending=False).show()

### Create a clean all-age COVID-tested patients table and use it for downstream analysis

In [0]:
########################
## Pre-processing
######################
## Read dataframe
patients2_source = "rdp_phi_sandbox.qw_{}_all_age_covid_patients2".format(file_date)
covid_pats = spark.sql("SELECT * FROM {}".format(patients2_source))

##########################################################################
## Instead of omitting Unknown, Other and None values of sex,
## we group them together into a single "Unknown" class, which means missing
#########################################################################
# Replace part of a string with another string
from pyspark.sql.functions import regexp_replace

# covid_pats = covid_pats.filter(col("sex") != "Unknown").filter(col("sex") != "Other").filter(col("sex") != "None")
## "Other" -> "Unknown", then null -> "Unknown"
## NOTE(review): a literal "None" string in sex (if the source data contains one) is not
## remapped by either step - confirm whether that can occur
sex_with_other_merged = regexp_replace("sex", "Other", "Unknown")
covid_pats = covid_pats.withColumn("sex", sex_with_other_merged)\
                       .fillna(value="Unknown", subset=["sex"])

## Ethnicity was mapped using both ethnicgroup and race columns
## source: https://www.census.gov/newsroom/blogs/random-samplings/2021/08/measuring-racial-ethnic-diversity-2020-census.html
from pyspark.sql.functions import col, when, to_date

## normalize ethnicity
## Notice: Here is the place to separate Unknown and Other race definitions
## Fill all None values with Unknown
covid_pats = covid_pats.fillna(value="Unknown", subset=["race", "ethnicgroup"])

## Race values that carry no race information on their own
uninformative_races = ["Other", "Patient Refused", "Unknown"]

## "Hispanic": ethnic group is Hispanic/Latino AND race is uninformative or White
## NOTE(review): Hispanic/Latino patients with any other race value fall through to
## "Not Hispanic" below - confirm this is intended
is_hispanic = (col("ethnicgroup") == "Hispanic or Latino") \
              & col("race").isin(uninformative_races + ["White or Caucasian"])

## "Unknown": neither the ethnic group nor the race carries information
is_unknown_ethnicity = col("ethnicgroup").isin("None", "Patient Refused", "Unknown") \
                       & col("race").isin(uninformative_races)

ethnicity_expr = when(is_hispanic, "Hispanic")\
                 .when(is_unknown_ethnicity, "Unknown")\
                 .otherwise("Not Hispanic")
covid_pats = covid_pats.withColumn("ethnicity", ethnicity_expr).drop("ethnicgroup")
## normalize race
## figure out how to do mixed races
## (raw race value, short label) pairs; any race value not listed here becomes null
## because the chain has no otherwise() branch
race_labels = [
  ("American Indian or Alaska Native", "AIAN"),
  ("Asian", "Asian"),
  ("Black or African American", "Black"),
  ("Native Hawaiian or Other Pacific Islander", "NHPI"),
  ("Other", "Other"),
  ## Separate the unknown as an independent race factor
  ("Patient Refused", "Unknown"),
  ("Unknown", "Unknown"),
  ("White or Caucasian", "White"),
]

race1_expr = None
for raw_race, race_label in race_labels:
  race_matches = (col("race") == raw_race)
  race1_expr = when(race_matches, race_label) if race1_expr is None else race1_expr.when(race_matches, race_label)

covid_pats = covid_pats.withColumn("race1", race1_expr).drop("race")

## normalize race "Other" and "Unknown" using ethnicity
## Fix: the "Unknown Hispanic" / "Unknown Not Hispanic" branches previously repeated the
## race1 == "Other" conditions of the first two branches and could never match; they now
## test race1 == "Unknown" as their labels indicate.
## NOTE(review): Unknown-race patients with Hispanic / Not Hispanic ethnicity now get the
## "Unknown Hispanic" / "Unknown Not Hispanic" labels instead of plain "Unknown" - make sure
## downstream steps that read race_v2 or ethnicity_race expect these categories.
covid_pats = covid_pats.withColumn("race_v2",
                                     when((col("race1") == "Other") & (col("ethnicity") == "Hispanic"), "Other Hispanic")\
                                     .when((col("race1") == "Other") & (col("ethnicity") == "Not Hispanic"), "Other Not Hispanic")\
                                     .when((col("race1") == "Unknown") & (col("ethnicity") == "Hispanic"), "Unknown Hispanic")\
                                     .when((col("race1") == "Unknown") & (col("ethnicity") == "Not Hispanic"), "Unknown Not Hispanic").otherwise(col("race1")))
                                     
## combine ethnicity and race
## (race_v2 value, ethnicity value, combined label); rows matching none of these rules
## keep their race_v2 value
ethnicity_race_rules = [
  ("White", "Hispanic", "Hispanic White"),
  ("White", "Not Hispanic", "Not Hispanic White"),
  ("Black", "Not Hispanic", "Not Hispanic Black"),
  ("AIAN", "Not Hispanic", "Not Hispanic AIAN"),
  ("Asian", "Not Hispanic", "Not Hispanic Asian"),
  ("NHPI", "Not Hispanic", "Not Hispanic NHPI"),
  ("Other", "Unknown", "Unknown Other"),
]

ethnicity_race_expr = None
for race_value, ethnicity_value, combined_label in ethnicity_race_rules:
  rule_matches = (col("race_v2") == race_value) & (col("ethnicity") == ethnicity_value)
  if ethnicity_race_expr is None:
    ethnicity_race_expr = when(rule_matches, combined_label)
  else:
    ethnicity_race_expr = ethnicity_race_expr.when(rule_matches, combined_label)

covid_pats = covid_pats.withColumn("ethnicity_race", ethnicity_race_expr.otherwise(col("race_v2")))

## Patient outcomes
## hospitalized
## Old one for posters
# adult_covid = adult_covid.withColumn("hospitalized",
#                                      when((col("who_score_max") <= 3), "no")\
#                                      .when(col("days_hospitalized").isNull(), "no")\
#                                      .otherwise("yes"))

## Sevda's hospitalized definition
## Now use the "hospitalized_after_positive" column

###################################################
## Invasive mechanical ventilation (IMV)
## Now use the "IMV_after_positive" column
###################################################
## Old, need cleanup
# covid_pats = covid_pats.withColumn("invasive_mechanical_vent",\
#                                      when((col("who_score_max") == 6) | (col("who_score_max") ==7), "yes")\
#                                      .when(col("who_score_max").isNull(), "no")\
#                                      .otherwise("no"))

## Date part of the timestamp at which the maximum WHO score was recorded
covid_pats = covid_pats.withColumn("max_who_date", to_date("max_who_score_record_dt"))

## IMV_after_positive is 1 when ALL of the following hold, otherwise 0:
##   - the patient's result is Positive
##   - the maximum WHO score is 6 or 7
##   - the max-score date lies between the first positive contact date and 30 days
##     after it (both ends inclusive)
imv_first_positive = col("covidPositive_first_contact_date")
imv_score_reached = (col("who_score_max") == 6) | (col("who_score_max") == 7)
imv_within_30_days = col("max_who_date").between(imv_first_positive, F.date_add(imv_first_positive, 30))
imv_condition = (col("results") == "Positive") & imv_score_reached & imv_within_30_days

## A null who_score_max never satisfies the condition, so it falls into otherwise(0)
covid_pats = covid_pats.withColumn("IMV_after_positive", when(imv_condition, F.lit(1)).otherwise(F.lit(0)))

####################################################
## death
## Now use the "death_after_positive" column
#####################################################
## general death, unrelated to covid-19 infection
## not used now
# covid_pats = covid_pats.withColumn("death",\
#                                      when((col("who_score_max") == 8), "yes")\
#                                      .when(col("who_score_max").isNull(), "no")\
#                                      .otherwise("no"))

## death_after_positive is 1 for a Positive patient when, within the window from the first
## positive contact date to 30 days after it (both ends inclusive), either
##   - the recorded date_of_death falls in the window, or
##   - the maximum WHO score is 8 and its record date (max_who_date) falls in the window;
## otherwise 0
death_first_positive = col("covidPositive_first_contact_date")
death_window_end = F.date_add(death_first_positive, 30)

died_in_window = col("date_of_death").between(death_first_positive, death_window_end)
who8_in_window = (col("who_score_max") == 8) & col("max_who_date").between(death_first_positive, death_window_end)
death_condition = (col("results") == "Positive") & (died_in_window | who8_in_window)

## A null who_score_max adds nothing beyond otherwise(0), so no separate branch is needed
covid_pats = covid_pats.withColumn("death_after_positive", when(death_condition, F.lit(1)).otherwise(F.lit(0)))

## select some columns and save df
## Old code, need cleanup
# cols = ['pat_id', 'patient_id', 'age', 'age_range', 'sex', 'ethnicity', 'race1', 'race_v2', 'ethnicity_race','first_contact_date','covidPositive_first_contact_date', 'admission_dt', 'discharge_dt', 'results', 'who_score_max', 'days_to_max_who_score', 'days_hospitalized', 'hospitalized', 'invasive_mechanical_vent', 'death', 'encounter_duration']

## Columns kept in the saved table
cols = [
  'pat_id', 'patient_id', 'age', 'age_range', 'sex',
  'ethnicity', 'race1', 'race_v2', 'ethnicity_race',
  'first_contact_date', 'covidPositive_first_contact_date', 'decided_index_date',
  'admission_dt', 'discharge_dt', 'results',
  'who_score_max', 'days_to_max_who_score', 'days_hospitalized',
  'hospitalized_after_positive', 'IMV_after_positive', 'death_after_positive',
  'encounter_duration',
]
covid_pats = covid_pats.select(*cols)

# print('all COVID-19 cases : %s' % covid_pats.count())
# covid_pats.limit(10).toPandas()

## Replace any previous version of the table with the freshly computed one
race_ethnicity_table = "rdp_phi_sandbox.qw_all_age_covid_patients_{}_wRaceEthnicity".format(file_date)
spark.sql("DROP TABLE IF EXISTS {}".format(race_ethnicity_table))
covid_pats.write.saveAsTable(race_ethnicity_table)

In [0]:
%run
"/Users/qi.wei1@providence.org/working_folder/IMIDs-COVID19-projects/related_libraries/Python"

### Filter out patients whose first encounter is for a COVID test; add the BMI feature
*  Starts from all patients, including those whose first encounter is only COVID-related (i.e., patients for whom we have no medication history)
*  Adds a BMI column

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Column
# from pyspark.sql.functions import *
from pyspark.sql.functions import col, when

## Input: height as a feet/inches string, e.g. 5' 10"
def convert_height(height):
  """Convert a feet/inches height string to meters

  The first two numbers found in the string are read as feet and inches. Whole numbers
  are matched as a unit (so 5' 10" is 5 ft 10 in, not 5 ft 1 in), and a decimal part
  on either number is kept.

  Parameters:
  height (str): Height such as 5' 10". May be None

  Result:
  float: Height in meters, or None if height is None or does not contain two numbers
  """
  import re
  if height is None:
    return None

  # r"\d" would split "10" into "1" and "0"; match complete numbers instead
  numbers = re.findall(r"\d+(?:\.\d+)?", height)
  if len(numbers) < 2:
    # Cannot tell feet from inches with fewer than two numbers; treat as missing
    return None

  inch = 12*float(numbers[0]) + float(numbers[1])
  meter = inch * 0.0254
  return meter

## Spark UDF wrapper around convert_height; returns the height in meters as a double
convert_height_udf = F.udf(convert_height, DoubleType())

def aggregate_data(df, partition_columns, aggregation_columns, order_by=None):
  """Aggregate data over specified partition columns

  Each requested aggregate is computed as a window function over the partition and added
  as a column named '<field>_<function>'. The result keeps only the partition columns and
  the new aggregate columns, with duplicate rows removed.

  Parameters:
  df (PySpark): Dataframe to aggregate
  partition_columns (str or list): Field(s) in df on which to partition. If partitioning on only one
                                   column, the column name can be provided as a str rather than a
                                   list
  aggregation_columns (dict): Must be a dict where the keys are fields in df to aggregate and values
                              are either a str or list, specifying the aggregation functions to use.
                              If using only one aggregation function for a given field, the name of
                              the aggregation function can be provided as a str rather than a list.
                              A separate column will be added for each aggregation function.
                              The dict passed in is not modified.
  order_by (str or list): Field(s) in df to use for ordering records in each partition. If None, do
                          not order. If ordering on only one column, the column name can be provided
                          as a str rather than a list. Order-sensitive functions ('first', 'last',
                          'collect_list', 'concat_ws') only give a defined result when this is set.

  Result:
  PySpark df: Dataframe containing the aggregated results

  Raises:
  AssertionError: If df is not a DataFrame, a referenced column is missing from df, or an
                  aggregation function name is not supported

  """
  # First argument must be a PySpark dataframe
  assert(isinstance(df, DataFrame))

  # Input dataframe must contain specified partition columns
  partition_columns = partition_columns if isinstance(partition_columns, list) else [partition_columns]
  assert(all([s in df.columns for s in partition_columns]))

  # Perform validity checks on aggregation_columns
  assert(isinstance(aggregation_columns, dict))
  assert(all([s in df.columns for s in aggregation_columns]))
  valid_agg_functions = ['avg', 'collect_list', 'collect_set', 'concat_ws', 'count', 'first', 'last', 'max', 'mean', 'median', 'min', 'stddev', 'sum']
  # Normalize into a new dict so the caller's dict is left untouched
  agg_spec = {}
  for field, funcs in aggregation_columns.items():
    funcs = funcs if isinstance(funcs, list) else [funcs]
    assert(all([s in valid_agg_functions for s in funcs]))
    agg_spec[field] = funcs

  # order_by (if not None) must contain valid column names
  if order_by is not None:
    order_by = order_by if isinstance(order_by, list) else [order_by]
    assert(all([s in df.columns for s in order_by]))

  # Define partition window; when ordered, the frame is widened to the whole partition so
  # every row sees the same aggregate value
  w = Window.partitionBy(partition_columns)
  if order_by is not None:
    w = w.orderBy(order_by).rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

  # Aggregation name -> builder taking the source column and returning the windowed aggregate.
  # Lambdas keep 'median_udf' (not defined in this function; presumably provided elsewhere in
  # the notebook - TODO confirm) from being looked up unless 'median' is requested.
  builders = {
    'avg': lambda c: F.avg(c).over(w),
    'mean': lambda c: F.avg(c).over(w),
    'collect_list': lambda c: F.collect_list(c).over(w),
    'collect_set': lambda c: F.collect_set(c).over(w),
    'concat_ws': lambda c: F.concat_ws(';', F.collect_list(c).over(w)),
    'count': lambda c: F.count(c).over(w),
    'first': lambda c: F.first(c).over(w),
    'last': lambda c: F.last(c, ignorenulls = True).over(w),
    'max': lambda c: F.max(c).over(w),
    'min': lambda c: F.min(c).over(w),
    'median': lambda c: median_udf(F.collect_list(c).over(w)),
    'stddev': lambda c: F.stddev(c).over(w),
    'sum': lambda c: F.sum(c).over(w),
  }
  # Aggregations for which a boolean field is cast to 0/1 first, with the name used in the log line
  bool_cast_labels = {'avg': 'avg/mean', 'mean': 'avg/mean', 'stddev': 'stddev', 'sum': 'sum'}

  # Add aggregate columns
  results_df = df
  new_columns = []
  for field, funcs in agg_spec.items():
    # Check for boolean field (must be converted to 0/1 for some aggregation functions)
    bool_type = (dict(results_df.dtypes)[field] == 'boolean')

    for func_name in funcs:
      col_name = '_'.join([field, func_name])
      new_columns.append(col_name)
      print("Adding new column '{}'...".format(col_name))

      source = F.col(field)
      if bool_type and func_name in bool_cast_labels:
        print("Casting boolean column '{}' to integer to calculate {}...".format(field, bool_cast_labels[func_name]))
        source = source.cast(IntegerType())

      results_df = results_df.withColumn(col_name, builders[func_name](source))

  # Process the final dataframe for return
  final_columns = partition_columns + new_columns
  results_df = results_df.select(final_columns).dropDuplicates()

  return results_df

In [0]:
from pyspark.sql import DataFrame

#########################################################
# Generate table that needed for step 4
########################################################
## Encounter-level weight/height for instance 1000
enc_field = "pat_id, instance, contact_date, weight, height"
enc_query = "SELECT %s FROM rdp_phi.encounter WHERE instance ='1000'" %enc_field
df_enc = spark.sql(enc_query) 

## get the covid patients df saved by the race/ethnicity step
covid_pats = spark.sql("""SELECT * FROM rdp_phi_sandbox.qw_all_age_covid_patients_{}_wRaceEthnicity""".format(file_date))

## only select needed cols
cols = ['pat_id', 'patient_id', 'age', 'age_range', 'sex', 'ethnicity', 'race1', 'race_v2', 'ethnicity_race', 'results', 'hospitalized_after_positive', 'IMV_after_positive', 'death_after_positive', 'covidPositive_first_contact_date', 'decided_index_date']  

covid_pats = covid_pats.select(*cols).dropDuplicates()

## inner join df_enc + covid_pats
## e_weight: weight scaled by 0.0283495 (kg per ounce - presumably the source weight is in
## ounces; TODO confirm). e_height: height converted to meters by convert_height_udf.
temp_join_df = df_enc.join(covid_pats, on = ["pat_id"], how = "inner") \
                    .withColumn('e_weight', F.col('weight')*0.0283495) \
                    .withColumn('e_height', convert_height_udf(F.col('height'))) \
                    .drop('weight', 'height')

## Take the most recent non-null weight/height per patient.
## Fix: the previous global orderBy("contact_date") did not make 'last' well defined, because
## a window without its own ordering gives no guarantee about row order inside a partition.
## The ordering is now passed to aggregate_data so it is applied inside the window, and the
## ineffective (and expensive) global sort is removed.
partition_by = ['pat_id','instance']
aggregate_by = {'e_weight' : 'last', 
                'e_height' : 'last'}

agg_df = aggregate_data(temp_join_df, partition_columns = partition_by , aggregation_columns = aggregate_by, order_by = 'contact_date')
agg_df = agg_df.withColumnRenamed('e_weight_last', 'weight')\
               .withColumnRenamed('e_height_last', 'height')\
               .withColumn('BMI', F.col('weight')/F.col('height')**2)

## Fill the null values (around 6%) to be all median values
## use the imputer function
from pyspark.ml.feature import Imputer
imputer = Imputer(
    inputCols = ['BMI'],
    outputCols = ["{}_imputed".format(a) for a in ['BMI']]
).setStrategy("median")

agg_impute_df = imputer.fit(agg_df).transform(agg_df)

## Notice!!!
## Temp comment out below line just to get the missingness on BMI
agg_impute_df = agg_impute_df.drop("BMI").withColumnRenamed("BMI_imputed", "BMI")

## left join encounter-level rows + per-patient imputed BMI
df_enc_covid = temp_join_df.join(agg_impute_df, on = ["pat_id"], how = 'left')

## Use decided_index_date or first_contact_date?
## contactdate_before_covid19date holds the encounter date only when it is strictly before
## the index date; otherwise it is null
df_enc_covid = df_enc_covid.withColumn("contactdate_before_covid19date", when((col("contact_date")) < (col("decided_index_date")), (col("contact_date"))))
      

cols = ['pat_id', 'patient_id', 'age', 'age_range', 'sex', 'ethnicity', 'race1', 'race_v2', 'ethnicity_race', 'BMI', 'results', 'hospitalized_after_positive', 'IMV_after_positive', 'death_after_positive', 'contactdate_before_covid19date', 'covidPositive_first_contact_date', 'decided_index_date']

## Only select those cols we want
df_enc_covid = df_enc_covid.select(*cols).dropDuplicates()

# table_name = "rdp_phi_sandbox.qw_all_age_covid_patients_{}_wEncounter_wRaceEthnicity".format(file_date)
# spark.sql("DROP TABLE IF EXISTS rdp_phi_sandbox.qw_all_age_covid_patients_{}_wEncounter_wRaceEthnicity".format(file_date))
# df_enc_covid.write.saveAsTable(table_name)

In [0]:
# table_name = "rdp_phi_sandbox.qw_all_age_covid_patients_{}_wEncounter_wRaceEthnicity".format(file_date)
# df_enc_covid = spark.sql("""SELECT * from rdp_phi_sandbox.qw_all_age_covid_patients_{}_wEncounter_wRaceEthnicity""".format(file_date))

# ## check the df
# # display(df_enc_covid)

In [0]:
# from pyspark.sql.functions import col,isnan,when,count
# search_col=["contactdate_before_covid19date"]
# # print("The total number of patient+encounter records are: ", df_enc_covid.count())

# # print("The encounter number of patients have first contact just due to COVID (those with no contactdate_before_covid19date) are: ")
# # df_enc_covid.select([count(when(col(c).isNull(), c)).alias(c) for c in search_col]).show()

In [0]:
from pyspark.sql.functions import col,isnan,when,count
search_col = ["contactdate_before_covid19date"]

## Keep only the rows that have an encounter date before the index date
df_enc_covid_recurrent_patient_only = df_enc_covid.dropna(subset=search_col)
# print("The total number of patient+encounter records now is: ", df_enc_covid_recurrent_patient_only.count())

########################################################################
## Notice! Sanity check: the null count shown below should be 0 now
########################################################################
print("The current encounter number of patients have first contact just due to COVID (those with no contactdate_before_covid19date) are: ")
null_counts = [count(when(col(name).isNull(), name)).alias(name) for name in search_col]
df_enc_covid_recurrent_patient_only.select(null_counts).show()

print("The current number of unique patients after filtering out first time COVID patients is: ")

## Last expression of the cell, so the notebook displays the distinct patient count
df_enc_covid_recurrent_patient_only.select("pat_id").dropDuplicates().count()

In [0]:
# # print("Missing rate:")
# # print(agg_impute_df.select(col('BMI')).filter(col('BMI').isNull()).count()/agg_impute_df.count()*100)

# ## Only positive or negative patients
# df_BMI_pos_neg = df_enc_covid_recurrent_patient_only.filter( (df_enc_covid_recurrent_patient_only.results == "Positive") | (df_enc_covid_recurrent_patient_only.results == "Negative") )

# ## Only positive
# df_BMI_pos = df_enc_covid_recurrent_patient_only.filter( (df_enc_covid_recurrent_patient_only.results == "Positive"))

# print("Missing tested pts:")
# print(df_BMI_pos_neg.select('pat_id', 'BMI').filter(col('BMI').isNull()).select('pat_id').distinct().count())

# print("Missing positive pts:")
# print(df_BMI_pos.select('pat_id', 'BMI').filter(col('BMI').isNull()).select('pat_id').distinct().count())

In [0]:
# df_enc_covid_recurrent_patient_only.limit(5).toPandas()

In [0]:
## Final table of this step: encounters before the index date, with race/ethnicity and BMI
table_name = "rdp_phi_sandbox.qw_all_age_covid_patients_{}_wEncounter_wRaceEthnicity_nOneTimeCovidPatient_wFirstPosDate".format(file_date)
spark.sql("DROP TABLE IF EXISTS {}".format(table_name))

## Need to refresh the table to avoid issue
df_enc_covid_recurrent_patient_only.write.saveAsTable(table_name)

spark.sql("REFRESH TABLE {}".format(table_name))

In [0]:
# df_enc_covid_recurrent_patient_only.count()

## Task completed, continue to step 2
#### /Users/jennifer.hadlock2@providence.org/GreenerGrass/Qi/Transfer-Qi/COVID-IBD-IMIDs-paper/IMIDs_working_in_progress/Step2_generate_IBD_OtherIMIDs_cohort

In [0]:
######################################################
# The following code is for additional analysis
######################################################
# print("Total number of records in the qw_all_age_covid_patients_{}_wEncounter_wRaceEthnicity table: ".format(file_date), df_enc_covid.count())
# Marker printed at the end of the notebook run
print("Task completed, continue to step 2")