In [1]:
import os
import pandas as pd
import numpy as np
import pyspark 

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession 
from pyspark.sql import functions as F 
from pyspark.sql.types import StringType
from pyspark.sql import dataframe

In [3]:
# Create (or reuse) a local Spark session that uses all available cores.
# The memory settings are supplied on the builder so they are in place before
# the JVM / SparkContext is started; setting them afterwards cannot resize an
# already-running driver.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "3g")
    .config("spark.executor.memory", "3g")
    .getOrCreate()
)

In [4]:
# Request 3g of memory for the executor and for the driver via Java system properties.
# NOTE(review): the PySpark docs say setSystemProperty must be invoked before a
# SparkContext is instantiated, but the session is already created in the cell
# above -- confirm these values actually take effect for this session.
SparkContext.setSystemProperty('spark.executor.memory', '3g')
SparkContext.setSystemProperty('spark.driver.memory', '3g')

In [5]:
# List every (key, value) pair of the active Spark configuration.
spark.sparkContext.getConf().getAll()

[('spark.executor.memory', '3g'),
 ('spark.driver.memory', '3g'),
 ('spark.driver.port', '62736'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.id', 'local-1582567966933'),
 ('spark.driver.host', '192.168.1.82'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.name', 'pyspark-shell')]

### Codes for Chemotherapy and medication
   - ICD-9-CM Diagnosis Code: V58.11.  
   - ICD-9-CM Procedure Code: 99.25 *(Injection or infusion of cancer chemotherapeutic substance)*.
   - HCPCS includes three separate levels of codes:  
        > Level I: the AMA’s CPT codes, which are numeric.  
        > Level II: the HCPCS alphanumeric code set, which primarily includes non-physician products, supplies, and procedures not included in CPT.  
        > Level III: also called HCPCS local codes, were developed by state Medicaid agencies, Medicare contractors, and private insurers for use in specific programs and jurisdictions.    
        
     HCPCS codes for chemotherapy drugs: J9000-J9999.  
     [HCPCS codes summary](https://www.hcpro.com/HIM-284009-8160/Note-similarities-and-differences-between-HCPCS-CPT-codes.html)  
     [HCPCS codes1](https://coder.aapc.com/hcpcs-codes-range/10)
     [HCPCS codes2](https://hcpcs.codes/j-codes/)
   - CPT codes:
     Chemotherapy administration codes, **96400 through 96450, 96542, 96545, and 96549**, are only to be used when reporting chemotherapy administration when the drug being used is an anti-neoplastic and the diagnosis is cancer. The administration of other drugs, such as growth factors, saline, and diuretics, to patients with cancer, or the administration of antineoplastics to patients with a diagnosis other than cancer, are reported with codes 90780 through 90784 as appropriate. 

In [6]:
# CPT codes used to flag chemotherapy claims: 96400-96450 plus 96542, 96545,
# 96549, 90780 and 90784.
# The codes are stored as strings because the HCPCS_CD_* columns they are
# matched against are string columns; this avoids relying on an implicit
# int-to-string cast inside `isin`.
# NOTE(review): the notes above quote "90780 through 90784", but only the two
# endpoints are listed here -- confirm whether 90781-90783 should be included.
cpt_lst = [
    str(code)
    for code in list(range(96400, 96451)) + [96542, 96545, 96549, 90780, 90784]
]

## 1. Sample (Outpatient)

In [6]:
# Load the outpatient feature dataset. No format is passed, so Spark's default
# data source is used.
out_OC = spark.read.load("/Users/jill/Downloads/2020_Project/EDA/concat_data/outpatient/outpatient_oc_feature/")

In [7]:
# Show the column names, types and nullability of the loaded outpatient frame.
out_OC.printSchema()

root
 |-- DESYNPUF_ID: string (nullable = true)
 |-- CLM_ID: long (nullable = true)
 |-- PRVDR_NUM: string (nullable = true)
 |-- ICD9_DGNS_CD_1: string (nullable = true)
 |-- ICD9_DGNS_CD_2: string (nullable = true)
 |-- ICD9_DGNS_CD_3: string (nullable = true)
 |-- ICD9_DGNS_CD_4: string (nullable = true)
 |-- ICD9_DGNS_CD_5: string (nullable = true)
 |-- ICD9_DGNS_CD_6: string (nullable = true)
 |-- ICD9_DGNS_CD_7: string (nullable = true)
 |-- ICD9_DGNS_CD_8: string (nullable = true)
 |-- ICD9_DGNS_CD_9: string (nullable = true)
 |-- ICD9_DGNS_CD_10: string (nullable = true)
 |-- ICD9_PRCDR_CD_1: integer (nullable = true)
 |-- ICD9_PRCDR_CD_2: string (nullable = true)
 |-- ICD9_PRCDR_CD_3: string (nullable = true)
 |-- ICD9_PRCDR_CD_4: string (nullable = true)
 |-- ICD9_PRCDR_CD_5: string (nullable = true)
 |-- ICD9_PRCDR_CD_6: string (nullable = true)
 |-- ADMTNG_ICD9_DGNS_CD: string (nullable = true)
 |-- HCPCS_CD_1: string (nullable = true)
 |-- HCPCS_CD_2: string (nullable = tr

In [8]:
# Number of rows in the outpatient frame.
out_OC.count()

16706

In [9]:
# Cast the first procedure-code column (loaded as integer) to string so it can
# be compared with string codes like the other ICD9_PRCDR_CD_* columns.
# Spark DataFrames are immutable: assigning to `out_OC.ICD9_PRCDR_CD_1` only sets
# a Python attribute and leaves the data untouched, so the cast has to go
# through withColumn and the result has to be re-bound.
out_OC = out_OC.withColumn("ICD9_PRCDR_CD_1", out_OC["ICD9_PRCDR_CD_1"].cast("string"))

In [10]:
from pyspark.sql.types import StringType

In [11]:
# Replace ICD9_PRCDR_CD_1 with its string representation.
out_OC = out_OC.withColumn(
    "ICD9_PRCDR_CD_1",
    F.col("ICD9_PRCDR_CD_1").cast("string"),
)

In [17]:
# Fill nulls with the placeholder "0"; a string fill value only affects string columns.
out_OC = out_OC.na.fill("0")

In [18]:
# Re-inspect the schema after the cast and the null fill.
out_OC.printSchema()

root
 |-- DESYNPUF_ID: string (nullable = false)
 |-- CLM_ID: long (nullable = true)
 |-- PRVDR_NUM: string (nullable = false)
 |-- ICD9_DGNS_CD_1: string (nullable = false)
 |-- ICD9_DGNS_CD_2: string (nullable = false)
 |-- ICD9_DGNS_CD_3: string (nullable = false)
 |-- ICD9_DGNS_CD_4: string (nullable = false)
 |-- ICD9_DGNS_CD_5: string (nullable = false)
 |-- ICD9_DGNS_CD_6: string (nullable = false)
 |-- ICD9_DGNS_CD_7: string (nullable = false)
 |-- ICD9_DGNS_CD_8: string (nullable = false)
 |-- ICD9_DGNS_CD_9: string (nullable = false)
 |-- ICD9_DGNS_CD_10: string (nullable = false)
 |-- ICD9_PRCDR_CD_1: string (nullable = false)
 |-- ICD9_PRCDR_CD_2: string (nullable = false)
 |-- ICD9_PRCDR_CD_3: string (nullable = false)
 |-- ICD9_PRCDR_CD_4: string (nullable = false)
 |-- ICD9_PRCDR_CD_5: string (nullable = false)
 |-- ICD9_PRCDR_CD_6: string (nullable = false)
 |-- ADMTNG_ICD9_DGNS_CD: string (nullable = false)
 |-- HCPCS_CD_1: string (nullable = false)
 |-- HCPCS_CD_2: st

In [79]:
# Keep every outpatient claim that carries at least one chemotherapy indicator:
#   * diagnosis code "V5811" in any ICD9_DGNS_CD_* column,
#   * procedure code "9925" in any ICD9_PRCDR_CD_* column,
#   * a CPT code from cpt_lst, or a HCPCS code starting with "J9",
#     in any HCPCS_CD_* column.
# The columns are discovered by prefix instead of being listed by hand, so no
# column can be forgotten (the hand-written version skipped ICD9_PRCDR_CD_6).
# ADMTNG_ICD9_DGNS_CD does not match the diagnosis prefix and stays excluded.
oc_dgns_cols = [c for c in out_OC.columns if c.startswith("ICD9_DGNS_CD_")]
oc_prcdr_cols = [c for c in out_OC.columns if c.startswith("ICD9_PRCDR_CD_")]
oc_hcpcs_cols = [c for c in out_OC.columns if c.startswith("HCPCS_CD_")]

# HCPCS columns are strings, so compare against string codes explicitly.
oc_cpt_codes = [str(code) for code in cpt_lst]

# OR all per-column tests together, starting from a neutral "false".
oc_chemo_cond = F.lit(False)
for col_name in oc_dgns_cols:
    oc_chemo_cond = oc_chemo_cond | (F.col(col_name) == "V5811")
for col_name in oc_prcdr_cols:
    oc_chemo_cond = oc_chemo_cond | (F.col(col_name) == "9925")
for col_name in oc_hcpcs_cols:
    oc_chemo_cond = (
        oc_chemo_cond
        | F.col(col_name).isin(oc_cpt_codes)
        | F.col(col_name).startswith("J9")
    )

OC_che = out_OC.filter(oc_chemo_cond)
 

In [88]:
# Add a binary `chem_class` column: 1 when the claim carries at least one
# chemotherapy indicator, 0 otherwise. The indicators are:
#   * diagnosis code "V5811" in any ICD9_DGNS_CD_* column,
#   * procedure code "9925" in any ICD9_PRCDR_CD_* column,
#   * a CPT code from cpt_lst, or a HCPCS code starting with "J9",
#     in any HCPCS_CD_* column.
# The columns are discovered by prefix instead of being listed by hand, so no
# column can be forgotten (the hand-written version skipped ICD9_PRCDR_CD_6).
chem_dgns_cols = [c for c in out_OC.columns if c.startswith("ICD9_DGNS_CD_")]
chem_prcdr_cols = [c for c in out_OC.columns if c.startswith("ICD9_PRCDR_CD_")]
chem_hcpcs_cols = [c for c in out_OC.columns if c.startswith("HCPCS_CD_")]

# HCPCS columns are strings, so compare against string codes explicitly.
chem_cpt_codes = [str(code) for code in cpt_lst]

# OR all per-column tests together, starting from a neutral "false".
is_chemo_claim = F.lit(False)
for col_name in chem_dgns_cols:
    is_chemo_claim = is_chemo_claim | (F.col(col_name) == "V5811")
for col_name in chem_prcdr_cols:
    is_chemo_claim = is_chemo_claim | (F.col(col_name) == "9925")
for col_name in chem_hcpcs_cols:
    is_chemo_claim = (
        is_chemo_claim
        | F.col(col_name).isin(chem_cpt_codes)
        | F.col(col_name).startswith("J9")
    )

out_OC = out_OC.withColumn("chem_class", F.when(is_chemo_claim, 1).otherwise(0))

In [80]:
# Number of outpatient claims that matched at least one chemotherapy criterion.
OC_che.count()

2302

In [93]:
from pyspark.sql import dataframe
# Count claims per chem_class value. `cube` also emits a grand-total row, shown
# with chem_class = null (16706 = 2302 + 14404 in the recorded output).
out_OC.cube('chem_class').count().show()

+----------+-----+
|chem_class|count|
+----------+-----+
|         1| 2302|
|      null|16706|
|         0|14404|
+----------+-----+



In [63]:
# CPT codes used to flag chemotherapy claims: 96400-96450 plus 96542, 96545,
# 96549, 90780 and 90784.
# The codes are stored as strings because the HCPCS_CD_* columns they are
# matched against are string columns; this avoids relying on an implicit
# int-to-string cast inside `isin`.
# NOTE(review): the markdown notes quote "90780 through 90784", but only the two
# endpoints are listed here -- confirm whether 90781-90783 should be included.
cpt_lst = [
    str(code)
    for code in list(range(96400, 96451)) + [96542, 96545, 96549, 90780, 90784]
]

In [69]:
# Claims whose first HCPCS slot alone holds a listed CPT code or a "J9" drug code.
sample = out_OC.where(
    F.col("HCPCS_CD_1").isin(cpt_lst)
    | F.col("HCPCS_CD_1").startswith("J9")
)

In [70]:
# Collect the HCPCS_CD_1-only sample to the driver as a pandas DataFrame.
s = sample.toPandas()

In [82]:
# Collect the chemotherapy-flagged outpatient claims to the driver as a pandas DataFrame.
pd_OC = OC_che.toPandas()

In [83]:
# Lift the pandas column-display limit so all columns are rendered,
# then preview the first five chemotherapy claims.
pd.set_option("display.max_columns", None)
pd_OC.head()

Unnamed: 0,DESYNPUF_ID,CLM_ID,PRVDR_NUM,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,ICD9_DGNS_CD_8,ICD9_DGNS_CD_9,ICD9_DGNS_CD_10,ICD9_PRCDR_CD_1,ICD9_PRCDR_CD_2,ICD9_PRCDR_CD_3,ICD9_PRCDR_CD_4,ICD9_PRCDR_CD_5,ICD9_PRCDR_CD_6,ADMTNG_ICD9_DGNS_CD,HCPCS_CD_1,HCPCS_CD_2,HCPCS_CD_3,HCPCS_CD_4,HCPCS_CD_5,HCPCS_CD_6,HCPCS_CD_7,HCPCS_CD_8,HCPCS_CD_9,HCPCS_CD_10,HCPCS_CD_11,HCPCS_CD_12,HCPCS_CD_13,HCPCS_CD_14,HCPCS_CD_15,HCPCS_CD_16,HCPCS_CD_17,HCPCS_CD_18,HCPCS_CD_19,HCPCS_CD_20,HCPCS_CD_21,HCPCS_CD_22,HCPCS_CD_23,HCPCS_CD_24,HCPCS_CD_25,HCPCS_CD_26,HCPCS_CD_27,HCPCS_CD_28,HCPCS_CD_29,HCPCS_CD_30,HCPCS_CD_31,HCPCS_CD_32,HCPCS_CD_33,HCPCS_CD_34,HCPCS_CD_35,HCPCS_CD_36,HCPCS_CD_37,HCPCS_CD_38,HCPCS_CD_39,HCPCS_CD_40,HCPCS_CD_41,HCPCS_CD_42,HCPCS_CD_43,HCPCS_CD_44,HCPCS_CD_45,class
0,42CDA84C361BDF03,38722212203867,2301GU,1830,V672,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,72193,0,0,J1572,99285,93971,96415,96375,87040,96413,96375,J9091,96365,0,J7050,J9060,96413,96409,0,0,0,J2469,J1100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,44DA5770803B6370,38602212179202,0503CM,1830,V5811,5939,5119,V5869,7295,0,0,0,0,0,0,0,0,0,0,0,80053,85651,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,486BE10685E7A377,38472212337149,1901AC,V5811,V4571,2853,V5881,1629,20280,1830,78702,28803,1978,0,0,0,0,0,0,0,0,0,85025,96361,J3010,G0393,84100,J7042,72220,70450,86900,84484,82565,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,519B21BB4C4DD253,38552211694642,4300BT,1830,78079,4539,1969,V5811,0,0,0,0,0,0,0,0,0,0,0,0,80053,81001,85610,82607,80053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,55C76CA11E982610,38992211693557,0100TU,1830,496,V5811,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36415,80048,82040,0,J3420,J1441,J3420,85610,99283,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [97]:
# Split the outpatient claims by the chem_class flag.
out_chem = out_OC.where(F.col("chem_class") == 1)
out_nochem = out_OC.where(F.col("chem_class") == 0)

In [98]:
# Row counts of the chemotherapy / non-chemotherapy outpatient subsets.
out_chem.count(),out_nochem.count()

(2302, 14404)

In [99]:
# Target directory under which the outpatient chemotherapy subsets are written
# as parquet datasets.

path = "/Users/jill/Downloads/2020_Project/EDA/concat_data/outpatient/out_chemo"


In [100]:
# Persist the chemotherapy claims as parquet under the target directory.
out_chem.write.format("parquet").save(path + "/out_chem.parquet")

In [101]:
# Persist the non-chemotherapy claims as parquet under the target directory.
out_nochem.write.format("parquet").save(path + "/out_nochem.parquet")

In [38]:
# Merge product service ID

In [11]:
# Reload the outpatient chemotherapy claims from disk.
# The dataset is written to ".../out_chemo/out_chem.parquet" earlier in this
# notebook, so it must be read back from that same location (the path without
# the ".parquet" suffix is never written by this notebook).
out_chem = spark.read.load("/Users/jill/Downloads/2020_Project/EDA/concat_data/outpatient/out_chemo/out_chem.parquet")

In [28]:
# Load the concatenated PDE dataset (presumably Prescription Drug Events -- confirm).
pde = spark.read.load("/Users/jill/Downloads/2020_Project/EDA/concat_data/pde/pde_con/")

In [12]:
# Row counts of the PDE table and of the outpatient chemotherapy claims.
pde.count(),out_chem.count()

(111085969, 2302)

In [106]:
# Sanity check that out_chem is a Spark DataFrame (not a pandas one).
type(out_chem)

pyspark.sql.dataframe.DataFrame

In [37]:
# Keep only the beneficiary ID and the product/service ID from the PDE table.
cols = ["DESYNPUF_ID", "PROD_SRVC_ID"]
pde_med = pde.select(cols)

In [51]:
# Peek at the first five (DESYNPUF_ID, PROD_SRVC_ID) rows.
pde_med.take(5)

[Row(DESYNPUF_ID='A3A6E99E44151E35', PROD_SRVC_ID='53489037407'),
 Row(DESYNPUF_ID='A3A6E99E44151E35', PROD_SRVC_ID='51129321702'),
 Row(DESYNPUF_ID='A3A6E99E44151E35', PROD_SRVC_ID='00093104101'),
 Row(DESYNPUF_ID='A3A6E99E44151E35', PROD_SRVC_ID='00405499501'),
 Row(DESYNPUF_ID='A3A6E99E44151E35', PROD_SRVC_ID='00490713882')]

In [13]:
# Inner-join chemotherapy claims with PDE rows on the beneficiary ID. Every
# claim is paired with every PDE row sharing its DESYNPUF_ID, so the result can
# be much larger than the claim table (126683 rows vs 2302 claims in the recorded run).
spark_merge = out_chem.join(pde_med, on = "DESYNPUF_ID",how='inner')

In [14]:
# Number of (claim, PDE row) pairs produced by the join.
spark_merge.count()

126683

In [16]:
# Persist the joined claims/PDE table as parquet.
spark_merge.write.format("parquet").save(
    "/Users/jill/Downloads/2020_Project/EDA/concat_data/outpatient/out_chemo/spark_merge"
)

In [17]:
# Read the persisted join result back from disk.
df_merge = spark.read.load("/Users/jill/Downloads/2020_Project/EDA/concat_data/outpatient/out_chemo/spark_merge")

In [18]:
# Collect the whole joined table to the driver as a pandas DataFrame.
df = df_merge.toPandas()

In [20]:
# Preview the first rows of the joined pandas frame.
df.head()

Unnamed: 0,DESYNPUF_ID,CLM_ID,PRVDR_NUM,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,ICD9_DGNS_CD_7,...,HCPCS_CD_39,HCPCS_CD_40,HCPCS_CD_41,HCPCS_CD_42,HCPCS_CD_43,HCPCS_CD_44,HCPCS_CD_45,class,chem_class,PROD_SRVC_ID
0,A1F45395E9061019,790802246354606,52008K,V580,1830,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,16590022030
1,A1F45395E9061019,790802246354606,52008K,V580,1830,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,49884064101
2,A1F45395E9061019,790802246354606,52008K,V580,1830,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,58406050101
3,A22F9DCAA9E7112E,790832247019460,1100KR,1830,V5811,71596,7030,V5869,0,0,...,0,0,0,0,0,0,0,1,1,179159070
4,A22F9DCAA9E7112E,790832247019460,1100KR,1830,V5811,71596,7030,V5869,0,0,...,0,0,0,0,0,0,0,1,1,725002610


## 2. Inpatient with chemotherapy

In [7]:
# Load the inpatient feature dataset. No format is passed, so Spark's default
# data source is used.
in_OC = spark.read.load("/Users/jill/Downloads/2020_Project/EDA/concat_data/inpatient/inpatient_oc_feature/")

In [8]:
# (row count, column count) of the inpatient frame.
in_OC.count(), len(in_OC.columns)

(2649, 69)

In [9]:
# Show the column names, types and nullability of the loaded inpatient frame.
in_OC.printSchema()

root
 |-- DESYNPUF_ID: string (nullable = true)
 |-- CLM_ID: long (nullable = true)
 |-- PRVDR_NUM: string (nullable = true)
 |-- CLM_ADMSN_DT: integer (nullable = true)
 |-- NCH_BENE_DSCHRG_DT: integer (nullable = true)
 |-- ADMTNG_ICD9_DGNS_CD: string (nullable = true)
 |-- CLM_DRG_CD: string (nullable = true)
 |-- ICD9_DGNS_CD_1: string (nullable = true)
 |-- ICD9_DGNS_CD_2: string (nullable = true)
 |-- ICD9_DGNS_CD_3: string (nullable = true)
 |-- ICD9_DGNS_CD_4: string (nullable = true)
 |-- ICD9_DGNS_CD_5: string (nullable = true)
 |-- ICD9_DGNS_CD_6: string (nullable = true)
 |-- ICD9_DGNS_CD_7: string (nullable = true)
 |-- ICD9_DGNS_CD_8: string (nullable = true)
 |-- ICD9_DGNS_CD_9: string (nullable = true)
 |-- ICD9_DGNS_CD_10: string (nullable = true)
 |-- ICD9_PRCDR_CD_1: integer (nullable = true)
 |-- ICD9_PRCDR_CD_2: string (nullable = true)
 |-- ICD9_PRCDR_CD_3: string (nullable = true)
 |-- ICD9_PRCDR_CD_4: string (nullable = true)
 |-- ICD9_PRCDR_CD_5: string (nullab

In [10]:
# Replace ICD9_PRCDR_CD_1 (loaded as integer) with its string representation.
in_OC = in_OC.withColumn(
    "ICD9_PRCDR_CD_1",
    F.col("ICD9_PRCDR_CD_1").cast("string"),
)

In [11]:
# Fill nulls with the placeholder "0"; a string fill value only affects string columns.
in_OC = in_OC.na.fill("0")

In [12]:
# Re-inspect the schema after the cast and the null fill.
in_OC.printSchema()

root
 |-- DESYNPUF_ID: string (nullable = false)
 |-- CLM_ID: long (nullable = true)
 |-- PRVDR_NUM: string (nullable = false)
 |-- CLM_ADMSN_DT: integer (nullable = true)
 |-- NCH_BENE_DSCHRG_DT: integer (nullable = true)
 |-- ADMTNG_ICD9_DGNS_CD: string (nullable = false)
 |-- CLM_DRG_CD: string (nullable = false)
 |-- ICD9_DGNS_CD_1: string (nullable = false)
 |-- ICD9_DGNS_CD_2: string (nullable = false)
 |-- ICD9_DGNS_CD_3: string (nullable = false)
 |-- ICD9_DGNS_CD_4: string (nullable = false)
 |-- ICD9_DGNS_CD_5: string (nullable = false)
 |-- ICD9_DGNS_CD_6: string (nullable = false)
 |-- ICD9_DGNS_CD_7: string (nullable = false)
 |-- ICD9_DGNS_CD_8: string (nullable = false)
 |-- ICD9_DGNS_CD_9: string (nullable = false)
 |-- ICD9_DGNS_CD_10: string (nullable = false)
 |-- ICD9_PRCDR_CD_1: string (nullable = false)
 |-- ICD9_PRCDR_CD_2: string (nullable = false)
 |-- ICD9_PRCDR_CD_3: string (nullable = false)
 |-- ICD9_PRCDR_CD_4: string (nullable = false)
 |-- ICD9_PRCDR_CD_

In [69]:
# Add a binary `chem_class` column to the inpatient claims: 1 when the claim
# carries at least one chemotherapy indicator, 0 otherwise. The indicators are:
#   * diagnosis code "V5811" in any ICD9_DGNS_CD_* column,
#   * procedure code "9925" in any ICD9_PRCDR_CD_* column,
#   * a CPT code from cpt_lst, or a HCPCS code starting with "J9",
#     in any HCPCS_CD_* column.
# The columns are discovered by prefix instead of being listed by hand, so no
# column can be forgotten (the hand-written version skipped ICD9_PRCDR_CD_6).
# ADMTNG_ICD9_DGNS_CD does not match the diagnosis prefix and stays excluded.
in_dgns_cols = [c for c in in_OC.columns if c.startswith("ICD9_DGNS_CD_")]
in_prcdr_cols = [c for c in in_OC.columns if c.startswith("ICD9_PRCDR_CD_")]
in_hcpcs_cols = [c for c in in_OC.columns if c.startswith("HCPCS_CD_")]

# HCPCS columns are strings, so compare against string codes explicitly.
in_cpt_codes = [str(code) for code in cpt_lst]

# OR all per-column tests together, starting from a neutral "false".
in_is_chemo_claim = F.lit(False)
for col_name in in_dgns_cols:
    in_is_chemo_claim = in_is_chemo_claim | (F.col(col_name) == "V5811")
for col_name in in_prcdr_cols:
    in_is_chemo_claim = in_is_chemo_claim | (F.col(col_name) == "9925")
for col_name in in_hcpcs_cols:
    in_is_chemo_claim = (
        in_is_chemo_claim
        | F.col(col_name).isin(in_cpt_codes)
        | F.col(col_name).startswith("J9")
    )

in_OC = in_OC.withColumn("chem_class", F.when(in_is_chemo_claim, 1).otherwise(0))

In [95]:
# Count inpatient claims per chem_class value. `cube` also emits a grand-total
# row, shown with chem_class = null (2649 = 107 + 2542 in the recorded output).
in_OC.cube('chem_class').count().show()

+----------+-----+
|chem_class|count|
+----------+-----+
|         1|  107|
|      null| 2649|
|         0| 2542|
+----------+-----+



In [16]:
# Inpatient claims flagged as chemotherapy.
in_chem = in_OC.where(F.col("chem_class") == 1)

In [23]:
# Inpatient claims without any chemotherapy indicator.
in_nochem = in_OC.where(F.col("chem_class") == 0)

In [24]:
# Row counts of the chemotherapy / non-chemotherapy inpatient subsets.
in_chem.count(),in_nochem.count()

(107, 2542)

In [38]:
# Join with PDE table: first peek at the (DESYNPUF_ID, PROD_SRVC_ID) rows that
# the inpatient claims will be joined against.
pde_med.take(3)

[Row(DESYNPUF_ID='A3A6E99E44151E35', PROD_SRVC_ID='53489037407'),
 Row(DESYNPUF_ID='A3A6E99E44151E35', PROD_SRVC_ID='51129321702'),
 Row(DESYNPUF_ID='A3A6E99E44151E35', PROD_SRVC_ID='00093104101')]

In [32]:
# Reload the inpatient chemotherapy claims from disk.
# `in_path` is defined here because, read top to bottom, the notebook only
# assigns it in the later "Write the dataset as parquet files" section; without
# this line the cell raises NameError on a fresh kernel.
# This reload also requires that the "in_chem" parquet dataset has already been
# written by that section.
in_path = "/Users/jill/Downloads/2020_Project/EDA/concat_data/inpatient/inpatient_chemo/"
in_chem = spark.read.load(in_path+"/in_chem")

In [39]:
# First row of the reloaded inpatient chemotherapy claims.
in_chem.head()

Row(DESYNPUF_ID='4810BA54FD314AED', CLM_ID=790421146295283, PRVDR_NUM='0500VS', CLM_ADMSN_DT=20100501, NCH_BENE_DSCHRG_DT=20100503, ADMTNG_ICD9_DGNS_CD='1550', CLM_DRG_CD='837', ICD9_DGNS_CD_1='V5811', ICD9_DGNS_CD_2='53085', ICD9_DGNS_CD_3='1830', ICD9_DGNS_CD_4='79029', ICD9_DGNS_CD_5='30000', ICD9_DGNS_CD_6='0', ICD9_DGNS_CD_7='0', ICD9_DGNS_CD_8='0', ICD9_DGNS_CD_9='0', ICD9_DGNS_CD_10='0', ICD9_PRCDR_CD_1='9925', ICD9_PRCDR_CD_2='0', ICD9_PRCDR_CD_3='0', ICD9_PRCDR_CD_4='0', ICD9_PRCDR_CD_5='0', ICD9_PRCDR_CD_6='0', HCPCS_CD_1='0', HCPCS_CD_2='0', HCPCS_CD_3='0', HCPCS_CD_4='0', HCPCS_CD_5='0', HCPCS_CD_6='0', HCPCS_CD_7='0', HCPCS_CD_8='0', HCPCS_CD_9='0', HCPCS_CD_10='0', HCPCS_CD_11='0', HCPCS_CD_12='0', HCPCS_CD_13='0', HCPCS_CD_14='0', HCPCS_CD_15='0', HCPCS_CD_16='0', HCPCS_CD_17='0', HCPCS_CD_18='0', HCPCS_CD_19='0', HCPCS_CD_20='0', HCPCS_CD_21='0', HCPCS_CD_22='0', HCPCS_CD_23='0', HCPCS_CD_24='0', HCPCS_CD_25='0', HCPCS_CD_26='0', HCPCS_CD_27='0', HCPCS_CD_28='0', HCPCS_

In [40]:
# Inner-join inpatient chemotherapy claims with PDE rows on the beneficiary ID;
# every claim is paired with every PDE row sharing its DESYNPUF_ID.
spark_in_merge = in_chem.join(pde_med, on = "DESYNPUF_ID", how='inner')

In [41]:
# First row of the joined inpatient/PDE table.
spark_in_merge.head()

Row(DESYNPUF_ID='C49DE573497A0F0A', CLM_ID=45721150063072, PRVDR_NUM='3000GA', CLM_ADMSN_DT=20090527, NCH_BENE_DSCHRG_DT=20090531, ADMTNG_ICD9_DGNS_CD='V5811', CLM_DRG_CD='838', ICD9_DGNS_CD_1='V5811', ICD9_DGNS_CD_2='1461', ICD9_DGNS_CD_3='53081', ICD9_DGNS_CD_4='6954', ICD9_DGNS_CD_5='1966', ICD9_DGNS_CD_6='20410', ICD9_DGNS_CD_7='1830', ICD9_DGNS_CD_8='2720', ICD9_DGNS_CD_9='5715', ICD9_DGNS_CD_10='0', ICD9_PRCDR_CD_1='9925', ICD9_PRCDR_CD_2='1533', ICD9_PRCDR_CD_3='0', ICD9_PRCDR_CD_4='0', ICD9_PRCDR_CD_5='0', ICD9_PRCDR_CD_6='0', HCPCS_CD_1='0', HCPCS_CD_2='0', HCPCS_CD_3='0', HCPCS_CD_4='0', HCPCS_CD_5='0', HCPCS_CD_6='0', HCPCS_CD_7='0', HCPCS_CD_8='0', HCPCS_CD_9='0', HCPCS_CD_10='0', HCPCS_CD_11='0', HCPCS_CD_12='0', HCPCS_CD_13='0', HCPCS_CD_14='0', HCPCS_CD_15='0', HCPCS_CD_16='0', HCPCS_CD_17='0', HCPCS_CD_18='0', HCPCS_CD_19='0', HCPCS_CD_20='0', HCPCS_CD_21='0', HCPCS_CD_22='0', HCPCS_CD_23='0', HCPCS_CD_24='0', HCPCS_CD_25='0', HCPCS_CD_26='0', HCPCS_CD_27='0', HCPCS_CD_

In [42]:
# Number of (claim, PDE row) pairs produced by the inpatient join.
spark_in_merge.count()

6649

### Write the dataset as parquet files

In [25]:
# Target directory for the inpatient chemotherapy parquet outputs.
# Note the trailing "/": the cells that use it append "/<name>", giving a "//" in the final path.
in_path = "/Users/jill/Downloads/2020_Project/EDA/concat_data/inpatient/inpatient_chemo/"

In [26]:
# Persist the inpatient chemotherapy claims as parquet.
in_chem.write.format("parquet").save(in_path + "/in_chem")

In [27]:
# Persist the inpatient non-chemotherapy claims as parquet.
in_nochem.write.format("parquet").save(in_path + "/in_nochem")

In [43]:
# Persist the inpatient claims joined with PDE as parquet.
spark_in_merge.write.format("parquet").save(in_path + "/in_merge_pde")

## 3. Carrier with chemotherapy

In [99]:
# Load the carrier feature dataset. No format is passed, so Spark's default
# data source is used.
carrier_OC = spark.read.load("/Users/jill/Downloads/2020_Project/EDA/concat_data/carrier/carrier_oc_feature/")

In [100]:
# (row count, column count) of the carrier frame.
carrier_OC.count(), len(carrier_OC.columns)

(32239, 23)

In [101]:
# Show the column names, types and nullability of the loaded carrier frame.
carrier_OC.printSchema()

root
 |-- DESYNPUF_ID: string (nullable = true)
 |-- CLM_ID: long (nullable = true)
 |-- ICD9_DGNS_CD_1: string (nullable = true)
 |-- ICD9_DGNS_CD_2: string (nullable = true)
 |-- ICD9_DGNS_CD_3: string (nullable = true)
 |-- ICD9_DGNS_CD_4: string (nullable = true)
 |-- ICD9_DGNS_CD_5: string (nullable = true)
 |-- ICD9_DGNS_CD_6: string (nullable = true)
 |-- ICD9_DGNS_CD_7: string (nullable = true)
 |-- ICD9_DGNS_CD_8: string (nullable = true)
 |-- HCPCS_CD_1: string (nullable = true)
 |-- HCPCS_CD_2: string (nullable = true)
 |-- HCPCS_CD_3: string (nullable = true)
 |-- HCPCS_CD_4: string (nullable = true)
 |-- HCPCS_CD_5: string (nullable = true)
 |-- HCPCS_CD_6: string (nullable = true)
 |-- HCPCS_CD_7: string (nullable = true)
 |-- HCPCS_CD_8: string (nullable = true)
 |-- HCPCS_CD_9: string (nullable = true)
 |-- HCPCS_CD_10: string (nullable = true)
 |-- HCPCS_CD_11: string (nullable = true)
 |-- HCPCS_CD_12: string (nullable = true)
 |-- HCPCS_CD_13: string (nullable = true

In [102]:
# Replace nulls with the placeholder "0". Because the fill value is a string,
# only string-typed columns are affected; CLM_ID (long) is left as is.
carrier_OC = carrier_OC.na.fill("0")

In [103]:
# Re-check the schema after the fill: the string columns should now report nullable = false.
carrier_OC.printSchema()

root
 |-- DESYNPUF_ID: string (nullable = false)
 |-- CLM_ID: long (nullable = true)
 |-- ICD9_DGNS_CD_1: string (nullable = false)
 |-- ICD9_DGNS_CD_2: string (nullable = false)
 |-- ICD9_DGNS_CD_3: string (nullable = false)
 |-- ICD9_DGNS_CD_4: string (nullable = false)
 |-- ICD9_DGNS_CD_5: string (nullable = false)
 |-- ICD9_DGNS_CD_6: string (nullable = false)
 |-- ICD9_DGNS_CD_7: string (nullable = false)
 |-- ICD9_DGNS_CD_8: string (nullable = false)
 |-- HCPCS_CD_1: string (nullable = false)
 |-- HCPCS_CD_2: string (nullable = false)
 |-- HCPCS_CD_3: string (nullable = false)
 |-- HCPCS_CD_4: string (nullable = false)
 |-- HCPCS_CD_5: string (nullable = false)
 |-- HCPCS_CD_6: string (nullable = false)
 |-- HCPCS_CD_7: string (nullable = false)
 |-- HCPCS_CD_8: string (nullable = false)
 |-- HCPCS_CD_9: string (nullable = false)
 |-- HCPCS_CD_10: string (nullable = false)
 |-- HCPCS_CD_11: string (nullable = false)
 |-- HCPCS_CD_12: string (nullable = false)
 |-- HCPCS_CD_13: st

In [104]:
# Cast every CPT code to str so it can be compared with the string-typed HCPCS_CD_* columns.
cpt_lst = [str(code) for code in cpt_lst]

In [105]:
# Show the stringified CPT codes held in cpt_lst, the list the chem_class rule matches against.
# (Previously this printed cpt_str, a name that is not assigned anywhere in this section.)
print(cpt_lst)

['96400', '96401', '96402', '96403', '96404', '96405', '96406', '96407', '96408', '96409', '96410', '96411', '96412', '96413', '96414', '96415', '96416', '96417', '96418', '96419', '96420', '96421', '96422', '96423', '96424', '96425', '96426', '96427', '96428', '96429', '96430', '96431', '96432', '96433', '96434', '96435', '96436', '96437', '96438', '96439', '96440', '96441', '96442', '96443', '96444', '96445', '96446', '96447', '96448', '96449', '96450', '96542', '96545', '96549', '90780', '90784']


In [106]:
def flag_chemo_claims(claims, chemo_cpt_codes, n_dgns=8, n_hcpcs=13):
    """Return `claims` with an integer `chem_class` column (1 = chemotherapy, 0 = not).

    A claim is labelled 1 when any of the following holds:
      * one of ICD9_DGNS_CD_1..n_dgns equals "V5811";
      * one of HCPCS_CD_1..n_hcpcs is contained in `chemo_cpt_codes`;
      * one of HCPCS_CD_1..n_hcpcs starts with "J9".
    Every other claim is labelled 0.

    Parameters
    ----------
    claims : pyspark.sql.DataFrame
        Frame holding the ICD9_DGNS_CD_* and HCPCS_CD_* string columns.
    chemo_cpt_codes : list of str
        CPT codes (as strings) that count as chemotherapy administration.
    n_dgns, n_hcpcs : int
        Number of diagnosis / HCPCS columns to inspect.
    """
    predicates = [
        claims["ICD9_DGNS_CD_{}".format(i)] == "V5811"
        for i in range(1, n_dgns + 1)
    ]
    for i in range(1, n_hcpcs + 1):
        hcpcs = claims["HCPCS_CD_{}".format(i)]
        predicates.append(hcpcs.isin(chemo_cpt_codes) | hcpcs.startswith("J9"))

    # OR all per-column predicates together, in the same left-to-right order
    # as the hand-written expression this replaces.
    is_chemo = predicates[0]
    for predicate in predicates[1:]:
        is_chemo = is_chemo | predicate

    return claims.withColumn("chem_class", F.when(is_chemo, 1).otherwise(0))


carrier_OC = flag_chemo_claims(carrier_OC, cpt_lst)

In [107]:
# Tabulate chem_class. cube() on a single column also emits a grand-total row,
# which show() prints with chem_class = null; that row is the overall count, not a missing label.
carrier_OC.cube('chem_class').count().show()

+----------+-----+
|chem_class|count|
+----------+-----+
|         1| 6524|
|      null|32239|
|         0|25715|
+----------+-----+



In [108]:
# Claims labelled as chemotherapy (chem_class == 1).
carrier_chem = carrier_OC.where(carrier_OC["chem_class"] == 1)

In [109]:
# Claims labelled as non-chemotherapy (chem_class == 0).
carrier_nochem = carrier_OC.where(carrier_OC["chem_class"] == 0)

In [110]:
# Row counts of the two subsets as (chemo, non-chemo).
(carrier_chem.count(), carrier_nochem.count())

(6524, 25715)

In [118]:
# Reload the chemotherapy claims from the directory under carrier_path, replacing the
# in-memory carrier_chem.
# NOTE(review): carrier_path is assigned further down (cell In [114]) and this directory
# is written by cell In [115]; in top-to-bottom order this cell runs before both, so a
# fresh "Run All" fails here. Move this cell below the write cells.
carrier_chem = spark.read.load(carrier_path+"/carrier_chem/")

In [119]:
# Inner-join the chemotherapy claims to pde_med on the beneficiary key DESYNPUF_ID.
# Rows without a match on either side are dropped.
carrier_merge_pde = carrier_chem.join(pde_med, "DESYNPUF_ID", "inner")

In [120]:
# Number of rows produced by the join.
carrier_merge_pde.count()

321472

### Write the data as parquet files

In [114]:
# Base directory for the carrier chemotherapy parquet outputs.
# NOTE(review): this is an absolute, machine-specific path. It also ends with "/",
# while its users append "/<name>", so the final paths contain "//".
# NOTE(review): an earlier cell (the carrier_chem reload) already reads carrier_path;
# this assignment needs to sit above it for a top-to-bottom run.
carrier_path = "/Users/jill/Downloads/2020_Project/EDA/concat_data/carrier/carrier_chemo/"

In [115]:
# Persist carrier_chem as parquet under carrier_path.
# No save mode is set, so Spark's default applies (the write fails if the target already exists).
carrier_chem.write.format('parquet').save(carrier_path + "/carrier_chem")

In [116]:
# Persist carrier_nochem as parquet under carrier_path.
# No save mode is set, so Spark's default applies (the write fails if the target already exists).
carrier_nochem.write.format('parquet').save(carrier_path + "/carrier_nochem/")

In [121]:
# Persist the claims/PDE join result as parquet under carrier_path.
# No save mode is set, so Spark's default applies (the write fails if the target already exists).
carrier_merge_pde.write.format('parquet').save(carrier_path + "/carrier_pde_merge/")