In [1]:
import findspark
# Initialise findspark with the local Spark installation directory.
# The path is a raw string because it contains backslashes: `\s` is not a
# valid escape sequence, and newer Python versions warn about it in a plain
# string literal. The resulting path value is unchanged.
findspark.init(r'D:\spark\spark-3.3.2-bin-hadoop2')

In [2]:
import pyspark
# Display the installed PySpark version (the recorded output below shows 3.3.2).
pyspark.__version__

'3.3.2'

In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

In [4]:
# MongoDB connection URI for the source database "Insurance".
# Despite the name, this holds a full URI rather than an IP address; the read
# cell appends ".<collection>" to it to address a specific collection.
mongo_ip = 'mongodb://localhost:27017/Insurance'

In [5]:
# Create (or reuse) the Spark session for this notebook. The MongoDB Spark
# connector is requested through the spark.jars.packages setting.
spark = (
    SparkSession.builder
    .appName("myApp1")
    .config(
        "spark.jars.packages",
        "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1",
    )
    .getOrCreate()
)

In [6]:
# Load the "Outpatient" collection of the source database into a DataFrame.
outp = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", mongo_ip + ".Outpatient")
    .load()
)

In [7]:
# Names of the columns removed from the raw Outpatient data.
# The numbered families are generated instead of spelled out; sorted() on the
# names reproduces the lexicographic order of the original list
# (..._1, ..._10, ..., ..._19, ..._2, ...).
drop_col = [
    'NCH_BENE_PTB_COINSRNC_AMT', '_id', 'OP_PHYSN_NPI', 'OT_PHYSN_NPI', 'SEGMENT', 'AT_PHYSN_NPI',
    *sorted(f'HCPCS_CD_{i}' for i in range(1, 46)),
    *sorted(f'ICD9_DGNS_CD_{i}' for i in range(1, 11)),
    *(f'ICD9_PRCDR_CD_{i}' for i in range(1, 7)),
]

In [8]:
# Remove every column listed in drop_col; the list is unpacked because
# the names are passed to drop() as separate arguments.
outp = outp.drop(*drop_col)

In [9]:
# Replace the abbreviated source column names with descriptive ones.
# Each source column is renamed exactly once (CLM_PMT_AMT was previously
# listed twice; the second call had nothing left to rename).
outp = (
    outp
    .withColumnRenamed("PRVDR_NUM", "Provider_Number")
    .withColumnRenamed("CLM_PMT_AMT", "Claim_Payment_Amount")
    .withColumnRenamed("NCH_PRMRY_PYR_CLM_PD_AMT", "Primary_Payer_Claim_Paid_Amount")
    .withColumnRenamed("NCH_BENE_BLOOD_DDCTBL_LBLTY_AM", "Beneficiary_Blood_Deductible_Liability_Amount")
    .withColumnRenamed("NCH_BENE_PTB_DDCTBL_AMT", "Beneficiary_PartB_Deductible_Amount")
    .withColumnRenamed("ADMTNG_ICD9_DGNS_CD", "Claim_Admitting_Diagnosis_Code")
    .withColumnRenamed("CLM_FROM_DT", "Claim_Start_Date")
    .withColumnRenamed("CLM_THRU_DT", "Claim_End_Date")
)

In [10]:
# Cast the four amount columns to int, one column at a time.
for amount_col in (
    "Claim_Payment_Amount",
    "Beneficiary_Blood_Deductible_Liability_Amount",
    "Beneficiary_PartB_Deductible_Amount",
    "Primary_Payer_Claim_Paid_Amount",
):
    outp = outp.withColumn(amount_col, outp[amount_col].cast('int'))

In [11]:
from pyspark.sql.functions import when

# Turn empty-string start dates into nulls; every other value is kept as is.
outp = outp.withColumn(
    "Claim_Start_Date",
    when(outp["Claim_Start_Date"] == '', None)
    .otherwise(outp["Claim_Start_Date"]),
)

In [12]:
from pyspark.sql.functions import col,isnan,when,count
# Count the null values in each column and print them as a one-row table.
outp.select(
    [
        count(when(col(name).isNull(), name)).alias(name)
        for name in outp.columns
    ]
).show()

+------------------------------+----------------+------+--------------------+--------------+-----------+---------------------------------------------+-----------------------------------+-------------------------------+---------------+
|Claim_Admitting_Diagnosis_Code|Claim_Start_Date|CLM_ID|Claim_Payment_Amount|Claim_End_Date|DESYNPUF_ID|Beneficiary_Blood_Deductible_Liability_Amount|Beneficiary_PartB_Deductible_Amount|Primary_Payer_Claim_Paid_Amount|Provider_Number|
+------------------------------+----------------+------+--------------------+--------------+-----------+---------------------------------------------+-----------------------------------+-------------------------------+---------------+
|                        596304|           33628|     0|                   0|         11253|          0|                                            0|                                  0|                              0|              0|
+------------------------------+----------------+------+----

In [13]:
# Drop every row that has a null in at least one column.
# NOTE(review): the null counts printed above report 596304 nulls in
# Claim_Admitting_Diagnosis_Code alone, so how="any" discards all of those
# rows regardless of their other values — confirm this loss is intended.
outp = outp.dropna(how="any")

In [14]:
from pyspark.sql.functions import col,isnan,when,count
# Repeat the per-column null count to check the result of the dropna step.
outp.select(
    [
        count(when(col(name).isNull(), name)).alias(name)
        for name in outp.columns
    ]
).show()

+------------------------------+----------------+------+--------------------+--------------+-----------+---------------------------------------------+-----------------------------------+-------------------------------+---------------+
|Claim_Admitting_Diagnosis_Code|Claim_Start_Date|CLM_ID|Claim_Payment_Amount|Claim_End_Date|DESYNPUF_ID|Beneficiary_Blood_Deductible_Liability_Amount|Beneficiary_PartB_Deductible_Amount|Primary_Payer_Claim_Paid_Amount|Provider_Number|
+------------------------------+----------------+------+--------------------+--------------+-----------+---------------------------------------------+-----------------------------------+-------------------------------+---------------+
|                             0|               0|     0|                   0|             0|          0|                                            0|                                  0|                              0|              0|
+------------------------------+----------------+------+----

In [15]:
# List the column names of the current DataFrame.
outp.columns

['Claim_Admitting_Diagnosis_Code',
 'Claim_Start_Date',
 'CLM_ID',
 'Claim_Payment_Amount',
 'Claim_End_Date',
 'DESYNPUF_ID',
 'Beneficiary_Blood_Deductible_Liability_Amount',
 'Beneficiary_PartB_Deductible_Amount',
 'Primary_Payer_Claim_Paid_Amount',
 'Provider_Number']

In [16]:
from datetime import datetime
from pyspark.sql.functions import col,udf
from pyspark.sql.types import DateType
from pyspark.sql.functions import date_format

In [17]:
# Parse the yyyyMMdd start-date strings with Spark's built-in to_date rather
# than a Python UDF around datetime.strptime: the built-in runs inside the
# JVM, and under Spark's default (non-ANSI) settings a value that does not
# match the pattern is expected to become null instead of raising in a Python
# worker — verify against the data if malformed dates matter.
# The column is still emitted as a 'yyyy-MM-dd' string, exactly as before, so
# the following to_date cell keeps working unchanged.
from pyspark.sql.functions import to_date

outp = outp.withColumn(
    'Claim_Start_Date',
    date_format(to_date(col('Claim_Start_Date'), 'yyyyMMdd'), 'yyyy-MM-dd'),
)

In [18]:
from pyspark.sql.functions import to_date
# Convert the 'yyyy-MM-dd' start-date strings into a date column.
outp = outp.withColumn(
    "Claim_Start_Date",
    to_date(col("Claim_Start_Date"), "yyyy-MM-dd"),
)

In [19]:
# Parse the yyyyMMdd end-date strings with Spark's built-in to_date rather
# than a Python UDF around datetime.strptime: the built-in runs inside the
# JVM, and under Spark's default (non-ANSI) settings a value that does not
# match the pattern is expected to become null instead of raising in a Python
# worker — verify against the data if malformed dates matter.
# The column is still emitted as a 'yyyy-MM-dd' string, exactly as before, so
# the following to_date cell keeps working unchanged.
from pyspark.sql.functions import to_date

outp = outp.withColumn(
    'Claim_End_Date',
    date_format(to_date(col('Claim_End_Date'), 'yyyyMMdd'), 'yyyy-MM-dd'),
)

In [20]:
# Convert the 'yyyy-MM-dd' end-date strings into a date column.
outp = outp.withColumn(
    "Claim_End_Date",
    to_date(col("Claim_End_Date"), "yyyy-MM-dd"),
)

In [21]:
# Inspect the column types; the recorded output shows both claim date columns as 'date'.
outp.dtypes

[('Claim_Admitting_Diagnosis_Code', 'string'),
 ('Claim_Start_Date', 'date'),
 ('CLM_ID', 'string'),
 ('Claim_Payment_Amount', 'int'),
 ('Claim_End_Date', 'date'),
 ('DESYNPUF_ID', 'string'),
 ('Beneficiary_Blood_Deductible_Liability_Amount', 'int'),
 ('Beneficiary_PartB_Deductible_Amount', 'int'),
 ('Primary_Payer_Claim_Paid_Amount', 'int'),
 ('Provider_Number', 'string')]

In [22]:
from pyspark.sql.functions import year
# Add a Year column taken from the claim end date.
outp = outp.withColumn(
    'Year',
    year(outp['Claim_End_Date']),
)

In [23]:
from pyspark.sql.functions import month
# Add a Month column taken from the claim end date.
outp = outp.withColumn(
    'Month',
    month(outp['Claim_End_Date']),
)

In [24]:
# Inspect the column types again; the recorded output shows the new Year and Month columns as 'int'.
outp.dtypes

[('Claim_Admitting_Diagnosis_Code', 'string'),
 ('Claim_Start_Date', 'date'),
 ('CLM_ID', 'string'),
 ('Claim_Payment_Amount', 'int'),
 ('Claim_End_Date', 'date'),
 ('DESYNPUF_ID', 'string'),
 ('Beneficiary_Blood_Deductible_Liability_Amount', 'int'),
 ('Beneficiary_PartB_Deductible_Amount', 'int'),
 ('Primary_Payer_Claim_Paid_Amount', 'int'),
 ('Provider_Number', 'string'),
 ('Year', 'int'),
 ('Month', 'int')]

In [25]:
# Remove the two claim date columns from the DataFrame.
vis = [
    'Claim_Start_Date',
    'Claim_End_Date',
]
outp = outp.drop(*vis)

In [None]:
# Write the cleaned DataFrame to the Outpatient collection of the
# Cleaned_Insurance database, using overwrite mode.
(
    outp.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode("overwrite")
    .option(
        "spark.mongodb.output.uri",
        "mongodb://localhost:27017/Cleaned_Insurance.Outpatient",
    )
    .save()
)