In [1]:
import findspark
# Point findspark at the local Spark installation. A raw string is used so the
# backslashes in the Windows path are taken literally instead of being parsed
# as (invalid) escape sequences such as `\s`.
findspark.init(r'E:\spark\spark-3.3.1-bin-hadoop2/')
import pyspark
import seaborn as sns
import pandas as pd

from pyspark.sql import SparkSession
from pyspark import SparkContext

# MongoDB connection URI ending in the database name; the load cells append
# ".<collection>" to it to address a specific collection.
mongo_ip = 'mongodb://localhost:27017/Cleaned_Insurance'

# Create (or reuse) the Spark session, requesting the MongoDB Spark connector
# package through `spark.jars.packages`.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config(
        "spark.jars.packages",
        "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1",
    )
    .getOrCreate()
)

In [2]:
# Load the Inpatient collection through the MongoDB connector and register it
# as the temp view `inp` so it can be queried with Spark SQL.
inp = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", mongo_ip + ".Inpatient")
    .load()
)
inp.createOrReplaceTempView('inp')

In [3]:
# Load the Outpatient collection through the MongoDB connector and register it
# as the temp view `out` so it can be queried with Spark SQL.
out = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", mongo_ip + ".Outpatient")
    .load()
)
out.createOrReplaceTempView('out')

In [4]:
# Display the column names of the Inpatient DataFrame.
inp.columns

['Beneficiary_Blood_Deductible_Liability_Amount',
 'Beneficiary_Inpatient_Deductible_Amount',
 'Beneficiary_PartA_Coinsurance_Liability_Amount',
 'Claim_Admission_Date',
 'Claim_Admitting_Diagnosis_Code',
 'Claim_End_Date',
 'Claim_ID',
 'Claim_Pass_Thru_Per_Diem_Amount',
 'Claim_Payment_Amount',
 'Claim_Start_Date',
 'Claim_Utilization_Day_Count',
 'DESYNPUF_ID',
 'Inpatient_discharged_date',
 'Primary_Payer_Claim_Paid_Amount',
 'Provider_Number',
 '_id']

## Top 10 beneficiaries (DESYNPUF_ID) by total claim payment amount

In [8]:
# Inpatient: sum Claim_Payment_Amount per beneficiary and keep the 10 largest
# totals, then export the result as CSV.
a = spark.sql(
    "select DESYNPUF_ID,sum(Claim_Payment_Amount) as Total_amount_claimed "
    "from inp "
    "group by DESYNPUF_ID "
    "order by Total_amount_claimed desc "
    "limit 10"
)
a_p = a.toPandas()
a_p.to_csv(
    'C:/Users/ASUS/Desktop/Insurance/Queries/Inpatient/Top_Max_Claims_By_BenefID.csv',
    index=False,
)

In [14]:
# Outpatient: sum Claim_Payment_Amount per beneficiary and keep the 10 largest
# totals, then export the result as CSV.
a = spark.sql(
    "select DESYNPUF_ID,sum(Claim_Payment_Amount) as Total_amount_claimed "
    "from out "
    "group by DESYNPUF_ID "
    "order by Total_amount_claimed desc "
    "limit 10"
)
a_p = a.toPandas()
a_p.to_csv(
    'C:/Users/ASUS/Desktop/Insurance/Queries/Outpatient/T_Top_Max_Claims_By_BenefID.csv',
    index=False,
)

## NUMBER OF CLAIMS IN EACH YEAR

In [19]:
from pyspark.sql.functions import year
from pyspark.sql.functions import to_date
 
# Derive a `Year` column from Claim_Start_Date, then re-register the `inp`
# view so that SQL queries can see the new column.
inp = inp.withColumn('Year', year(inp['Claim_Start_Date']))
inp.createOrReplaceTempView('inp')

In [23]:
# Inpatient: number of claims (non-null Claim_ID values) per year.
# `order by Year` makes the exported row order deterministic; a bare
# `group by` returns the groups in arbitrary order.
a = spark.sql(
    "select Year,count(Claim_ID) "
    "from inp "
    "group by Year "
    "order by Year"
)
a_p = a.toPandas()
a_p.to_csv(
    'C:/Users/ASUS/Desktop/Insurance/Queries/Inpatient/NUMBER_OF_CLAIMS_IN_EACH_YEAR.csv',
    index=False,
)

In [27]:
# Outpatient: number of claims (non-null CLM_ID values) per year.
# `order by Year` makes the exported row order deterministic; a bare
# `group by` returns the groups in arbitrary order.
# NOTE(review): no visible cell adds a `Year` column to `out` (only `inp` gets
# one), so this relies on the Outpatient collection already having it, or on a
# cell that is no longer in the notebook - confirm on a fresh kernel.
# NOTE(review): the inpatient query counts `Claim_ID`; verify that `CLM_ID` is
# really the outpatient column name.
a = spark.sql(
    "select Year,count(CLM_ID) "
    "from out "
    "group by Year "
    "order by Year"
)
a_p = a.toPandas()
a_p.to_csv(
    'C:/Users/ASUS/Desktop/Insurance/Queries/Outpatient/NUMBER_OF_CLAIMS_IN_EACH_YEAR.csv',
    index=False,
)

## Average Claim Amount

In [29]:
# Inpatient: average Claim_Payment_Amount over all claims, exported as a
# one-row CSV.
a = spark.sql(
    "select avg(Claim_Payment_Amount) as Average_Claim "
    "from inp"
)
a_p = a.toPandas()
a_p.to_csv(
    'C:/Users/ASUS/Desktop/Insurance/Queries/Inpatient/Average_Claim_Amount.csv',
    index=False,
)

In [30]:
# Outpatient: average Claim_Payment_Amount over all claims, exported as a
# one-row CSV.
a = spark.sql(
    "select avg(Claim_Payment_Amount) as Average_Claim "
    "from out"
)
a_p = a.toPandas()
a_p.to_csv(
    'C:/Users/ASUS/Desktop/Insurance/Queries/Outpatient/Average_Claim_Amount.csv',
    index=False,
)

## Top 5 providers by total claim payment amount

In [33]:
# Inpatient: sum Claim_Payment_Amount per provider and keep the 5 largest
# totals, then export the result as CSV.
a = spark.sql(
    "select Provider_Number,sum(Claim_Payment_Amount) as Total_Claim_Amount "
    "from inp "
    "group by Provider_Number "
    "order by Total_Claim_Amount desc "
    "limit 5"
)
a_p = a.toPandas()
a_p.to_csv(
    'C:/Users/ASUS/Desktop/Insurance/Queries/Inpatient/TOP_5_Providers_With_Maximum_Claims.csv',
    index=False,
)

In [34]:
# Outpatient: sum Claim_Payment_Amount per provider and keep the 5 largest
# totals, then export the result as CSV.
a = spark.sql(
    "select Provider_Number,sum(Claim_Payment_Amount) as Total_Claim_Amount "
    "from out "
    "group by Provider_Number "
    "order by Total_Claim_Amount desc "
    "limit 5"
)
a_p = a.toPandas()
a_p.to_csv(
    'C:/Users/ASUS/Desktop/Insurance/Queries/Outpatient/TOP_5_Providers_With_Maximum_Claims.csv',
    index=False,
)

## Total number of inpatient and outpatient beneficiaries who made claims

In [42]:
# Count the people (distinct DESYNPUF_ID values) with at least one inpatient
# claim and, via an uncorrelated scalar subquery, with at least one outpatient
# claim. `distinct` is needed because a beneficiary can have several claim
# rows; a plain count(DESYNPUF_ID) would count claims rather than people.
a = spark.sql(
    "select count(distinct DESYNPUF_ID) as Inp_Total,"
    "(select count(distinct DESYNPUF_ID) as Out_Total from out) as Out_Total "
    "from inp"
)

In [44]:
# Convert the Spark result `a` into a pandas DataFrame for export.
a_p = a.toPandas()

In [45]:
# Write the combined inpatient/outpatient counts to CSV without the pandas index.
a_p.to_csv(
    'C:/Users/ASUS/Desktop/Insurance/Queries/Combined/Count_of_people_taken_claims_IN_OUT.csv',
    index=False,
)

## Year-wise total claims (inpatient and outpatient)

In [49]:
# Inpatient: per-year count of claims that have a non-null
# Claim_Payment_Amount. `order by Year` makes the exported row order
# deterministic; a bare `group by` returns the groups in arbitrary order.
a = spark.sql(
    "select Year,count(Claim_Payment_Amount) as Total_Claims "
    "from inp "
    "group by Year "
    "order by Year"
)
a_p = a.toPandas()
a_p.to_csv(
    'C:/Users/ASUS/Desktop/Insurance/Queries/Inpatient/Year_wise_total_claims.csv',
    index=False,
)

In [50]:
# Outpatient: per-year count of claims that have a non-null
# Claim_Payment_Amount. `order by Year` makes the exported row order
# deterministic; a bare `group by` returns the groups in arbitrary order.
# NOTE(review): no visible cell adds a `Year` column to `out` (only `inp` gets
# one), so this relies on the Outpatient collection already having it, or on a
# cell that is no longer in the notebook - confirm on a fresh kernel.
a = spark.sql(
    "select Year,count(Claim_Payment_Amount) as Total_Claims "
    "from out "
    "group by Year "
    "order by Year"
)
a_p = a.toPandas()
a_p.to_csv(
    'C:/Users/ASUS/Desktop/Insurance/Queries/Outpatient/Year_wise_total_claims.csv',
    index=False,
)