In [0]:
# Load dataset from Databricks
claims_df = spark.read.table("workspace.default.insurance_claim_data")

# Display first few rows
display(claims_df)

# Print schema
claims_df.printSchema()

# Register as temporary SQL view for SQL-based queries
claims_df.createOrReplaceTempView("insurance_claims")


Claim_ID,Policy_ID,Age,Gender,Region,Claim_Date,Claim_Amount,Claim_Status,Claim_Type
C00001,P00001,56,Male,Central,2024-02-14,162423.72,Approved,Health
C00002,P00002,48,Male,North,2023-06-26,174379.8,Pending,Health
C00003,P00003,41,Female,Central,2024-03-01,56381.97,Approved,Auto
C00004,P00004,64,Female,North,2023-05-22,143472.46,Rejected,Life
C00005,P00005,63,Female,South,2023-01-07,117065.7,Rejected,Property
C00006,P00006,22,Female,South,2023-10-14,94511.86,Pending,Auto
C00007,P00007,73,Male,Central,2023-12-03,12430.03,Approved,Life
C00008,P00008,40,Male,Central,2023-10-31,40829.5,Approved,Property
C00009,P00009,50,Male,South,2024-06-02,91400.42,Approved,Health
C00010,P00010,46,Female,Central,2023-05-05,49767.15,Approved,Life


root
 |-- Claim_ID: string (nullable = true)
 |-- Policy_ID: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Claim_Date: date (nullable = true)
 |-- Claim_Amount: double (nullable = true)
 |-- Claim_Status: string (nullable = true)
 |-- Claim_Type: string (nullable = true)



In [0]:
print(f"Total Records: {claims_df.count()}")
print(f"Total Columns: {len(claims_df.columns)}")


Total Records: 5000
Total Columns: 9


In [0]:
from pyspark.sql.functions import col, sum

null_counts = claims_df.select([
    sum(col(c).isNull().cast("int")).alias(c) for c in claims_df.columns
])
display(null_counts)


Claim_ID,Policy_ID,Age,Gender,Region,Claim_Date,Claim_Amount,Claim_Status,Claim_Type
0,0,0,0,0,0,0,0,0


In [0]:
from pyspark.sql import functions as F

status_dist = claims_df.groupBy("Claim_Status").agg(
    F.count("*").alias("Claim_Count")
)
display(status_dist)


Claim_Status,Claim_Count
Approved,3512
Pending,517
Rejected,971


Databricks visualization. Run in Databricks to view.

In [0]:
amount_stats = claims_df.groupBy("Claim_Status").agg(
    F.round(F.avg("Claim_Amount"), 2).alias("Avg_Amount"),
    F.round(F.max("Claim_Amount"), 2).alias("Max_Amount"),
    F.round(F.min("Claim_Amount"), 2).alias("Min_Amount"),
    F.round(F.stddev("Claim_Amount"), 2).alias("Std_Amount")
)
display(amount_stats)


Claim_Status,Avg_Amount,Max_Amount,Min_Amount,Std_Amount
Approved,102377.03,199856.0,5035.44,57069.49
Pending,104791.3,199691.17,5075.76,55274.23
Rejected,104057.76,199900.93,5068.33,55469.54


Databricks visualization. Run in Databricks to view.

In [0]:
claims_df = claims_df.withColumn(
    "Age_Group",
    F.when(F.col("Age") < 30, "Below 30")
     .when((F.col("Age") >= 30) & (F.col("Age") < 50), "30-49")
     .when((F.col("Age") >= 50) & (F.col("Age") < 70), "50-69")
     .otherwise("70+")
)

age_group_summary = claims_df.groupBy("Age_Group").agg(
    F.round(F.avg("Claim_Amount"), 2).alias("Avg_Claim_Amount"),
    F.count("*").alias("Total_Claims")
).orderBy("Age_Group")

display(age_group_summary)


Age_Group,Avg_Claim_Amount,Total_Claims
30-49,100438.92,1564
50-69,103057.93,1506
70+,105130.72,923
Below 30,104705.04,1007


Databricks visualization. Run in Databricks to view.

In [0]:
region_summary = claims_df.groupBy("Region").agg(
    F.count("*").alias("Claim_Count"),
    F.round(F.avg("Claim_Amount"), 2).alias("Avg_Claim_Amount")
).orderBy(F.desc("Claim_Count"))

display(region_summary)


Region,Claim_Count,Avg_Claim_Amount
South,1040,102318.67
East,1012,103583.51
North,1004,103464.77
West,986,102585.54
Central,958,102817.77


In [0]:
claims_df = claims_df.withColumn("Month", F.date_format(F.col("Claim_Date"), "yyyy-MM"))

monthly_trend = claims_df.groupBy("Month").agg(
    F.count("*").alias("Total_Claims"),
    F.round(F.avg("Claim_Amount"), 2).alias("Avg_Claim_Amount")
).orderBy("Month")

display(monthly_trend)


Month,Total_Claims,Avg_Claim_Amount
2023-01,246,102298.84
2023-02,257,107263.12
2023-03,234,111506.62
2023-04,269,102623.08
2023-05,246,105339.58
2023-06,240,102759.82
2023-07,246,103021.42
2023-08,259,101310.56
2023-09,266,102036.32
2023-10,259,106077.03


In [0]:
import pyspark.sql.functions as F

claims_df = spark.table("workspace.default.insurance_claim_data")

claims_scored = claims_df.withColumn(
    "Claim_Risk_Score",
    F.when(
        (F.col("Claim_Amount") > 150000) & (F.col("Claim_Status") == "Rejected"), 0.9
    ).when(
        (F.col("Claim_Amount") > 100000) & (F.col("Claim_Status") == "Pending"), 0.7
    ).when(
        F.col("Claim_Amount") > 50000, 0.5
    ).otherwise(0.2)
)

display(
    claims_scored.select(
        "Claim_ID",
        "Claim_Amount",
        "Claim_Status",
        "Claim_Risk_Score"
    ).limit(10)
)

Claim_ID,Claim_Amount,Claim_Status,Claim_Risk_Score
C00001,162423.72,Approved,0.5
C00002,174379.8,Pending,0.7
C00003,56381.97,Approved,0.5
C00004,143472.46,Rejected,0.5
C00005,117065.7,Rejected,0.5
C00006,94511.86,Pending,0.5
C00007,12430.03,Approved,0.2
C00008,40829.5,Approved,0.2
C00009,91400.42,Approved,0.5
C00010,49767.15,Approved,0.2


Databricks visualization. Run in Databricks to view.

In [0]:
summary = claims_df.groupBy("Claim_Status").agg(
    F.count("*").alias("Claim_Count"),
    F.round(F.avg("Claim_Amount"), 2).alias("Avg_Claim_Amount"),
    F.round(F.sum("Claim_Amount"), 2).alias("Total_Claim_Value")
)
display(summary)

# Save for reuse or reporting
summary.write.mode("overwrite").saveAsTable("workspace.default.insurance_claim_summary")


Claim_Status,Claim_Count,Avg_Claim_Amount,Total_Claim_Value
Approved,3512,102377.03,359548145.68
Pending,517,104791.3,54177102.55
Rejected,971,104057.76,101040084.27
