## 1. Data Ingestion

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.functions import col

In [0]:
# Read the customer and claim CSV extracts from the bronze volume.
# Both files carry a header row; column types are inferred from the data.
customers_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/workspace/default/bronze/customers.csv")
)
claims_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/workspace/default/bronze/claims.csv")
)

# Print a labelled preview of each dataset.
print("Customers Datasets:")
customers_df.show()

print("Claims Datasets:")
claims_df.show()

## 2. Data Quality & Integrity

### 1. Handle null values appropriately.

In [0]:
# Check whether null values are present in the key columns.
# These checks are currently disabled; uncomment a line to list the offending rows.

# Customers: rows with a missing name
# customers_df.filter(customers_df.name.isNull()).show()

# Claims: rows with a missing claim amount
# claims_df.filter(claims_df.claim_amount.isNull()).show()

### Handling nulls

In [0]:
# Option A: discard every row that has a null in at least one column.
# how="any" is the default behaviour, written out here to make it explicit.
customers_drop = customers_df.dropna(how="any")
claims_drop = claims_df.dropna(how="any")

# Uncomment to preview the frames after the nulls are dropped:
# customers_drop.show()
# claims_drop.show()

In [0]:
# Replace nulls with a per-column placeholder instead of dropping the rows.
# Each dataset gets its own result name: previously both results were assigned
# to df_filled, so the customers fill was overwritten and lost.
# NOTE(review): "city" is assumed to be a text column, so it now receives a
# string placeholder rather than the numeric 0 used before — confirm against
# the inferred schema.
customers_filled = customers_df.fillna({"city": "Unknown", "name": "Unknown"})
claims_filled = claims_df.fillna({"claim_amount": 0, "hospital_name": "Unknown"})

# Kept for backward compatibility: df_filled used to end up holding the claims result.
df_filled = claims_filled

### 2. Deduplicate duplicate claims

In [0]:
# Surface duplicated records: group on the identifying columns and keep only
# the groups that occur more than once.
# .show() prints the rows and returns None, so each result is bound to a name
# first and displayed in a separate statement; chaining .show() onto the
# assignment stored None instead of the DataFrame.
customers_dup = (
    customers_df
    .groupBy("customer_id", "name", "city", "state")
    .count()
    .filter(col("count") > 1)
)
customers_dup.show()

# The claims result gets its own name instead of overwriting customers_dup.
claims_dup = (
    claims_df
    .groupBy("claim_amount", "insured_amount", "hospital_name", "state")
    .count()
    .filter(col("count") > 1)
)
claims_dup.show()

In [0]:
# Deduplicate claims: identify duplicates according to the business definition
# (all columns or a set of key columns), then keep a single record per group
# with .dropDuplicates() or a Window function.
# The step is currently disabled, so claims_df is not deduplicated in this cell.
# NOTE(review): when enabling it, keep .show() on its own line — it returns
# None, so chaining it onto the assignment would leave claims_deduped as None.
# Also confirm that the key columns listed below exist in the claims schema.
# claims_deduped = claims_df.dropDuplicates(["customer_id", "policy_id", "claim_date", "hospital_name"])
# claims_deduped.show()

### 3. Validate column data types (e.g., numeric fields must be numeric).

In [0]:
# Type validation is currently disabled. The draft below casts the columns that
# are expected to be numeric and previews the result.
# NOTE(review): keep .show() as a separate statement and use new names — .show()
# returns None, so assigning its result back to customers_df / claims_df would
# rebind them to None and break the joins in the referential-integrity cell.
# NOTE(review): casting an identifier such as customer_id to double looks
# unusual — confirm the intended target type.
# customers_typed = customers_df.withColumn("customer_id", col("customer_id").cast("double"))
# customers_typed.show()
# claims_typed = claims_df.withColumn("claim_amount", col("claim_amount").cast("double"))
# claims_typed.show()

### 4. Ensure referential integrity: each claim must map to a valid customer.

In [0]:
# Referential-integrity check keyed on customer_id.

# Claims whose customer_id is present in the customers dataset. This is an
# inner join, so the matching customer columns are attached to each claim.
valid_claims = claims_df.join(customers_df, on="customer_id", how="inner")
valid_claims.show()

# Claims whose customer_id has no counterpart in the customers dataset.
invalid_claims = claims_df.join(customers_df, on="customer_id", how="left_anti")
invalid_claims.show()