## Purpose:
This notebook cleans the data, flattens the nested structures, and performs geospatial enrichment.

## Add constraints to the silver transactions table before streaming

In [0]:
%sql
-- Ensure transaction amounts are never zero or negative
--ALTER TABLE fraud_sentinel_catalog.detection_service.silver_transactions 
--ADD CONSTRAINT check_positive_amount CHECK (amount > 0);

-- Ensure we have valid GPS coordinates
--ALTER TABLE fraud_sentinel_catalog.detection_service.silver_transactions 
--ADD CONSTRAINT check_valid_lat CHECK (txn_lat BETWEEN -90 AND 90);

### Environment & Function Setup


In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import math

# Haversine formula: great-circle distance between two points, in kilometres
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two GPS points.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        The distance in kilometres as a float, or None if any coordinate
        is None.
    """
    # Spark hands SQL NULLs to a Python UDF as None. math.radians(None) would
    # raise TypeError and fail the whole batch, so propagate the null instead.
    if lat1 is None or lon1 is None or lat2 is None or lon2 is None:
        return None

    # Mean Earth radius in km
    R = 6371.0

    # GPS coordinates are in degrees; the trigonometric functions need radians.
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlambda = math.radians(lon2 - lon1)

    a = math.sin(dphi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2)**2
    # Rounding error can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError. Clamp it.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

# Wrap the pure-Python distance function as a Spark UDF returning a double,
# so it can be applied column-wise to the streaming DataFrame.
haversine_udf = udf(f=haversine_distance, returnType=DoubleType())


## Merging Transactions with Customer Profiles
To calculate distance, we need the `Home Location` of our 1,000 customers. We then join the **Bronze Stream** with the **Customer Profiles**.

In [0]:
# Static dimension: the customer registry, read as a regular (batch) table.
customer_ref = spark.table("fraud_sentinel_catalog.detection_service.customer_registry")

# Streaming fact: raw transactions read incrementally from the bronze table.
bronze_stream = spark.readStream.table("fraud_sentinel_catalog.detection_service.bronze_transactions")

# Transform: keep the needed fields, flatten the nested `location` struct
# into txn_lat / txn_lon, and parse the raw timestamp into `event_time`.
flattened_txns = bronze_stream.select(
    "transaction_id",
    "customer_id",
    "amount",
    "device_id",
    "recipient_status",
    col("location.lat").alias("txn_lat"),
    col("location.lon").alias("txn_lon"),
    to_timestamp("timestamp").alias("event_time"),
    "_ingested_at",
)

# Enrich: inner-join the registry on customer_id to bring in the home
# coordinates; transactions with no matching customer are dropped.
txns_with_home = flattened_txns.join(customer_ref, on="customer_id", how="inner")

# Distance between the transaction location and the customer's home, via the UDF.
silver_stream = txns_with_home.withColumn(
    "dist_from_home_km",
    haversine_udf(col("txn_lat"), col("txn_lon"), col("home_lat"), col("home_lon")),
)
          

## Write to Silver Table


In [0]:
# --- STEP 3: PERSIST ENRICHED DATA (Free Tier Compatible) ---

# Checkpoint directory for this streaming query's progress.
SILVER_CHECKPOINT = "abfss://fraud-sentinel@giftmapote2ete.dfs.core.windows.net/_checkpoints/silver"

# Use availableNow=True instead of a processingTime trigger for the Free Tier:
# the query processes everything currently waiting in the source and then
# stops, rather than running continuously on a fixed interval.
(silver_stream.writeStream
    .format("delta")
    .option("checkpointLocation", SILVER_CHECKPOINT)
    .trigger(availableNow=True)  # drain the current backlog, then terminate
    .toTable("fraud_sentinel_catalog.detection_service.silver_transactions"))

In [0]:
%sql
-- SELECT 
--     customer_id, 
--     dist_from_home_km, 
--     amount, 
--     home_province 
-- FROM fraud_sentinel_catalog.detection_service.silver_transactions 
-- WHERE dist_from_home_km > 500
-- ORDER BY dist_from_home_km DESC