In [0]:
%sql
-- Select the catalog and schema that unqualified table names in this
-- session resolve against.
USE CATALOG maven_catalog;
USE SCHEMA bronze_schema;

In [0]:
# 1. Load Bronze Layer Data

# Make the catalog explicit for the Python session as well.
spark.sql("USE CATALOG maven_catalog")

# Read the landing table by its fully qualified name.
bronze_df = (
    spark.read
    .format("delta")
    .table("maven_catalog.maven_market_landing.customers")
)

display(bronze_df)

In [0]:

# 2. Data Cleaning (Silver Preparation)

# Spark column functions are not pre-imported in a notebook session; import
# them here so the cell runs on a fresh kernel. The namespaced alias keeps
# Python builtins (max, sum, ...) from being shadowed.
from pyspark.sql import functions as F

clean_df = (
    bronze_df
    # Remove null users: rows without a user_id cannot be attributed.
    .filter(F.col("user_id").isNotNull())
    # Remove duplicates: keep one row per (user, event time, event type).
    .dropDuplicates(["user_id", "event_time", "event_type"])
)

In [0]:

# 3. Create User-Level Feature Table

# Use the Spark SQL aggregates explicitly. Bare `max`/`sum` resolve to the
# Python builtins (which fail on a column-name string), and `count`,
# `countDistinct` and `avg` are otherwise undefined in this notebook.
from pyspark.sql import functions as F

# One output row per user_id with activity and spend aggregates.
feature_df = clean_df.groupBy("user_id").agg(
    F.count("*").alias("total_events"),
    F.countDistinct("event_type").alias("unique_event_types"),
    F.max("event_time").alias("last_activity_time"),
    F.avg("amount").alias("avg_amount"),
    F.sum("amount").alias("total_amount"),
)

display(feature_df)

In [0]:

# 4. Feature Quality Validation

# Imported here so the cell does not depend on names that were never
# defined earlier in the notebook (col / count / when).
from pyspark.sql import functions as F

# Check duplicates: count user_ids that appear on more than one feature row.
dup_check = feature_df.groupBy("user_id").count().filter(F.col("count") > 1)
print("Duplicate Users:", dup_check.count())

# Check nulls: for every feature column, count the rows where it is NULL.
# `when` without `otherwise` yields NULL for non-matching rows, and `count`
# skips NULLs, so only the null rows are counted.
null_counts = feature_df.select([
    F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in feature_df.columns
])
display(null_counts)

In [0]:

# 5. Save as Silver Layer (Delta Table)

# Persist the user-level features as a Delta table, replacing any previous
# contents ("overwrite" mode).
# NOTE(review): "silver.user_features" is a two-part name, so it presumably
# resolves against the session's current catalog. Confirm the target.
(
    feature_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("silver.user_features")
)