In [0]:
# Make sure the silver-layer customers table exists. The DDL uses
# `if not exists`, so an already-present table is left untouched.
silver_customers_ddl = """
create table if not exists silver_customers (
    customer_id string,
    name string,
    email string,
    country string,
    customer_type string,
    registration_date date,
    age int,
    gender string,
    total_purchases int,
    customer_segment string,
    days_since_registration int,
    last_updated timestamp
)
"""

# Switch to the silver schema first so the unqualified table name resolves there.
spark.sql("use globalretail_silver")
spark.sql(silver_customers_ddl)

In [0]:
# Look up the watermark for the incremental load: the most recent
# `last_updated` value currently stored in silver_customers.
spark.sql("use globalretail_silver")
last_processed_df = spark.sql(
    "select max(last_updated) as last_processed from silver_customers"
)

# The aggregate query returns a single row; its value is None when the
# silver table holds no rows yet.
watermark_row = last_processed_df.collect()[0]

# With no previous load, fall back to a date far in the past so the
# downstream `>` comparison picks up every bronze record.
last_processed_timestamp = (
    '1900-01-01 00:00:00'
    if watermark_row['last_processed'] is None
    else watermark_row['last_processed']
)


In [0]:
# Register a temporary view limited to bronze customer rows whose
# ingestion_timestamp is strictly later than the watermark computed above.
# The watermark is interpolated as a quoted literal into the SQL text.
incremental_view_sql = f"""
create or replace temporary view bronze_incremental as
select * from globalretail_bronze.bronze_customer c
where c.ingestion_timestamp > '{last_processed_timestamp}'
"""
spark.sql(incremental_view_sql)

In [0]:
# Read the incremental view into a DataFrame; every later transformation
# in this notebook starts from `df`.
bronze_incremental_query = "select * from bronze_incremental"
df = spark.sql(bronze_incremental_query)
display(df)

In [0]:
from pyspark.sql.functions import col, when

# Add an `email_valid` flag column: the string "NULL" when the email is
# missing, "NOT NULL" otherwise. No rows are removed at this step.
email_is_missing = col("email").isNull()
email_flag = when(email_is_missing, "NULL").otherwise("NOT NULL")

df_email_validation = df.withColumn("email_valid", email_flag)
display(df_email_validation)

In [0]:
# Add an `age_valid` flag column: "VALID" for ages from 18 to 100
# (both bounds inclusive), "INVALID" for everything else. A NULL age
# does not satisfy the range check and therefore lands in "INVALID".
age_in_range = col("age").between(18, 100)

df_age_validation = df_email_validation.withColumn(
    "age_valid", when(age_in_range, "VALID").otherwise("INVALID")
)
display(df_age_validation)

In [0]:
# Derive `customer_segment` from total_purchases. The branches are
# evaluated in order, so the effective bands are:
#   > 10000            -> "High Value"
#   > 5000 (<= 10000)  -> "Medium Value"
#   anything else      -> "Low Value"
total_purchases_col = col("total_purchases")
segment_label = (
    when(total_purchases_col > 10000, "High Value")
    .when(total_purchases_col > 5000, "Medium Value")
    .otherwise("Low Value")
)

df_segmented = df_age_validation.withColumn("customer_segment", segment_label)
display(df_segmented)

In [0]:
from pyspark.sql.functions import datediff, current_date

# Add `days_since_registration`: the number of days between today's date
# (at execution time) and each customer's registration_date.
days_registered = datediff(current_date(), col("registration_date"))

df_with_days_since_registration = df_segmented.withColumn(
    "days_since_registration", days_registered
)
display(df_with_days_since_registration)

In [0]:
# Keep only rows whose total_purchases is zero or positive. Rows where
# the comparison evaluates to NULL are dropped as well, since only rows
# for which the predicate is true are retained.
# NOTE(review): the email_valid / age_valid flags computed in earlier
# cells are not applied here - confirm whether a later step filters on them.
non_negative_purchases = col("total_purchases") >= 0

df_cleaned = df_with_days_since_registration.where(non_negative_purchases)
display(df_cleaned)