In [0]:
from pyspark.sql.functions import explode, col, current_timestamp, coalesce, lower, regexp_extract


In [0]:
# List the entries at the root of the "landing" container on the ADLS account.
dbutils.fs.ls("abfss://landing@acmemcd.dfs.core.windows.net")

In [0]:
# Load the JSON files under the landing container's /crm folder into a DataFrame.
crm_df = spark.read.json(
    "abfss://landing@acmemcd.dfs.core.windows.net/crm"
)

In [0]:
# Print the schema of the raw CRM frame, then render its rows for inspection.
crm_df.printSchema()
display(crm_df)

In [0]:
# Phone number: take contact.phone_primary when it is non-null, otherwise
# fall back to contact.primary_phone.
phone_expr = coalesce(
    col("contact.phone_primary"),
    col("contact.primary_phone"),
)

# Output columns, in order. Nested contact.* fields are pulled up to the top
# level, every column is cast to an explicit type, and address / city / email
# are lower-cased.
projected_columns = [
    col("account_id").cast("int").alias("account_id"),
    lower(col("addr_line1").cast("string")).alias("address"),
    lower(col("city").cast("string")).alias("city"),
    col("contact.contact_id").cast("string").alias("contact_id"),
    lower(col("contact.email").cast("string")).alias("email"),
    col("contact.first_name").cast("string").alias("first_name"),
    col("contact.last_name").cast("string").alias("last_name"),
    phone_expr.cast("string").alias("phone"),
    col("country_code").cast("string").alias("country_code"),
    col("postal").cast("string").alias("postal_code"),
    col("updated_at").cast("timestamp"),
    col("dt").cast("date"),
]

# Project the columns, then append the ingestion timestamp as the last column.
flat_df = crm_df.select(*projected_columns).withColumn(
    "ingestion_date", current_timestamp()
)
display(flat_df)


In [0]:
# Data-quality rule: a record is invalid (quarantined) when ANY check below is true.
#
# Every term must evaluate to TRUE or FALSE, never NULL. filter() drops rows whose
# predicate is NULL, and ~NULL is still NULL, so a NULL result here would remove
# the row from both the "valid" and the "quarantine" split.
invalid_condition = (
    col("account_id").isNull() |
    col("email").isNull() |
    # isin() yields NULL for a NULL country_code, so test for NULL explicitly.
    # A missing country code is quarantined, like the other missing fields.
    col("country_code").isNull() |
    col("country_code").isin("XX") |
    col("phone").isNull() |
    # Any ASCII letter in the phone value marks the record as invalid.
    (regexp_extract(col("phone"), "[A-Za-z]", 0) != "")
)

In [0]:
# Keep the rows for which the negated invalid_condition holds.
valid_df = flat_df.where(~invalid_condition)
display(valid_df)

In [0]:
# Keep the rows for which invalid_condition holds, for later review.
quarantine_df = flat_df.where(invalid_condition)
display(quarantine_df)