In [0]:
from datetime import datetime

# Notebook parameters: each variable is read from the widget of the same name.
(
    container_name,
    storage_account_name,
    base_folder_name,
    main_folder_name,
    catalog,
    schema,
) = (
    dbutils.widgets.get(widget_name)
    for widget_name in (
        "container_name",
        "storage_account_name",
        "base_folder_name",
        "main_folder_name",
        "catalog",
        "schema",
    )
)

# The storage account key is fetched from the "zillowsecrets" secret scope, not from a widget.
storage_account_key = dbutils.secrets.get(scope="zillowsecrets", key="storage_account_key")

# Register the account key in the Spark session configuration for this storage account.
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net",
    storage_account_key,
)

# Input location: <container>@<account> / <base folder> / <main folder>.
folder_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/{base_folder_name}/{main_folder_name}"

In [0]:
# List the input folder and publish a "run_now" task value saying whether it has entries.
items = dbutils.fs.ls(folder_path)

if items:
    # At least one entry was listed: report it and let the notebook continue.
    print("Files are available")
    dbutils.jobs.taskValues.set(key="run_now", value="true")
else:
    # Nothing was listed: publish the skip flag and end the notebook with a message.
    dbutils.jobs.taskValues.set(key="run_now", value="false")
    dbutils.notebook.exit("No new files to process")

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Storage access for this account is configured once in the parameter cell at the top
# of the notebook (spark.conf.set on the fs.azure.account.key.* setting); repeating the
# identical call here had no effect, so it is not duplicated.

# Columns expected in the property JSON documents. Every column is read as a
# nullable string; no casting to numeric/date types happens in this cell.
_property_columns = [
    "zpid",
    "address",
    "city",
    "state",
    "zipcode",
    "latitude",
    "longitude",
    "price",
    "zestimate",
    "bedrooms",
    "bathrooms",
    "livingAreaValue",
    "yearBuilt",
    "homeType",
    "description",
    "imgSrc",
    "agent_name",
    "agent_phone",
    "datePosted",
    "DateSold",
]

# Define the schema
StructTypeSchema = StructType(
    [StructField(column_name, StringType(), True) for column_name in _property_columns]
)

# Read the JSON files under folder_path into a DataFrame using the explicit schema.
# "multiline" lets a single JSON document span several lines of a file.
# The previous "header" option was dropped: it is a CSV reader option and is not
# used by the JSON data source.
df = (
    spark.read
    .schema(StructTypeSchema)
    .option("multiline", "true")
    .json(folder_path)
)

# Display the DataFrame
display(df)



In [0]:
import urllib.parse
def _percent_decode(encoded):
    """Return `encoded` with percent-escapes decoded via urllib.parse.unquote."""
    return urllib.parse.unquote(encoded)


# Spark UDF wrapper around the decoder, returning a string column.
decode = udf(_percent_decode, StringType())

# Take the last path segment (everything after the final "/") of each row's input file
# path, decode it, and store it in a new "file_name" column.
# NOTE(review): input_file_name() is reportedly deprecated on newer Databricks runtimes
# in favour of the _metadata column — confirm before upgrading the runtime.
_source_file_segment = regexp_extract(input_file_name(), r"([^/]+$)", 1)
df = df.withColumn("file_name", decode(_source_file_segment))
display(df)

In [0]:
# Append the DataFrame to the <catalog>.<schema>.property_bronze Delta table,
# with the writer's mergeSchema option enabled.
(
    df.write
    .format('delta')
    .option("mergeSchema", "true")
    .mode("append")
    .saveAsTable(f'{catalog}.{schema}.property_bronze')
)

In [0]:
%sql
-- Show all rows of the property_bronze table in the catalog and schema
-- substituted from the ${catalog} and ${schema} parameters.
SELECT *
FROM ${catalog}.${schema}.property_bronze;

In [0]:
# Publish the pipeline's error text as the "error" task value for downstream job tasks.
# No exception named `e` is bound anywhere in the visible notebook cells (and Python
# unbinds `except ... as e` names when the handler ends), so an unguarded str(e) fails
# with NameError on a clean top-to-bottom run. Fall back to an empty message in that
# case; if `e` does exist in the notebook namespace, its text is published as before.
try:
    error_message = str(e)  # 'e' is the exception, when one was captured earlier
except NameError:
    error_message = ""
dbutils.jobs.taskValues.set(key="error", value=error_message)