#### Set up runtime variables

In [0]:
# Declare the notebook parameters: target environment (defaults to "dev")
# and the dataset to ingest (no default, must be supplied by the caller).
dbutils.widgets.text("env", "dev")
dbutils.widgets.text("dataset", "")

# Strip surrounding whitespace so a padded or whitespace-only value does not
# slip past the emptiness check below or fail later lookups keyed by name.
env = dbutils.widgets.get("env").strip()
dataset = dbutils.widgets.get("dataset").strip()

# Fail fast when no dataset was provided.
if not dataset:
    raise ValueError("Dataset name must be provided")

#### Load Config

In [0]:
%run "/Workspace/Users/azuredataengineer44@gmail.com/databricks-traffic/Databricks Retail Notebooks/common/config_loader"

In [0]:
# Load the configuration for the selected environment. `load_config` is not
# defined in this notebook; presumably it is provided by the
# common/config_loader notebook pulled in via %run — TODO confirm.
config = load_config(env)

In [0]:
%skip
# Debug helper: pretty-print the loaded configuration for inspection.
# This cell is marked with %skip.
from pprint import pprint

pprint(config)


In [0]:
# Dataset names that have an entry under config["paths"]. Materialised as a
# list so the error message below shows a plain list of names instead of a
# "dict_keys([...])" wrapper.
valid_datasets = list(config["paths"])

# Reject dataset names that have no configured path before building any
# locations from them.
if dataset not in valid_datasets:
    raise ValueError(f"Invalid dataset: {dataset}. Valid values: {valid_datasets}")


In [0]:
%skip
# Debug helper: show the keys defined under config["paths"].
print (config["paths"].keys())

In [0]:
# Storage settings (container and account) from the loaded config.
storage = config["storage"]

# Source location for the selected dataset, in the form
# abfss://<container>@<account>.dfs.core.windows.net/<configured dataset path>
source_path = "abfss://{container}@{account}.dfs.core.windows.net/{path}".format(
    container=storage["container"],
    account=storage["account"],
    path=config["paths"][dataset],
)

In [0]:
%skip
# Debug helper: show the resolved source path. Uncomment the next line to
# also dump the storage section of the config.
# print(config["storage"])
print (source_path)

In [0]:
# Auto Loader settings from the loaded config.
autoloader = config["autoloader"]

# Per-dataset locations: the checkpoint path is used by the stream writer,
# the schema path is handed to Auto Loader as its schema location.
checkpoint_path = "{}/{}_raw".format(autoloader["checkpoint_base"], dataset)
schema_path = "{}/{}_raw".format(autoloader["schema_base"], dataset)

print(f"Streaming Started For Data: {dataset}_raw")

# Define the streaming read over the source path using the cloudFiles
# (Auto Loader) source, with the file format taken from config and column
# type inference switched on.
df = spark.readStream.format("cloudFiles").options(
    **{
        "cloudFiles.format": autoloader["format"],
        "cloudFiles.schemaLocation": schema_path,
        "cloudFiles.inferColumnTypes": "true",
    }
).load(source_path)

print(f"Successfully Read Streaming Data: {dataset}_raw")
print("********************************************")

In [0]:
from pyspark.sql.functions import current_timestamp,col

# Add two audit columns to the stream: the ingestion timestamp and the
# file path taken from the _metadata column.
df_bronze = df.withColumn("ingestion_ts", current_timestamp()).withColumn(
    "source_file_path", col("_metadata.file_path")
)

In [0]:
# Unity Catalog settings from the loaded config.
uc = config["unity_catalog"]
# Three-part target name: <catalog>.<schema>.<dataset>_raw
table_name = "{}.{}.{}_raw".format(uc["catalog"], uc["schema"], dataset)

In [0]:
# Write the stream to the target Delta table, tracking progress in the
# per-dataset checkpoint location. The availableNow trigger runs the query
# as a bounded load; awaitTermination() blocks until that query stops, so
# later cells only run after the load has completed and any streaming
# failure is raised here instead of being lost in the background.
(
    df_bronze.writeStream
    .format("delta")
    .option("checkpointLocation", checkpoint_path)
    .trigger(availableNow=True)
    .toTable(table_name)
    .awaitTermination()
)


In [0]:
# print(df_bronze._metadata.file_path)

In [0]:
# Preview the table this run wrote to. Uses table_name (built from the
# config's catalog/schema and the selected dataset) rather than a fixed
# table, so the preview matches the env and dataset widget values.
spark.table(table_name).display()