# Imports

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Widgets

In [0]:
# Remove every widget currently defined in this notebook; the text widgets
# are re-created with their defaults in the next cell.
dbutils.widgets.removeAll()

In [0]:
# Text widgets of this notebook: (widget name, default value, label printed
# in front of the current value).
widget_specs = (
    ("catalog",       "catalog_supermarket", "catalog      :"),
    ("schema_bronze", "bronze",              "schema_bronze:"),
    ("schema_raw",    "raw",                 "schema_raw   :"),
)

# First create all widgets ...
for widget_name, widget_default, widget_label in widget_specs:
    dbutils.widgets.text(widget_name, widget_default)

# ... then echo the value each widget currently holds.
for widget_name, widget_default, widget_label in widget_specs:
    print(widget_label, dbutils.widgets.get(widget_name))

# Constants

In [0]:
# Bind the current widget values to the names used by the cells below
# (path construction and the target table name).
catalog, schema_bronze, schema_raw = (
    dbutils.widgets.get(widget_key)
    for widget_key in ("catalog", "schema_bronze", "schema_raw")
)

# Path

In [0]:
# NOTE: only the volume is used here.
# Folder inside the volume selected by the `catalog` and `schema_raw` values.
path_base   = f"/Volumes/{catalog}/{schema_raw}/raw_files/"
# Full path of the orders CSV file inside that folder.
path_orders = path_base + "orders.csv"

# Show the resolved file path.
display(path_orders)

# Exploration

In [0]:
# Exploratory read of the raw file: only the header option is set and no
# schema is supplied.
df_preview = spark.read.csv(path_orders, header=True)

# Print the first five rows, then the column names and types as read.
df_preview.show(5)
df_preview.printSchema()


# Structures

In [0]:
# Explicit schema applied when reading the orders CSV. Each `add` call takes
# (column name, data type, nullable flag); only order_id and user_id are
# declared non-nullable.
# NOTE(review): Spark file sources reportedly treat user-supplied schemas as
# nullable, so the `False` flags may not be enforced on read — confirm if
# downstream code relies on them.
orders_schema = (
    StructType()
    .add("order_id",               IntegerType(), False)
    .add("user_id",                IntegerType(), False)
    .add("eval_set",               StringType(),  True)
    .add("order_number",           IntegerType(), True)
    .add("order_dow",              IntegerType(), True)
    .add("order_hour_of_day",      IntegerType(), True)
    .add("days_since_prior_order", IntegerType(), True)
)

# Read Resource

In [0]:
# Typed read of the orders CSV: the first line is treated as a header and the
# columns are parsed according to `orders_schema`.
# NOTE(review): no `mode` option is set, so the reader's default handling of
# values that do not match the schema applies (presumably permissive, i.e.
# read as null) — confirm the file holds no non-integer values such as "15.0"
# in the integer columns.
df_orders = spark.read.csv(path_orders, schema=orders_schema, header=True)

In [0]:
# Render a ten-row sample of the typed DataFrame, then print its schema.
orders_sample = df_orders.limit(10)
display(orders_sample)
df_orders.printSchema()

# Save

In [0]:
# Persist the orders DataFrame as the table <catalog>.<schema_bronze>.orders,
# using the "overwrite" save mode.
orders_writer = df_orders.write.mode("overwrite")
orders_writer.saveAsTable(f"{catalog}.{schema_bronze}.orders")

# Report which table was written.
print(f"Tabla creada/actualizada: {catalog}.{schema_bronze}.orders")

In [0]:

# Read the saved table back and render its first ten rows.
orders_saved = spark.table(f"{catalog}.{schema_bronze}.orders")
display(orders_saved.limit(10))