In [0]:

from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import col, trim, initcap, when, lit, year, month, dayofmonth, concat, concat_ws
import time

In [0]:
# Catalog and schema names used to build fully qualified table names
# of the form "<catalog>.<schema>.<table>" in the cells below.
catalog_name = "retail_dev"

# One schema per layer, plus the audit schema.
schema_bronze = "bronze"
schema_silver = "silver"
schema_gold = "gold"
schema_auditoria = "auditoria"

In [0]:
# Build the time dimension from the distinct, non-null sale dates in the
# silver `ventas` table.
#
# Resulting columns:
#   fecha     - the sale date (source column `fecha_venta`)
#   anio      - year of `fecha`
#   mes       - month of `fecha`
#   dia       - day of month of `fecha`
#   semestre  - "<anio>-01" for January-June, "<anio>-02" for July-December
#   id_tiempo - integer key in yyyymmdd form
#
# NOTE(review): if `fecha_venta` is a timestamp rather than a date, distinct()
# keeps one row per distinct timestamp and `id_tiempo` would repeat within a
# day - confirm the column type in silver.ventas.
dim_tiempo = (
    spark.table(f"{catalog_name}.{schema_silver}.ventas")
    .select(col("fecha_venta").alias("fecha"))
    .dropna()
    .distinct()
    .withColumn("anio", year(col("fecha")))
    .withColumn("mes", month(col("fecha")))
    .withColumn("dia", dayofmonth(col("fecha")))
    # The first semester is months 1-6; the previous `<= 7` cut-off wrongly
    # placed July in the first semester. `anio` is cast explicitly so the
    # string concatenation does not rely on implicit int-to-string coercion.
    .withColumn(
        "semestre",
        concat_ws(
            "-",
            col("anio").cast("string"),
            when(col("mes") <= 6, lit("01")).otherwise(lit("02")),
        ),
    )
    # yyyymmdd packed into an integer, e.g. 2024-03-15 -> 20240315.
    .withColumn(
        "id_tiempo",
        (col("anio") * 10000 + col("mes") * 100 + col("dia")).cast("int"),
    )
)


In [0]:
# Preview the freshly built time dimension before persisting it.
display(dim_tiempo)

In [0]:
# Persist the dimension as a Delta table in the gold schema, replacing any
# existing contents of the table.
(
    dim_tiempo.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable(f"{catalog_name}.{schema_gold}.dim_tiempo")
)

In [0]:
# Read the table back from the gold schema and print its first 10 rows
# as a quick check that the write succeeded.
(
    spark.table(f"{catalog_name}.{schema_gold}.dim_tiempo")
    .show(10)
)

In [0]:

# OPTIMIZE statement for the gold time dimension, Z-ordered by its key column.
# The text (including the leading newline and two-space indentation) is kept
# exactly as before; it is only assembled from adjacent literals here.
sql_command = (
    "\n"
    f"  OPTIMIZE {catalog_name}.{schema_gold}.dim_tiempo\n"
    "  ZORDER BY (id_tiempo)\n"
)

In [0]:
# Execute the OPTIMIZE / ZORDER statement assembled in `sql_command`.
spark.sql(sql_command)

In [0]:
# Run VACUUM on the gold time dimension. No RETAIN clause is given, so the
# retention period is whatever the table/cluster configuration provides.
spark.sql(f"VACUUM {catalog_name}.{schema_gold}.dim_tiempo")