In [0]:
from pyspark.sql.functions import lower, col, trim, when, coalesce, lit, round, to_date, year, month, sum, countDistinct, avg,upper,first
from pyspark.sql.types import DoubleType

In [0]:
# Catalog and schema that hold the tables written by this notebook.
catalog = "dmc_01"
schema_name = "gold_sales"

# Make sure the namespace exists; both statements are no-ops when it already does.
spark.sql("CREATE CATALOG IF NOT EXISTS {}".format(catalog))
spark.sql("CREATE SCHEMA IF NOT EXISTS {}.{}".format(catalog, schema_name))

In [0]:
# Build the input locations from the configured catalog/schema instead of
# repeating them as literals, so the files are read from the same namespace
# the output tables are written to.
landing_dir = f"/Volumes/{catalog}/{schema_name}/volumes/taller_02"
path_customers = f"{landing_dir}/customers.csv"
path_products = f"{landing_dir}/products.csv"
path_sales = f"{landing_dir}/sales.csv"


def read_csv_with_header(path):
    """Read the CSV at ``path`` using its first row as header and inferring column types."""
    return spark.read.option("header", True).option("inferSchema", True).csv(path)


df_customers = read_csv_with_header(path_customers)
df_products = read_csv_with_header(path_products)
df_sales = read_csv_with_header(path_sales)


In [0]:
# Customers: discard rows containing any null, then normalise the name
# columns (trimmed; first name lower-cased, last name upper-cased) and cast
# the customer key to integer.
df_cust_clean = (
    df_customers.na.drop()
    .withColumn("first_name", trim(lower(col("First_Name"))))
    .withColumn("last_name", trim(upper(col("Last_Name"))))
    .withColumn("Customer_ID", col("Customer_Id").cast("integer"))
)

In [0]:
# Render the cleaned customers DataFrame in the notebook output.
display(df_cust_clean)

In [0]:
# Products: discard rows containing any null, strip whitespace around the
# product name and cast the price to double.
df_prod_clean = (
    df_products.na.drop()
    .withColumn("Product_Name", trim(col("Product_Name")))
    .withColumn("Price", col("Price").cast("double"))
)

In [0]:
# Render the cleaned products DataFrame in the notebook output.
display(df_prod_clean)

In [0]:
# Sales: cast keys and measures, derive year/month of sale, and fill a missing
# Total from Quantity * Price_per_Unit.
#
# The null filter deliberately skips the Total column: a blanket dropna()
# would discard every row with a null Total before the imputation below could
# ever apply to it. All other columns must still be non-null.
df_sales_clean = (
    df_sales.dropna(subset=[c for c in df_sales.columns if c.lower() != "total"])
    .withColumn("Customer_ID", col("Customer_ID").cast("integer"))
    .withColumn("Quantity", col("Quantity").cast("integer"))
    .withColumn("Date", to_date(col("Date")))
    .withColumn("anio_venta", year(col("Date")))
    .withColumn("mes_venta", month(col("Date")))
    .withColumn("Price_per_Unit", col("Price_per_Unit").cast("double"))
    # Keep the reported Total when present, otherwise recompute it.
    .withColumn("Total", coalesce(col("Total"), col("Quantity") * col("Price_per_Unit")))
)

In [0]:
# Render the cleaned sales DataFrame in the notebook output.
display(df_sales_clean)

In [0]:
# Detail table: every cleaned sale enriched with customer and product attributes.
#
# Both joins are LEFT joins from sales, so a sale without a matching customer
# or product is kept with null attributes. The key columns are therefore taken
# from the sales side: the right-hand Customer_ID / Product_ID are NULL for
# unmatched rows, which would lose the key and lump those sales together under
# NULL in the per-customer aggregate. Output column names and order are unchanged.
df_joined = (
    df_sales_clean
    .join(df_cust_clean, on="Customer_ID", how="left")
    .join(
        df_prod_clean,
        df_sales_clean["Product_ID"] == df_prod_clean["Product_ID"],
        how="left",
    )
    .select(
        # Customer
        df_sales_clean["Customer_ID"],
        df_cust_clean["first_name"],
        df_cust_clean["last_name"],
        df_cust_clean["Country"],
        # Product
        df_sales_clean["Product_ID"],
        df_prod_clean["Product_Name"],
        df_prod_clean["Category"],
        df_prod_clean["Price"],
        # Sale
        df_sales_clean["Transaction_ID"],
        df_sales_clean["Date"],
        df_sales_clean["anio_venta"],
        df_sales_clean["mes_venta"],
        df_sales_clean["Quantity"],
        df_sales_clean["Price_per_Unit"],
        df_sales_clean["Total"],
    )
)

In [0]:
# Render the joined sales detail DataFrame in the notebook output.
display(df_joined)

In [0]:
# One row per customer: name, total amount spent (rounded to 4 decimals) and
# number of distinct transactions. The names are taken from the first row of
# each group. `round` and `sum` are the pyspark.sql.functions versions imported
# at the top of the notebook, not the Python builtins.
df_Transactions_customer = df_joined.groupBy("Customer_ID").agg(
    first(col("first_name")).alias("First_Name"),
    first(col("last_name")).alias("Last_Name"),
    round(sum(col("Total")), 4).alias("Total_Spent"),
    countDistinct(col("Transaction_ID")).alias("Num_Transactions"),
)

In [0]:
# Render the per-customer aggregate in the notebook output.
display(df_Transactions_customer)

In [0]:
# One row per product category: total amount (rounded to 4 decimals) and
# number of distinct transactions. `round` and `sum` are the
# pyspark.sql.functions versions imported at the top of the notebook.
df_Transactions_Category = df_joined.groupBy("Category").agg(
    round(sum(col("Total")), 4).alias("Total_Spent"),
    countDistinct(col("Transaction_ID")).alias("Num_Transactions"),
)

In [0]:
# Render the per-category aggregate in the notebook output.
display(df_Transactions_Category)

In [0]:
# One row per customer country: total amount (rounded to 4 decimals) and
# number of distinct transactions. `round` and `sum` are the
# pyspark.sql.functions versions imported at the top of the notebook.
df_Transactions_Country = df_joined.groupBy("Country").agg(
    round(sum(col("Total")), 4).alias("Total_Spent"),
    countDistinct(col("Transaction_ID")).alias("Num_Transactions"),
)

In [0]:
# Render the per-country aggregate in the notebook output.
display(df_Transactions_Country)

In [0]:
# Fully qualified (catalog.schema.table) names of the output tables.
tbl_detalle = ".".join((catalog, schema_name, "ventas_detalle"))
tbl_customer = ".".join((catalog, schema_name, "transacciones_customer"))
tbl_category = ".".join((catalog, schema_name, "transacciones_category"))
tbl_country = ".".join((catalog, schema_name, "transacciones_country"))


In [0]:
# Persist the sales detail as a Delta table, replacing any previous contents.
df_joined.write.saveAsTable(tbl_detalle, format="delta", mode="overwrite")

In [0]:
# Also write the sales detail as Delta files, partitioned by year and month of
# sale, replacing any previous contents. The location is derived from the
# configured catalog/schema rather than hard-coding them a second time.
(
    df_joined.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("anio_venta", "mes_venta")
    .save(f"/Volumes/{catalog}/{schema_name}/volumes/taller_02/ventas_detalle/")
)

In [0]:
# Persist the per-country aggregate as a Delta table, replacing any previous contents.
df_Transactions_Country.write.saveAsTable(tbl_country, format="delta", mode="overwrite")

In [0]:
# Persist the per-category aggregate as a Delta table, replacing any previous contents.
df_Transactions_Category.write.saveAsTable(tbl_category, format="delta", mode="overwrite")

In [0]:
# Persist the per-customer aggregate as a Delta table, replacing any previous contents.
df_Transactions_customer.write.saveAsTable(tbl_customer, format="delta", mode="overwrite")

In [0]:
%sql
-- Inspect the persisted sales detail table.
SELECT *
FROM dmc_01.gold_sales.ventas_detalle

In [0]:
%sql
-- Inspect the persisted per-customer aggregate table.
SELECT *
FROM dmc_01.gold_sales.transacciones_customer

In [0]:
%sql
-- Inspect the persisted per-category aggregate table.
SELECT *
FROM dmc_01.gold_sales.transacciones_category

In [0]:
%sql
-- Inspect the persisted per-country aggregate table.
SELECT *
FROM dmc_01.gold_sales.transacciones_country

In [0]:
%sql
-- Create the audit_ingestion Delta table; the statement is a no-op when the
-- table already exists.
CREATE TABLE IF NOT EXISTS dmc_01.gold_sales.audit_ingestion (
    audit_id          STRING,
    run_id            STRING,
    source_system     STRING,
    source_path       STRING,
    records_read      BIGINT,
    records_processed BIGINT,
    target_table      STRING,
    status            STRING,
    started_at        TIMESTAMP,
    ended_at          TIMESTAMP,
    duration_ms       BIGINT,
    extra_metadata    STRING
)
USING DELTA
