In [31]:
# Gold layer, step 1: date dimension (silver_sales -> dim_date).
from pyspark.sql.functions import (
    col, year, month, weekofyear, date_format, dayofweek
)

# Base of the dimension: the sales dates with duplicates removed,
# exposed under the column name "date".
sales_dates = spark.read.table("silver_sales").select(col("sales_date").alias("date"))
df_dates = sales_dates.distinct()

# Calendar attributes for analytical modelling.
# The column order below is the schema of the written table.
date_col = col("date")
df_dim_date = df_dates.select(
    date_col,
    # Integer key in yyyyMMdd form (e.g. 20111125).
    date_format(date_col, "yyyyMMdd").cast("int").alias("date_key"),
    year(date_col).alias("year"),
    month(date_col).alias("month"),
    # Full month name.
    date_format(date_col, "MMMM").alias("month_name"),
    # Spark's weekofyear follows ISO 8601 week numbering, so around a year
    # boundary it may not line up with the calendar "year" column.
    weekofyear(date_col).alias("week_of_year"),
    # Spark's dayofweek is 1 = Sunday ... 7 = Saturday.
    dayofweek(date_col).alias("day_of_week"),
)

# Persist the dimension as a Delta table, replacing any previous contents.
df_dim_date.write.format("delta").mode("overwrite").saveAsTable("dim_date")

# Validation: read the table back, then show a sample and its schema.
dim_date_check = spark.read.table("dim_date")
dim_date_check.show(10, truncate=False)
dim_date_check.printSchema()




StatementMeta(, 793c2255-ce12-40f3-a069-e5fbbbd49603, 33, Finished, Available, Finished)

In [None]:
# Store dimension. Source: silver_stores. Target: dim_store.

# Columns carried over from Silver without further transformation.
store_columns = ["store_id", "store_type", "store_size"]

# Read the Silver stores table and keep only the dimension columns.
df_dim_store = spark.read.table("silver_stores").select(*store_columns)

# Persist the dimension as a Delta table, replacing any previous contents.
df_dim_store.write.format("delta").mode("overwrite").saveAsTable("dim_store")

# Validation: read the table back, then show a sample and its schema.
dim_store_check = spark.read.table("dim_store")
dim_store_check.show(10, truncate=False)
dim_store_check.printSchema()

In [None]:
# Department dimension. Source: silver_sales. Target: dim_department.
from pyspark.sql.functions import col

# One row per distinct department_id found in the sales data.
sales_silver = spark.read.table("silver_sales")
df_dim_department = sales_silver.select(col("department_id")).distinct()

# Persist the dimension as a Delta table, replacing any previous contents.
df_dim_department.write.format("delta").mode("overwrite").saveAsTable("dim_department")

# Validation: read the table back, then show a sample and its schema.
dim_department_check = spark.read.table("dim_department")
dim_department_check.show(10)
dim_department_check.printSchema()

In [3]:
# Build fact_weekly_sales: the analysis table combining weekly sales,
# exogenous variables and the keys that point at the dimensions.
# Sources: silver_sales and silver_features (LEFT JOIN).
# date_key is computed here from sales_date (yyyyMMdd cast to int);
# the dim_date table itself is not read in this cell.
# Intended grain: store x department x week.
# NOTE(review): that grain only holds if silver_features has at most one row
# per (store_id, feature_date) -- confirm upstream.
from pyspark.sql.functions import date_format, col

# Read the Silver inputs.
df_sales = spark.read.table("silver_sales")
df_features = spark.read.table("silver_features")

# Sales + features, matched on store and date.
# The left join is intentional: not every sale has a feature row, and the
# fact table must not lose sales.
sales_to_features = (
    (col("s.store_id") == col("f.store_id"))
    & (col("s.sales_date") == col("f.feature_date"))
)
df_fact_base = df_sales.alias("s").join(
    df_features.alias("f"),
    on=sales_to_features,
    how="left",
)

# Add the integer date key (yyyyMMdd) derived from the sales date.
date_key_expr = date_format(col("s.sales_date"), "yyyyMMdd").cast("int")
df_fact = df_fact_base.withColumn("date_key", date_key_expr)

# Final projection of the fact, in output order:
# dimension keys, then sales measures, then the exogenous feature columns.
sales_measures = ["weekly_sales", "is_holiday"]
feature_measures = [
    "temperature",
    "fuel_price",
    "markdown_1",
    "markdown_2",
    "markdown_3",
    "markdown_4",
    "markdown_5",
    "cpi",
    "unemployment",
]
df_fact_final = df_fact.select(
    col("s.store_id"),
    col("s.department_id"),
    col("date_key"),
    *[col(f"s.{name}") for name in sales_measures],
    *[col(f"f.{name}") for name in feature_measures],
)

# Persist the fact as a Delta table, replacing any previous contents.
df_fact_final.write.format("delta").mode("overwrite").saveAsTable("fact_weekly_sales")

# Validation: read the table back, then show a sample and its schema.
fact_check = spark.read.table("fact_weekly_sales")
fact_check.show(10, truncate=False)
fact_check.printSchema()

StatementMeta(, 07117458-5f6a-48b2-a47d-85bcef4372c2, 5, Finished, Available, Finished)

+--------+-------------+--------+------------+----------+-----------+----------+----------+----------+----------+----------+----------+-------+------------+
|store_id|department_id|date_key|weekly_sales|is_holiday|temperature|fuel_price|markdown_1|markdown_2|markdown_3|markdown_4|markdown_5|cpi    |unemployment|
+--------+-------------+--------+------------+----------+-----------+----------+----------+----------+----------+----------+----------+-------+------------+
|1       |92           |20111125|186516.35   |true      |60.14      |3.236     |410.31    |98.00     |55805.51  |8.00      |554.92    |218.468|7.87        |
|2       |92           |20111125|238066.17   |true      |56.36      |3.236     |919.71    |62.00     |77451.26  |23.00     |1589.43   |218.113|7.44        |
|3       |92           |20111125|9144.72     |true      |68.00      |3.236     |41.74     |10.47     |37612.74  |NULL      |135.16    |221.901|7.20        |
|4       |92           |20111125|231334.61   |true      |4