In [1]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window

StatementMeta(, c41e1fe6-e010-4bb7-bbdc-7d6e4a961f8b, 3, Finished, Available, Finished)

In [2]:
# Reset the Gold layer: drop every table this notebook rebuilds so the
# overwrite writes further down start from a clean slate (no stale schema).
# gold.dim_campaigns was missing from the original list even though the
# notebook writes it, so it is included here.
GOLD_TABLES = [
    "gold.dim_region",
    "gold.dim_brokers",
    "gold.dim_clients",
    "gold.dim_projects",
    "gold.dim_properties",
    "gold.dim_campaigns",
    "gold.dim_date",
    "gold.fact_sales",
    "gold.fact_leads",
    "gold.sales_performance",
    "gold.lead_conversion",
]

for gold_table in GOLD_TABLES:
    spark.sql(f"DROP TABLE IF EXISTS {gold_table}")

StatementMeta(, c41e1fe6-e010-4bb7-bbdc-7d6e4a961f8b, 4, Finished, Available, Finished)

DataFrame[]

In [5]:
# ======================================================
# Dimension Tables (Gold)
# ======================================================

# Build dim_region first: the distinct region names found across the Silver
# brokers, clients and projects tables (UNION already de-duplicates).
df_regions = spark.sql("""
    SELECT DISTINCT region AS region_name FROM silver.brokers
    UNION
    SELECT DISTINCT region AS region_name FROM silver.clients
    UNION
    SELECT DISTINCT region AS region_name FROM silver.projects
""")

# Surrogate key: row_number() over the sorted region names instead of
# monotonically_increasing_id(). The latter depends on partition ids, and this
# DataFrame is lazy, so it is re-evaluated for every dimension join and again
# for the final write. With partition-dependent ids, the region_id values
# stored in gold.dim_region could differ from the ones copied into the
# broker/client/project foreign keys. Ordering by the (distinct) name makes the
# id a pure function of the data. The un-partitioned window pulls all rows into
# one partition, which is acceptable for a small region list.
df_dim_region = (
    df_regions
    .filter(F.col("region_name").isNotNull())  # a null name cannot be a dimension member
    .withColumn(
        "region_id",
        # cast keeps the bigint column type the previous implementation produced
        F.row_number().over(Window.orderBy("region_name")).cast("long"),
    )
    .select("region_id", "region_name")
)

# Dim Brokers: Silver brokers with the region name resolved to its dim_region key.
df_dim_brokers = (
    spark.table("silver.brokers")
    .join(
        df_dim_region,
        on=F.col("region") == F.col("region_name"),
        how="left",  # brokers without a matching region are kept with a null key
    )
    .select(
        "broker_id",
        "broker_name",
        F.col("region_id").alias("broker_region_id"),  # foreign key into dim_region
        F.col("email").alias("broker_email"),
        "dominio",
    )
    .dropDuplicates()
)

# Dim Clients: Silver clients with the region name resolved to its dim_region key.
df_dim_clients = (
    spark.table("silver.clients")
    .join(
        df_dim_region,
        on=F.col("region") == F.col("region_name"),
        how="left",  # clients without a matching region are kept with a null key
    )
    .select(
        "client_id",
        F.col("full_name").alias("client_name"),
        F.col("email").alias("client_email"),
        F.col("region_id").alias("client_region_id"),  # foreign key into dim_region
    )
    .dropDuplicates()
)

# Dim Projects: Silver projects with the region name resolved to its dim_region key.
df_dim_projects = (
    spark.table("silver.projects")
    .join(
        df_dim_region,
        on=F.col("region") == F.col("region_name"),
        how="left",  # projects without a matching region are kept with a null key
    )
    .select(
        "project_id",
        "project_name",
        "city",
        F.col("region_id").alias("project_region_id"),  # foreign key into dim_region
        "launch_year",
        "status",
    )
    .dropDuplicates()
)

# Dim Properties: de-duplicated projection of the Silver properties table.
df_dim_properties = (
    spark.table("silver.properties")
    .select(
        "propertie_id",
        "propertie_type",
        "size_m2",
        "bedrooms",
        "bathrooms",
        "list_price_usd",
        "price_per_m2",
        "availability_status",
    )
    .dropDuplicates()
)

# Dim Campaigns: de-duplicated projection of the Silver campaigns table.
df_dim_campaigns = (
    spark.table("silver.campaigns")
    .select(
        "campaign_id",
        "campaign_name",
        "channel",
        "start_date",
        "end_date",
        "budget",
        # The next two are calculated fields that already exist in Silver
        # (per the original author's note); they are passed through unchanged.
        "campaigns_duration_months",
        "budget_per_month",
    )
    .dropDuplicates()
)

# Dim Date: one row per distinct date that appears as a sale date, a lead
# date, or a campaign start/end date in Silver.
df_dates = spark.sql("""
    select distinct sale_date as date from silver.sales
    union
    select distinct lead_date as date from silver.leads
    union
    select distinct start_date as date from silver.campaigns
    union
    select distinct end_date as date from silver.campaigns where end_date is not null
""")

df_dim_date = (
    df_dates
    # The SQL above only guards end_date; a null sale/lead/start date would
    # otherwise become a null date_id row in the dimension.
    .filter(F.col("date").isNotNull())
    .select(
        F.col("date").alias("date_id"),
        F.col("date"),
        F.year("date").alias("year"),
        F.quarter("date").alias("quarter"),
        F.month("date").alias("month"),
        F.dayofmonth("date").alias("day"),
        F.date_format("date", "MMMM").alias("month_name"),
        F.dayofweek("date").alias("day_of_week"),
        F.date_format("date", "EEEE").alias("day_name"),
        # Previously `quarter - 1`, which produced 0 for calendar Q1.
        # Wrapped so Q2..Q4 still map to 1..3 and Q1 becomes 4.
        ((F.quarter("date") + 2) % 4 + 1).alias("fiscal_quarter"),
        # Previously `month + 2`, which produced 13 and 14 for Nov/Dec.
        # Wrapped so Jan..Oct still map to 3..12 and Nov/Dec become 1/2.
        # NOTE(review): the quarter offset implies a fiscal year starting in
        # April while the month offset implies one starting in November --
        # confirm the intended fiscal calendar and align the two.
        ((F.month("date") + 1) % 12 + 1).alias("fiscal_month"),
    )
    .distinct()
)


StatementMeta(, c41e1fe6-e010-4bb7-bbdc-7d6e4a961f8b, 7, Finished, Available, Finished)

In [6]:
# ======================================================
# Fact Tables (Gold)
# ======================================================

# Fact Sales: one row per Silver sale, enriched with price metrics derived
# from the matching property. The join is an inner join on propertie_id, so a
# sale whose property is absent from silver.properties is not carried over.
df_fact_sales = (
    spark.table("silver.sales")
    .join(spark.table("silver.properties"), "propertie_id")
    .select(
        F.col("sale_id"),
        F.col("propertie_id"),
        F.col("client_id"),
        F.col("broker_id"),
        F.col("sale_date").alias("date_id"),  # key into dim_date
        F.col("sale_price_usd"),
        (F.col("sale_price_usd") - F.col("list_price_usd")).alias("price_difference"),
        # Guard the division: with ANSI SQL mode enabled a zero size_m2 raises
        # a divide-by-zero error and fails the whole job. The guard yields
        # NULL instead, which is what non-ANSI mode already returned.
        F.when(
            F.col("size_m2") != 0,
            F.round(F.col("sale_price_usd") / F.col("size_m2"), 2),
        ).alias("sale_price_per_m2"),
        F.lit("USD").alias("currency"),
    )
)

# Fact Leads: projection of the Silver leads table, with lead_date exposed as
# the date_id key.
df_fact_leads = (
    spark.table("silver.leads")
    .select(
        "lead_id",
        "client_id",
        "propertie_id",
        "campaign_id",
        F.col("lead_date").alias("date_id"),
        "lead_source",
    )
)



StatementMeta(, c41e1fe6-e010-4bb7-bbdc-7d6e4a961f8b, 8, Finished, Available, Finished)

In [7]:
# ======================================================
# Business Metrics (Gold)
# ======================================================

# Sales Performance: per broker and sale date, the number of sales, the summed
# sale price and the mean gap between sale price and list price.
sales_performance_metrics = [
    F.count("*").alias("total_sales"),
    F.sum("sale_price_usd").alias("total_revenue"),
    F.avg("price_difference").alias("avg_price_difference"),
]

df_sales_performance = (
    df_fact_sales
    .groupBy(["broker_id", "date_id"])
    .agg(*sales_performance_metrics)
)

# Lead Conversion: per lead source and campaign, how many lead rows exist and
# how many of them line up with a sale for the same client and property.
# NOTE(review): counts are taken over joined rows, so a lead matching more
# than one sale row would be counted more than once -- confirm that
# (client_id, propertie_id) is unique in fact_sales.
lead_matches_sale = (
    (df_fact_leads.client_id == df_fact_sales.client_id)
    & (df_fact_leads.propertie_id == df_fact_sales.propertie_id)
)

# Left join keeps unconverted leads; their sale_id is null and is therefore
# skipped by count(sale_id).
leads_with_sales = df_fact_leads.join(df_fact_sales, lead_matches_sale, "left")

lead_rows = F.count("*")
converted_rows = F.count(df_fact_sales.sale_id)

df_lead_conversion = (
    leads_with_sales
    .groupBy("lead_source", "campaign_id")
    .agg(
        lead_rows.alias("total_leads"),
        converted_rows.alias("converted_leads"),
        (converted_rows / lead_rows).alias("conversion_rate"),
    )
)



StatementMeta(, c41e1fe6-e010-4bb7-bbdc-7d6e4a961f8b, 9, Finished, Available, Finished)

In [8]:
# ======================================================
# Save to the Gold layer
# ======================================================

# Persist every Gold DataFrame as a Delta table, replacing any existing
# content. The write order is: dimensions, then facts, then business metrics.
gold_outputs = [
    # Dimension tables
    (df_dim_region, "gold.dim_region"),
    (df_dim_brokers, "gold.dim_brokers"),
    (df_dim_clients, "gold.dim_clients"),
    (df_dim_properties, "gold.dim_properties"),
    (df_dim_projects, "gold.dim_projects"),
    (df_dim_campaigns, "gold.dim_campaigns"),
    (df_dim_date, "gold.dim_date"),
    # Fact tables
    (df_fact_sales, "gold.fact_sales"),
    (df_fact_leads, "gold.fact_leads"),
    # Business metrics
    (df_sales_performance, "gold.sales_performance"),
    (df_lead_conversion, "gold.lead_conversion"),
]

for gold_df, gold_table_name in gold_outputs:
    gold_df.write.format("delta").mode("overwrite").saveAsTable(gold_table_name)

StatementMeta(, c41e1fe6-e010-4bb7-bbdc-7d6e4a961f8b, 10, Finished, Available, Finished)