# 02 — Compute Cost Avoidance & Cost Reduction (curated → facts)

Este cuaderno calcula **Cost Avoidance** (Construction, fase *Before_RtB*)
y **Cost Reduction** (Construction, fase *After_RtB* con PO; y AnyButConstruction por PRQ, 1ª ronda vs. PO),
replicando las reglas del cuaderno funcional y dejando los resultados en tablas *fact* y los agregados globales en *mart*.

**Entradas esperadas:**
- `cur.bid_construction_curated`
- `cur.bid_anybutconstruction_curated`

**Salidas:**
- `fact.cost_avoidance_construction_project`
- `fact.cost_reduction_construction_project`
- `fact.cost_reduction_anybut_prq`
- `mart.kpi_purchasing_totals` (agregados globales para Power BI)

In [None]:

# Notebook parameters.
# CATALOG is an optional prefix for table names; the empty string leaves
# table names unqualified.
CATALOG = ""

# Database (schema) names for the curated, fact and mart layers.
DB_CUR, DB_FACT, DB_MART = "cur", "fact", "mart"

# Curated input tables.
IN_CONSTRUCTION = ".".join((DB_CUR, "bid_construction_curated"))
IN_ANYBUT = ".".join((DB_CUR, "bid_anybutconstruction_curated"))

# Output tables: three per-entity facts plus the global KPI totals.
OUT_CAO = ".".join((DB_FACT, "cost_avoidance_construction_project"))
OUT_CRED_CONS = ".".join((DB_FACT, "cost_reduction_construction_project"))
OUT_CRED_ANY = ".".join((DB_FACT, "cost_reduction_anybut_prq"))
OUT_KPI_TOTALS = ".".join((DB_MART, "kpi_purchasing_totals"))

def qual(tbl):
    """Return *tbl* prefixed with ``CATALOG`` when a catalog is configured.

    When ``CATALOG`` is empty (or otherwise falsy) *tbl* is returned unchanged.
    """
    if not CATALOG:
        return tbl
    return f"{CATALOG}.{tbl}"


## 1) Cargar entradas curadas

In [None]:

from pyspark.sql import functions as F

# Read both curated inputs; qual() adds the catalog prefix when one is set.
df_cons = spark.table(qual(IN_CONSTRUCTION))
df_any = spark.table(qual(IN_ANYBUT))

# Columns each input is required to expose.
needed_cons = [
    "Project ID",
    "Project phase",
    "Price_USD",
    "Purchase Order",
    "RtB Budget",
]
needed_any = [
    "PRQ Code",
    "Price_USD",
    "PO (Y/N)",
    "Round Number",
    "Winner",
]

# Fail fast, reporting every missing column of both inputs in one error.
cons_columns = set(df_cons.columns)
any_columns = set(df_any.columns)
miss_c = [name for name in needed_cons if name not in cons_columns]
miss_a = [name for name in needed_any if name not in any_columns]
if miss_c or miss_a:
    raise ValueError(f"Faltan columnas en curated: construction={miss_c}, anybut={miss_a}")


## 2) Cost Avoidance — Construction · *Before_RtB*

In [None]:

# Keep only the Before_RtB rows; only project id and price are needed below.
df_before = (
    df_cons
    .where(F.col("Project phase") == "Before_RtB")
    .select("Project ID", "Price_USD")
)

# Median and minimum price per project.
# NOTE: percentile_approx yields an approximate percentile, not an exact median.
med = (
    df_before
    .groupBy("Project ID")
    .agg(
        F.expr("percentile_approx(Price_USD, 0.5)").alias("median"),
        F.min("Price_USD").alias("min"),
    )
)

# Cost Avoidance = median - min. The percentage is relative to the median and
# is left null when the median is not strictly positive.
cao = (
    med
    .withColumn("Cost Avoidance", F.col("median") - F.col("min"))
    .withColumn(
        "Cost Avoidance (%)",
        F.when(
            F.col("median") > 0,
            (F.col("Cost Avoidance") / F.col("median")) * 100,
        ).otherwise(None),
    )
)

# Persist the fact table (drop + CTAS through a temp view).
# Database and table names go through qual() and DB_FACT so that the output
# honours the same CATALOG / database parameters used to read the inputs,
# instead of a hard-coded "fact" in the session's default catalog.
cao.createOrReplaceTempView("vw_cao")
spark.sql(f"CREATE DATABASE IF NOT EXISTS {qual(DB_FACT)}")
spark.sql(f"DROP TABLE IF EXISTS {qual(OUT_CAO)}")
spark.sql(f"CREATE TABLE {qual(OUT_CAO)} AS SELECT * FROM vw_cao")

# Global ratio: total avoidance over total median (not the mean of the
# per-project percentages). None when the aggregate yields null.
total_cao_pct = (
    cao
    .agg((F.sum("Cost Avoidance") / F.sum("median") * 100).alias("pct"))
    .collect()[0]["pct"]
)
print(f"Total Cost Avoidance (%): {round(total_cao_pct,2) if total_cao_pct is not None else None}")


## 3) Cost Reduction — Construction · *After_RtB* con PO

In [None]:

# After_RtB rows that carry a purchase order.
df_after = (
    df_cons
    .where((F.col("Project phase") == "After_RtB") & F.col("Purchase Order").isNotNull())
    .select("Project ID", "Price_USD", "RtB Budget")
)

# If a project has several prices after RtB, the contracted price is taken
# (a non-null Purchase Order is assumed to mark it). If several such rows
# exist, the minimum Price_USD is kept, as the conservative choice.
# NOTE(review): F.first picks the first non-null "RtB Budget" it encounters and
# row order inside a group is not guaranteed; this is only stable if the budget
# is constant within a project - confirm against the curated data.
cr_cons = (
    df_after
    .groupBy("Project ID")
    .agg(
        F.first("RtB Budget", ignorenulls=True).alias("RtB Budget"),
        F.min("Price_USD").alias("PO Price"),
    )
)

# Cost Reduction = budget - PO price. The percentage is relative to the budget
# and is left null when the budget is not strictly positive.
cr_cons = (
    cr_cons
    .withColumn("Cost Reduction", F.col("RtB Budget") - F.col("PO Price"))
    .withColumn(
        "Cost Reduction (%)",
        F.when(
            F.col("RtB Budget") > 0,
            (F.col("Cost Reduction") / F.col("RtB Budget")) * 100,
        ).otherwise(None),
    )
)

# Persist the fact table (drop + CTAS through a temp view).
# Names go through qual() / DB_FACT so the output follows the configured
# catalog and database, and the database is created here so this cell does not
# depend on another cell having created it first.
cr_cons.createOrReplaceTempView("vw_cr_cons")
spark.sql(f"CREATE DATABASE IF NOT EXISTS {qual(DB_FACT)}")
spark.sql(f"DROP TABLE IF EXISTS {qual(OUT_CRED_CONS)}")
spark.sql(f"CREATE TABLE {qual(OUT_CRED_CONS)} AS SELECT * FROM vw_cr_cons")

# Global ratio: total reduction over total budget. None when the aggregate
# yields null.
total_cr_cons_pct = (
    cr_cons
    .agg((F.sum("Cost Reduction") / F.sum("RtB Budget") * 100).alias("pct"))
    .collect()[0]["pct"]
)
print(f"Total Cost Reduction Construction (%): {round(total_cr_cons_pct,2) if total_cr_cons_pct is not None else None}")


## 4) Cost Reduction — AnyButConstruction · por PRQ (1ª ronda vs PO)

In [None]:

# Lowest first-round price per PRQ.
df_any_round1 = (
    df_any
    .where(F.col("Round Number") == 1)
    .groupBy("PRQ Code")
    .agg(F.min("Price_USD").alias("Lowest Price 1round"))
)

# PO price per PRQ: minimum price among rows flagged as having a PO.
# NOTE(review): the filter matches the literal "Yes" although the column is
# named "PO (Y/N)" - confirm the curated encoding is not "Y".
df_any_po = (
    df_any
    .where(F.col("PO (Y/N)") == "Yes")
    .select("PRQ Code", "Price_USD")
    .groupBy("PRQ Code")
    .agg(F.min("Price_USD").alias("PO Price"))
)

# Left join keeps every PRQ that has a first round; PRQs without a PO row get
# a null "PO Price" and therefore a null Cost Reduction.
# The percentage is relative to the first-round lowest price and is left null
# when that price is not strictly positive.
cr_any = (
    df_any_round1
    .join(df_any_po, on="PRQ Code", how="left")
    .withColumn("Cost Reduction", F.col("Lowest Price 1round") - F.col("PO Price"))
    .withColumn(
        "Cost Reduction (%)",
        F.when(
            F.col("Lowest Price 1round") > 0,
            (F.col("Cost Reduction") / F.col("Lowest Price 1round")) * 100,
        ).otherwise(None),
    )
)

# Persist the fact table (drop + CTAS through a temp view).
# Database and table names go through qual() and DB_FACT so that the output
# honours the same CATALOG / database parameters used to read the inputs,
# instead of a hard-coded "fact" in the session's default catalog.
cr_any.createOrReplaceTempView("vw_cr_any")
spark.sql(f"CREATE DATABASE IF NOT EXISTS {qual(DB_FACT)}")
spark.sql(f"DROP TABLE IF EXISTS {qual(OUT_CRED_ANY)}")
spark.sql(f"CREATE TABLE {qual(OUT_CRED_ANY)} AS SELECT * FROM vw_cr_any")

# Global ratio: total reduction over total PO price.
# NOTE(review): the per-PRQ percentage above divides by "Lowest Price 1round",
# while this total divides by "PO Price"; kept as-is - confirm which
# denominator the functional rule intends.
total_cr_any_pct = (
    cr_any
    .agg((F.sum("Cost Reduction") / F.sum("PO Price") * 100).alias("pct"))
    .collect()[0]["pct"]
)
print(f"Total Cost Reduction AnyBut (%): {round(total_cr_any_pct,2) if total_cr_any_pct is not None else None}")


## 5) Agregados para Power BI (mart.kpi_purchasing_totals)

In [None]:

from pyspark.sql import functions as F

# Collapse each fact DataFrame into a single row of sums, then place the three
# one-row results side by side with cross joins (1 x 1 x 1 = one output row).
totals = (
    cao.agg(
        F.sum("Cost Avoidance").alias("sum_cao"),
        F.sum("median").alias("sum_median"),
    )
    .crossJoin(
        cr_cons.agg(
            F.sum("Cost Reduction").alias("sum_cr_cons"),
            F.sum("RtB Budget").alias("sum_budget"),
        )
    )
    .crossJoin(
        cr_any.agg(
            F.sum("Cost Reduction").alias("sum_cr_any"),
            F.sum("PO Price").alias("sum_po"),
        )
    )
)

# Global percentages; each is left null when its denominator is not strictly
# positive (F.when without otherwise yields null).
# NOTE(review): "Total CR AnyBut (%)" divides by the summed PO price, whereas
# the per-PRQ percentage in cr_any divides by the first-round lowest price -
# confirm this is the intended rule.
totals = (
    totals
    .withColumn(
        "Total CAO (%)",
        F.when(F.col("sum_median") > 0, F.col("sum_cao") / F.col("sum_median") * 100),
    )
    .withColumn(
        "Total CR Construction (%)",
        F.when(F.col("sum_budget") > 0, F.col("sum_cr_cons") / F.col("sum_budget") * 100),
    )
    .withColumn(
        "Total CR AnyBut (%)",
        F.when(F.col("sum_po") > 0, F.col("sum_cr_any") / F.col("sum_po") * 100),
    )
)

totals.createOrReplaceTempView("vw_kpi_totals")

# Persist the KPI table (drop + CTAS through a temp view).
# Database and table names go through qual() and DB_MART so that the output
# honours the configured CATALOG / database parameters instead of a hard-coded
# "mart" in the session's default catalog.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {qual(DB_MART)}")
spark.sql(f"DROP TABLE IF EXISTS {qual(OUT_KPI_TOTALS)}")
spark.sql(f"CREATE TABLE {qual(OUT_KPI_TOTALS)} AS SELECT * FROM vw_kpi_totals")

print("Facts & KPIs written:",
      OUT_CAO, OUT_CRED_CONS, OUT_CRED_ANY, OUT_KPI_TOTALS)
