# 01 — Curate currency & domains (staging → curated)

Este cuaderno carga datos desde *staging* (Construction y AnyButConstruction),
aplica conversión de divisa a USD con una tabla de tipos de cambio,
homologa campos y ejecuta validaciones de integridad. Deja como salida
tablas **curadas** en el *database* de destino.

**Entradas esperadas (tablas o vistas de Spark / SQL):**
- `stg_sap.bid_construction_database`
- `stg_sap.bid_project_database` (AnyButConstruction)
- `ref.exchange_rates_atlantica_2024_mod`

**Salidas (tablas Spark / SQL):**
- `cur.bid_construction_curated` (preparada y en USD)
- `cur.bid_anybutconstruction_curated` (preparada y en USD)
- `qa.curate_integrity_issues` (hallazgos de calidad)

In [None]:

# Parameters
# Optional catalog prefix; qual() returns table names unchanged while this is empty.
CATALOG = ""

# Database (schema) names used to build the table identifiers below.
DB_STAGING, DB_REF, DB_CUR, DB_QA = "stg_sap", "ref", "cur", "qa"

# Input tables: two staging datasets plus the exchange-rate reference table.
TABLE_CONSTRUCTION = DB_STAGING + ".bid_construction_database"
TABLE_ANYBUT = DB_STAGING + ".bid_project_database"
TABLE_EXCH = DB_REF + ".exchange_rates_atlantica_2024_mod"

# Output tables: the two curated datasets and the integrity-findings table.
OUT_CONSTRUCTION = DB_CUR + ".bid_construction_curated"
OUT_ANYBUT = DB_CUR + ".bid_anybutconstruction_curated"
OUT_QA = DB_QA + ".curate_integrity_issues"


def qual(tbl):
    """Return ``tbl`` prefixed with ``CATALOG`` and a dot.

    When ``CATALOG`` is empty (falsy) the name is returned unchanged.
    """
    if not CATALOG:
        return tbl
    return "{}.{}".format(CATALOG, tbl)


## 1) Cargar tablas de *staging* y tipos de cambio

In [None]:

def _read_table(name):
    """Open the table ``name`` (catalog-qualified through qual) as a DataFrame."""
    return spark.table(qual(name))


# Two staging datasets followed by the exchange-rate reference table.
df_construction = _read_table(TABLE_CONSTRUCTION)
df_anybut = _read_table(TABLE_ANYBUT)
df_exch = _read_table(TABLE_EXCH)

# Verify that each staging table exposes the minimum columns needed by the later steps

from pyspark.sql.functions import col

# Columns that must be present in the Construction staging table.
required_cols_construction = [
    "Project ID",
    "Project phase",
    "Bid Date",
    "Bid Number",
    "Supplier Name",
    "Material Category",
    "Currency",
    "Price",
    "Purchase Order",
    "RtB Budget",
]
# Columns that must be present in the AnyButConstruction staging table.
required_cols_anybut = [
    "PRQ Code",
    "Bid Date",
    "Bid Number",
    "Supplier Name",
    "Material Category",
    "Currency",
    "Price",
    "PO (Y/N)",
    "Round Number",
    "Winner",
]


def _absent(required, available):
    """Return the entries of ``required`` not found in ``available``, in order."""
    return [name for name in required if name not in available]


missing_c = _absent(required_cols_construction, df_construction.columns)
missing_a = _absent(required_cols_anybut, df_anybut.columns)

# Stop early, reporting the missing columns of both tables in a single error.
if any((missing_c, missing_a)):
    raise ValueError(f"Faltan columnas: construction={missing_c}, anybut={missing_a}")


## 2) Diccionario de tipos de cambio → USD

In [None]:

from pyspark.sql.functions import collect_list, struct

# df_exch must expose the two columns read below: Currency, AvgRate
if not {"Currency", "AvgRate"}.issubset(df_exch.columns):
    raise ValueError("La tabla de tipos de cambio debe tener columnas: Currency, AvgRate")

# Build a {currency: rate} lookup on the driver; the whole table is collected,
# so this presumes the reference table is small.
pairs = df_exch.select("Currency", "AvgRate").collect()

# Rows with a null currency or a null rate are skipped: float(None) would raise
# TypeError and abort the cell, and a None key is meaningless for the lookup.
# A currency left without an entry is reported as "MISSING_RATE" by usd_flag.
# NOTE(review): if a currency appears on several rows, the last collected row wins.
exch_dict = {
    r["Currency"]: float(r["AvgRate"])
    for r in pairs
    if r["Currency"] is not None and r["AvgRate"] is not None
}


## 3) Conversión a USD y estandarización de dominios

In [None]:

from pyspark.sql.functions import udf, lit
from pyspark.sql.types import DoubleType, StringType

@udf(DoubleType())
def to_usd(price, currency):
    """Multiply ``price`` by the ``exch_dict`` rate registered for ``currency``.

    Returns None when either argument is None or when ``currency`` has no
    entry in ``exch_dict``; otherwise returns the product as a float.
    """
    if price is None or currency is None:
        return None
    rate = exch_dict.get(currency)
    if rate is None:
        return None
    return float(price) * float(rate)

@udf(StringType())
def usd_flag(currency):
    """Return "OK" when ``currency`` is a key of ``exch_dict``, else "MISSING_RATE"."""
    if currency in exch_dict:
        return "OK"
    return "MISSING_RATE"

def _add_usd_columns(df):
    """Return ``df`` with the Price_USD and USD_Rate_Status columns set.

    Price_USD comes from to_usd(Price, Currency) and USD_Rate_Status from
    usd_flag(Currency).
    """
    with_price = df.withColumn("Price_USD", to_usd(col("Price"), col("Currency")))
    return with_price.withColumn("USD_Rate_Status", usd_flag(col("Currency")))


# Both staging datasets receive exactly the same two derived columns.
df_construction_cur = _add_usd_columns(df_construction)
df_anybut_cur = _add_usd_columns(df_anybut)


## 4) Validaciones de integridad (calidad)

In [None]:

from pyspark.sql import functions as F

# 4.1 A project that has an 'After_RtB' phase must also have a 'Before_RtB' phase
phases = df_construction_cur.select("Project ID", "Project phase").dropDuplicates()
proj_phase = phases.groupBy("Project ID").agg(
    F.collect_set("Project phase").alias("phases")
)
after_without_before = proj_phase.where(
    F.array_contains("phases", "After_RtB")
    & ~F.array_contains("phases", "Before_RtB")
)

# 4.2 Exactly one Material Category per Project ID
matcat = df_construction_cur.select("Project ID", "Material Category").dropDuplicates()
dup_matcat = (matcat.groupBy("Project ID")
              .agg(F.countDistinct("Material Category").alias("n"))
              .where("n>1"))

# 4.3 Exactly one RtB Budget per Project ID (After_RtB rows with a non-null budget)
budget = (df_construction_cur
          .where((F.col("Project phase") == "After_RtB") & F.col("RtB Budget").isNotNull())
          .select("Project ID", "RtB Budget").dropDuplicates())
dup_budget = (budget.groupBy("Project ID")
              .agg(F.countDistinct("RtB Budget").alias("n"))
              .where("n>1"))

# Label every check's result. withColumn requires a Column, so the label is
# wrapped in F.lit(); passing the bare string raises as soon as a check is added.
# All three frames are always kept (an empty frame contributes no rows), which
# avoids one extra Spark job per check for count() and keeps the output schema
# identical from run to run.
issues = [
    after_without_before.withColumn("issue", F.lit("PHASE_AFTER_WITHOUT_BEFORE")),
    dup_matcat.withColumn("issue", F.lit("MULTIPLE_MATCAT_PER_PROJECT")),
    dup_budget.withColumn("issue", F.lit("MULTIPLE_RTB_BUDGET_PER_PROJECT")),
]

# Union by name; columns a check does not produce ("phases" or "n") are filled
# with nulls. The final select pins the column order.
df_issues = issues[0]
for extra in issues[1:]:
    df_issues = df_issues.unionByName(extra, allowMissingColumns=True)
df_issues = df_issues.select("Project ID", "phases", "n", "issue")

df_issues.createOrReplaceTempView("vw_issues")
spark.sql(f"CREATE DATABASE IF NOT EXISTS {qual(DB_QA)}")

# Replace the previous findings in a single overwrite instead of
# CREATE-IF-MISSING + TRUNCATE + append. "overwriteSchema" lets Delta tables
# created with an older column layout be replaced.
# NOTE(review): assumes non-Delta formats ignore this option - confirm for the
# target table format.
(df_issues.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(qual(OUT_QA)))


## 5) Guardar tablas curadas

In [None]:

# Inputs are read through qual(); the outputs go through it as well so that a
# non-empty CATALOG applies to both sides. With CATALOG == "" the names are
# exactly the unqualified ones.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {qual(DB_CUR)}")

out_construction_name = qual(OUT_CONSTRUCTION)
out_anybut_name = qual(OUT_ANYBUT)

# Each run fully replaces the previous contents of the curated tables.
df_construction_cur.write.mode("overwrite").saveAsTable(out_construction_name)
df_anybut_cur.write.mode("overwrite").saveAsTable(out_anybut_name)

print("Curated tables written:", out_construction_name, out_anybut_name)
print("QA issues table:", OUT_QA)
