In [0]:
from pyspark.sql.window import Window
from pyspark.sql import Row
from pyspark.sql.functions import current_date, to_date, col, when, lit
import pyspark.sql.functions as F
import pyspark.sql.types as T
from datetime import datetime, timedelta
import datetime as dt
import json
import sys

# ---------------------------------------------------------------------------
# Notebook parameters.
# Each parameter is declared as a text widget (name, default, label), read
# back immediately and echoed to the cell output.
# ---------------------------------------------------------------------------

# Drop any previously defined widgets before re-declaring them.
dbutils.widgets.removeAll()

dbutils.widgets.text("brewdat_library_version", "v1.1.5", "01 - brewdat_library_version")
brewdat_library_version = dbutils.widgets.get("brewdat_library_version")
print(f"{brewdat_library_version = }")

dbutils.widgets.text("target_database", "gld_maz_logistics_warehouse", "02 - target_database")
target_database = dbutils.widgets.get("target_database")
print(f"{target_database = }")

dbutils.widgets.text("target_table", "maz_copec_marc_lvc", "03 - target_table")
target_table = dbutils.widgets.get("target_table")
print(f"{target_table = }")

dbutils.widgets.text("target_zone", "maz", "04 - target_zone")
target_zone = dbutils.widgets.get("target_zone")
print(f"{target_zone = }")

dbutils.widgets.text("target_business_domain", "logistics", "05 - target_business_domain")
target_business_domain = dbutils.widgets.get("target_business_domain")
print(f"{target_business_domain = }")

dbutils.widgets.text("target_subzone", "copecac", "06 - target_subzone")
target_subzone = dbutils.widgets.get("target_subzone")
print(f"{target_subzone = }")

# The widget is named "target_data_product" but the value is kept in
# `target_product`; both names are preserved as they were.
dbutils.widgets.text("target_data_product", "item", "07 - target_product")
target_product = dbutils.widgets.get("target_data_product")
print(f"{target_product = }")

# Default interval is yesterday -> today. The current date is evaluated once
# so that both bounds come from the same instant, even if the cell happens to
# run across midnight.
_run_date = dt.datetime.today().date()
date_filter_end = _run_date
date_filter_start = _run_date - dt.timedelta(days=1)
print(f"date_filter_end = '{date_filter_end}'")
print(f"date_filter_start = '{date_filter_start}'")

dbutils.widgets.text("data_interval_end", str(date_filter_end), "09 - data_interval_end")
data_interval_end = dbutils.widgets.get("data_interval_end")
print(f"data_interval_end = '{data_interval_end}'")

dbutils.widgets.text("data_interval_start", str(date_filter_start), "08 - data_interval_start")
data_interval_start = dbutils.widgets.get("data_interval_start")
print(f"data_interval_start = '{data_interval_start}'")

# Date pattern later passed to F.date_format when the partition value is built.
dbutils.widgets.text("partition_date_format", "yyyy", "10 - partition_date_format")
partition_date_format = dbutils.widgets.get("partition_date_format")
print(f"partition_date_format = '{partition_date_format}'")

# The widget is named "partition_columns" (plural); the value is stored in
# `partition_column` (singular). Both names are preserved as they were.
dbutils.widgets.text("partition_columns", "stock_insertion_date", "11 - partition_column")
partition_column = dbutils.widgets.get("partition_columns")
print(f"partition_columns = '{partition_column}'")

# Free-text parameters with empty defaults.
# NOTE(review): neither value is referenced by the active code visible in
# this notebook - confirm whether they are still needed.
dbutils.widgets.text("mchb", "", "12 - mchb")
mchb = dbutils.widgets.get("mchb")
print(f"{mchb = }")

dbutils.widgets.text("mard", "", "13 - mard")
mard = dbutils.widgets.get("mard")
print(f"{mard = }")

# Fixed (non-widget) setting.
delta_column_name = 'target_apply_dt'
print(f"{delta_column_name = }")

In [0]:
# Make the requested version of the brewdat library importable, then load the
# helper modules used by the rest of the notebook.
brewdat_library_path = f"/Workspace/Repos/brewdat_library/{brewdat_library_version}"
sys.path.append(brewdat_library_path)

from brewdat.data_engineering import (
    common_utils,
    lakehouse_utils,
    transform_utils,
    write_utils,
)

# Hand the notebook's dbutils object over to the library.
common_utils.set_global_dbutils(dbutils)

In [0]:
%run "../set_project_context"

In [0]:
import os

# The deployment stage of the workspace is read from the ENVIRONMENT variable.
environment = os.getenv("ENVIRONMENT")
if environment not in ["dev", "qa", "prod"]:
    raise Exception(
        "This Databricks Workspace does not have necessary environment variables."
        " Contact the admin team to set up the global init script and restart your cluster."
    )

# Source catalog name follows the pattern brewdat_uc_maz_<environment>;
# `environment` is guaranteed to be dev/qa/prod by the check above.
src_uc = f"brewdat_uc_maz_{environment}"

# Source schemas depend on the subzone being processed.
if target_subzone == 'copecac':
    schema_md = 'slv_maz_masterdata_sap_pr3'
    schema_sp = 'slv_maz_supply_sap_pr3'
    schema_log = 'brz_maz_logistics_sap_pr3'
elif target_subzone == 'mx':
    schema_md = 'slv_maz_masterdata_sap_pr0'
    schema_sp = 'slv_maz_supply_sap_pr0'
    # No logistics schema is configured for mx. Define the name explicitly so
    # that a later reference does not fail with an unrelated NameError.
    # NOTE(review): supply the real schema if mx ever needs it.
    schema_log = None
else:
    # Fail here with a clear message instead of a NameError on the print
    # below (or further down the notebook).
    raise ValueError(
        f"Unsupported target_subzone {target_subzone!r}; expected 'copecac' or 'mx'."
    )


print(f"{schema_md=}\n{schema_sp=}\n{src_uc=}")

In [0]:
# Configure service-principal access to the silver/gold storage account.
# All values are names already present in the notebook namespace.
adls_access_settings = {
    "storage_account_names": [adls_silver_gold_storage_account_name],
    "key_vault_name": key_vault_name,
    "spn_client_id": spn_client_id,
    "spn_secret_name": spn_secret_name,
}
common_utils.configure_spn_access_for_adls(**adls_access_settings)

In [0]:
# Session-level Databricks adaptive shuffle / skew-join settings, applied in
# the order listed.
adaptive_shuffle_settings = (
    ("spark.databricks.adaptive.autoOptimizeShuffle.minPartitionNumber", 10000),
    ("spark.databricks.adaptive.autoOptimizeShuffle.enabled", True),
    ("spark.databricks.adaptive.skewJoin.spillProof.enabled", True),
)
for conf_key, conf_value in adaptive_shuffle_settings:
    spark.conf.set(conf_key, conf_value)

#### Creating logic table

In [0]:
# copecac_marc = (
#         spark.read.table(f"{src_uc}.{schema_log}.{target_subzone}_marc")
#         .select("matnr", "werks", "trame", "bwesb", "umlmc", "glgmg", "lvorm")
#         .filter(~F.col("op_ind").contains("D"))
#         .alias("copecac_marc")
#     )

In [0]:
# ============================================
# ================= marc =====================
# ============================================
# Keeps, for every (werks, matnr, source_commit_dt), only the most recent row
# of the MARC source table.

# 1. Load the source table.
#    The catalog comes from `src_uc` so that each environment reads its own
#    data; it was previously hard-coded to the prod catalog.
#    NOTE(review): schema and table name are still fixed to the copecac
#    masterdata bronze table - confirm whether they should follow
#    target_subzone as well.
marc_source_table = f"{src_uc}.brz_maz_masterdata_sap_pr3.copecac_marc"
df = spark.read.table(marc_source_table).select(
    "matnr",
    "werks",
    "trame",
    "bwesb",
    "umlmc",
    "glgmg",
    "lvorm",
    "op_ind",
    "source_commit_dt",
    "source_commit_ts",
    "__insert_gmt_ts",
)

# 2. Define the ROW_NUMBER() window: one partition per plant / material /
#    commit date, newest commit first, ties broken by newest insert timestamp.
w = Window.partitionBy(
        "werks",
        "matnr",
        "source_commit_dt"
    ).orderBy(
        F.col("source_commit_ts").desc(),
        F.col("__insert_gmt_ts").desc()
    )

# 3. Add the row-number column (equivalent to SQL ROW_NUMBER()).
df_with_rn = df.withColumn("last_version", F.row_number().over(w))

# 4. Apply QUALIFY last_version = 1, which in PySpark is a filter().
#    The helper column `last_version` is kept in the result, as before.
df_filtered = df_with_rn.filter(F.col("last_version") == 1)

# 5. Final result; the ORDER BY of the SQL version is left commented out.
marc_last_version_commit = df_filtered #.orderBy(F.col("__insert_gmt_ts").desc())

# Mark the result for caching and run an action so it is computed now.
marc_last_version_commit.cache()
marc_last_version_commit.count()

In [0]:
# Partition value: the current date rendered with `partition_date_format`
# and cast to an integer (e.g. the year for the default "yyyy" pattern).
partition_value = F.date_format(F.current_date(), partition_date_format).cast(T.IntegerType())
marc_last_version_commit = marc_last_version_commit.withColumn(
    "__partition_column",
    partition_value,
)

In [0]:
# # =========================================================
# # 1. GENERAR LISTA "dates" DESDE 2025-10-01 HASTA HOY
# # =========================================================

# start_date = datetime.strptime("2025-10-01", "%Y-%m-%d")
# end_date = datetime.now()

# dates = []
# current = start_date
# while current <= end_date:
#     dates.append(current.strftime("%Y-%m-%d"))
#     current += timedelta(days=1)

# print("Total dates generated:", len(dates))


# # =========================================================
# # 2. CARGAR LA TABLA maz_stock_h_plus Y OBTENER insertion_date
# # =========================================================

# df_stock = spark.table(f"{src_uc}.gld_maz_logistics_warehouse.maz_stock_h_plus")

# # convertir columna a tipo DATE si no lo está
# df_stock = df_stock.withColumn("insertion_date", to_date(col("insertion_date")))

# dates_in_table = [row["insertion_date"].strftime("%Y-%m-%d") for row in df_stock.select("insertion_date").distinct().collect()]

# # Encontrar missing dates
# missing_dates = [d for d in dates if d not in dates_in_table]

# print("Missing dates:", missing_dates)


# # =========================================================
# # 3. ITERAR SOBRE MISSING_DATES
# # =========================================================
# # Reemplaza estas variables por tus DataFrames reales:
# # df_mchb, df_mard → para fechas antiguas
# # copecac_mchb_current_date, copecac_mard_current_date → para la fecha actual

# today_str = datetime.now().strftime("%Y-%m-%d")

# row = Row(
#         matnr="",
#         werks="",
#         lgort="",
#         lvorm="",
#         labst="",
#         speme="",
#         insme="",
#         retme="",
#         einme="",
#         op_ind="",
#         __partition_column="",
#         history_date=""        
#     )

# mard_h = spark.createDataFrame([row])

# for date in missing_dates:

#     # MARD
#     w_mard = Window.partitionBy(
#         "werks",
#         "matnr",
#         "lgort"
#     ).orderBy(
#         F.col("source_commit_ts").desc(),
#         F.col("__insert_gmt_ts").desc()
#     )

#     df_mard = (
#         mard_last_version_commit\
#         .filter(F.col("source_commit_dt") <= date)\
#         .withColumn("unique_version", F.row_number().over(w_mard))\
#         .filter(F.col("unique_version")==1)\
#         .drop("unique_version", "last_version")
#     )
#     df_mard.cache()
#     # mard_temp="df_mard_temp"

#     if date < today_str:
#         mard = df_mard
#     else:
#         mard = copecac_mard_current_date

#     print(f"Ejecutando para fecha {date}...")

#     # Si es una función Python directamente:
#     mard_h = mard_h.unionByName(
#                             mard\
#                             .withColumn("history_date", lit(date))\
#                             .withColumn("__partition_column", F.date_format(current_date(),partition_date_format).cast(T.IntegerType()))\
#                             )



In [0]:
# Expose the prepared MARC DataFrame under the name `final_df`, which is the
# input of the audit-column step that follows.
final_df = marc_last_version_commit

In [0]:
# Build `audit_df` from `final_df` with the brewdat audit-column helper.
# Any exception is passed to common_utils.exit_with_last_exception()
# (presumably ends the notebook run and reports the error - confirm against
# the brewdat library documentation).
try:
    audit_df = transform_utils.create_or_replace_audit_columns(final_df)
    # Disabled: drop the op_ind column from the output.
    # audit_df = audit_df.drop(F.col("op_ind"))
    
except Exception:
    common_utils.exit_with_last_exception()

In [0]:
# Bare expression: echoes the DataFrame as the cell output for a quick check.
audit_df

In [0]:
# Build the storage location of the gold table (TODO: review).
params_list = [
    lakehouse_gold_root,
    target_zone,
    target_business_domain,
    target_subzone,
    target_product,
    target_table,
]

# Refuse to continue if any of the parameters is missing or empty.
for location_part in params_list:
    if location_part is None or len(location_part) == 0:
        raise ValueError("Location would contain null or empty values.")

# Library-side validation of the pieces that end up in the path.
lakehouse_utils.assert_valid_zone(target_zone)
lakehouse_utils.assert_valid_business_domain(target_business_domain)
lakehouse_utils.assert_valid_folder_name(target_table)

# <root>/data/<zone>/<domain>/gld_<zone>_<domain>_warehouse/<table>, lower-cased.
gold_database_folder = f"gld_{target_zone}_{target_business_domain}_warehouse"
target_location = (
    f"{lakehouse_gold_root}/data/{target_zone}/{target_business_domain}/"
    f"{gold_database_folder}/{target_table}"
).lower()

print(f"{target_location = }")
print(f"{target_table = }")
print(f"{lakehouse_gold_root = }")

In [0]:
# auditZeros = audit_df.filter((F.col("unrestricted") + F.col("blocked") + F.col("in_quality_inspection") + F.col("returns") + F.col("restricted_use")) <= 0)
# audit_df = audit_df.subtract(auditZeros)

In [0]:
# Write strategy per subzone: mx replaces the table, copecac appends to it.
load_type_by_subzone = {
    'mx': 'OVERWRITE_TABLE',
    'copecac': 'APPEND_ALL',
}

# Fail with a clear message for an unknown subzone instead of leaving
# `load_type` undefined and hitting a NameError in the write step.
if target_subzone not in load_type_by_subzone:
    raise ValueError(
        f"No load_type configured for target_subzone {target_subzone!r}; "
        f"expected one of {sorted(load_type_by_subzone)}."
    )

load_type = load_type_by_subzone[target_subzone]

In [0]:
# Persist the audited DataFrame as a Delta table at `target_location` and
# register it in `target_database`.
results = write_utils.write_delta_table(
    df=audit_df,
    location=target_location,
    database_name=target_database,
    # Use the configured table name. The previous literal 'OVERWRITE_TABLE'
    # is a load-type value, not a table name, and did not match the
    # location, which is already built from `target_table`.
    table_name=target_table,
    load_type=load_type,
    key_columns=["werks", "matnr"],
    partition_columns=["__partition_column"],
    schema_evolution_mode=write_utils.SchemaEvolutionMode.ADD_NEW_COLUMNS,
    # Alternative kept for reference:
    # schema_evolution_mode=write_utils.SchemaEvolutionMode.OVERWRITE_SCHEMA,
)
print(results)

In [0]:
# Pass the write results to common_utils.exit_with_object
# (presumably ends the notebook and returns the object to the caller -
# TODO confirm against the brewdat library documentation).
common_utils.exit_with_object(results)