In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import col, udf
from pyspark.sql import DataFrame, Column
from pyspark.sql.window import Window
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Target column names shared by every source.
# Order matters: view_cupom pairs each source column list with this list
# by position, so entry i of a source list is renamed to entry i here.
cols_rename = [
    "filial",
    "cod_prod",
    "periodo",
    "etiqueta",  # the slot view_cupom normalises with etiqueta()
    "perc_dsc_cupom",
    "venda",
    "venda_desconto",
]

# Source columns read from the "COSMOSMOV" path, in cols_rename order.
cols_cosmos = [
    "MVVC_CD_FILIAL_MOV",         # -> filial
    "MVVP_NR_PRD",                # -> cod_prod
    "MVVC_DT_MOV",                # -> periodo
    "NUMERO_AUTORIZ_PAGUEMENOS",  # -> etiqueta
    "MVVP_PR_DSC_ITE",            # -> perc_dsc_cupom
    "MVVP_VL_PRE_VDA",            # -> venda
    "MVVP_VL_PRD_VEN",            # -> venda_desconto
]

# Source columns read from the "PRE_VENDA" path, in cols_rename order.
cols_pre_venda = [
    "VC_CD_FILIAL",                # -> filial
    "VD_CD_PRODUTO",               # -> cod_prod
    "VC_DH_VENDA",                 # -> periodo
    "VD_COD_ETIQUETA_ULCH",        # -> etiqueta
    "VD_PERC_DESCONTO",            # -> perc_dsc_cupom
    "VD_VL_PRODUTO",               # -> venda
    "VD_VL_PRODUTO_COM_DESCONTO",  # -> venda_desconto
]

In [0]:
def table_exists(catalog: str, schema: str, tablename: str) -> bool:
    """Ask the Spark catalog whether ``catalog.schema.tablename`` exists.

    Relies on the ambient ``spark`` session object, which is not defined
    in this part of the file.

    Args:
        catalog: First part of the three-level table name.
        schema: Second part of the three-level table name.
        tablename: Third part of the three-level table name.

    Returns:
        Whatever ``spark.catalog.tableExists`` reports for the dotted name.
    """
    qualified_name = f"{catalog}.{schema}.{tablename}"
    return spark.catalog.tableExists(qualified_name)

def etiqueta(colname: str) -> Column:
    """Build the normalised label expression for the column ``colname``.

    The column is trimmed, left-padded with "0" to a width of 30 and the
    result is cast to string.

    Args:
        colname: Name of the column holding the raw label.

    Returns:
        A Column expression; nothing is evaluated here.
    """
    trimmed = F.trim(colname)
    # NOTE(review): Spark's lpad also shortens values longer than the target
    # width — confirm no label exceeds 30 characters.
    padded = F.lpad(trimmed, 30, "0")
    return padded.cast(T.StringType())

In [0]:
def view_cupom(
    path: str, 
    year: int, 
    month: int, 
    columns: list[str]
) -> DataFrame:
    """Load one month of a raw source and project it onto the common layout.

    Reads ``/Volumes/raw/super_desconto/{path}/{year}/{month:02d}/`` as
    parquet, keeps only ``columns``, normalises the label column with
    ``etiqueta`` and renames ``columns[i]`` to ``cols_rename[i]``.

    Args:
        path: Source directory under ``/Volumes/raw/super_desconto``.
        year: Year segment of the path.
        month: Month segment of the path; zero-padded to two digits.
        columns: Source column names, positionally aligned with
            ``cols_rename``.

    Returns:
        DataFrame whose columns carry the names in ``cols_rename``.

    Raises:
        ValueError: If ``columns`` and ``cols_rename`` differ in length.
    """
    # zip() would silently stop at the shorter list and leave some columns
    # with their source names, so reject a mismatch up front.
    if len(columns) != len(cols_rename):
        raise ValueError(
            f"expected {len(cols_rename)} columns aligned with cols_rename, "
            f"got {len(columns)}"
        )

    # Locate the label column through its target name rather than a
    # hard-coded position, so reordering cols_rename cannot desynchronise it.
    col_etiqueta = columns[cols_rename.index("etiqueta")]
    source_dir = f"/Volumes/raw/super_desconto/{path}/{year}/{month:02d}/"
    rename_map = dict(zip(columns, cols_rename))

    return (
        spark.read.parquet(source_dir)
        .select(columns)
        .withColumn(col_etiqueta, etiqueta(col_etiqueta))
        .withColumnsRenamed(rename_map)
    )

# Period loaded by this cell.
YEAR = 2023
MONTH = 1

# Keep one row per "etiqueta": the one with the highest "venda_desconto".
# Ordering by venda_desconto alone leaves ties unresolved, and row_number()
# would then pick an arbitrary row that can change between runs. The
# remaining columns are added as tie-breakers so the kept row is the same
# on every run (rows equal on every column are interchangeable).
tie_breakers = [
    col(name) for name in cols_rename if name not in ("etiqueta", "venda_desconto")
]
windows = Window.partitionBy("etiqueta").orderBy(
    col("venda_desconto").desc(), *tie_breakers
)

# NOTE(review): rows whose etiqueta is NULL share a single partition, so
# only one of them survives the filter below — confirm that is intended.
tables = (
    view_cupom("COSMOSMOV", YEAR, MONTH, cols_cosmos)
    .union(view_cupom("PRE_VENDA", YEAR, MONTH, cols_pre_venda))  # positional union
    .withColumn("id", F.row_number().over(windows))
    .filter(col("id") == 1)
    .drop("id")
)

# Replaces the whole table with the deduplicated result.
tables.write.format("delta").mode("overwrite").saveAsTable("bronze.super_desconto.venda")

In [0]:
%sql

-- Total of venda_desconto across bronze.super_desconto.venda.
SELECT
    sum(venda_desconto)
FROM
    bronze.super_desconto.venda