<a href="https://colab.research.google.com/github/LucasMirandaVS/estudos_python/blob/main/Data_Manip_Spark_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. Dependencies

In [1]:
pip install pyspark



In [16]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# 1. DataFrame Config and Transformation

In [17]:
# Get the active Spark session, or create one, under this application name.
spark = (
    SparkSession.builder
    .appName("ACE Report Transformation")
    .getOrCreate()
)

# ===== STEP 1 — Load data (CSV recommended, as Spark does not natively read Excel) =====
# Where the raw CSV is read from and where the enriched report is written to.
INPUT_PATH = "/content/raw_data_test_version.csv"
OUTPUT_PATH = "ace_report_enriched_sparke.csv"

# The two columns combined into the "Entry summary number code" key.
COL_ENTRY_NUM = "Entry Summary Number"
COL_LINE_NUM = "Entry Summary Line Number"

# Amount columns that are summed per code and then replaced by "Aggregated ..." columns.
TARIFF_COLS = [
    "Line Tariff Duty Amount",
    "Line MPF Amount",
    "Line HMF Amount",
    "Antidumping Duty Amount",
    "Countervailing Duty Amount",
]

# Additional columns removed from the final output.
DROP_EXTRA_COLS = [
    "Line Tariff Goods Value Amount",
    "Line Tariff Quantity (1)",
    "Line Tariff UOM (1) Code",
    "Line Tariff Quantity (2)",
    "Line Tariff UOM (2) Code",
    "Line Tariff Quantity (3)",
    "Line Tariff UOM (3) Code",
]

# Load the CSV with a header row and let Spark infer the column types.
raw_df = spark.read.options(header=True, inferSchema=True).csv(INPUT_PATH)

# ===== STEP 2 — Create "Entry summary number code" (clean suffix .0) =====
def rm_dotzero(colname: str):
    """Build a Column holding ``colname`` as a trimmed string without a trailing ".0".

    Args:
        colname: Name of the column to clean.

    Returns:
        A Column expression: the input cast to string, trimmed, and with a
        single ".0" suffix removed when present (for example "12.0" -> "12").
    """
    # The pattern is handed to Spark's regex engine as-is (no SQL-literal
    # unescaping happens here), so one backslash is what escapes the dot.
    # A doubled backslash would instead look for a literal backslash and
    # never match the ".0" suffix.
    as_text = F.trim(F.col(colname).cast("string"))
    return F.regexp_replace(as_text, r"\.0$", "")

# Key column: "<entry number>-<line number>", each part cleaned by rm_dotzero.
entry_code = F.concat_ws("-", rm_dotzero(COL_ENTRY_NUM), rm_dotzero(COL_LINE_NUM))
transformed_df = raw_df.withColumn("Entry summary number code", entry_code)

# Cast to double every tariff column that is present (they are summed later),
# followed by the line number column (it is used for ordering later).
for numeric_col in [*TARIFF_COLS, COL_LINE_NUM]:
    if numeric_col in transformed_df.columns:
        transformed_df = transformed_df.withColumn(
            numeric_col, F.col(numeric_col).cast("double")
        )

# ===== STEP 3 — Group: sum tariff columns by code =====
# Work only with the tariff columns that are actually present, matching the
# presence guards used when casting and dropping them.
present_tariff_cols = [c for c in TARIFF_COLS if c in transformed_df.columns]

if present_tariff_cols:
    agg_df = transformed_df.groupBy("Entry summary number code").agg(
        *[F.sum(F.col(c)).alias(f"{c}__agg") for c in present_tariff_cols]
    )
else:
    # agg() needs at least one expression; with nothing to sum, keep one row per code.
    agg_df = transformed_df.select("Entry summary number code").distinct()

# ===== STEP 4 — Join totals back =====
# Tag each row with a tiebreaker before joining. The ids grow with partition
# index and row position within a partition; for a single-file read this is
# expected to follow file order (NOTE(review): confirm for multi-file inputs).
ROW_ORDER_COL = "__row_order"
df = (
    transformed_df
    .withColumn(ROW_ORDER_COL, F.monotonically_increasing_id())
    .join(agg_df, on="Entry summary number code", how="left")
)

# ===== STEP 5 — Keep totals only on the LAST row per code =====
# The code already embeds the line number, so all rows of a partition share the
# same line number; ordering by it alone leaves the "last" row arbitrary. The
# row-order column breaks that tie so the choice is repeatable.
w = Window.partitionBy("Entry summary number code").orderBy(
    F.col(COL_LINE_NUM).asc(),
    F.col(ROW_ORDER_COL).asc(),
)
# lead() has no following row only at the end of the ordered partition.
is_last = F.lead(F.col("Entry summary number code")).over(w).isNull()

for c in present_tariff_cols:
    df = df.withColumn(
        f"Aggregated {c}",
        F.when(is_last, F.col(f"{c}__agg")).otherwise(F.lit(0.0))
    )

# The tiebreaker is internal to this step and must not reach the output.
df = df.drop(ROW_ORDER_COL)

# ===== STEP 6 — Drop original tariff, __agg, and extra columns =====
# Candidates in order: original tariff columns, their "__agg" totals, then the extras.
drop_candidates = TARIFF_COLS + [f"{c}__agg" for c in TARIFF_COLS] + DROP_EXTRA_COLS

# Keep only the candidates that exist in the frame.
cols_to_drop = [name for name in drop_candidates if name in df.columns]

if cols_to_drop:
    df = df.drop(*cols_to_drop)

# ===== STEP 7 — Remove rows where all aggregated values are zero =====
# A row is kept when at least one of its "Aggregated ..." columns is non-zero.
aggregated_cols = [
    name
    for name in (f"Aggregated {c}" for c in TARIFF_COLS)
    if name in df.columns
]
non_zero_checks = [F.col(name) != 0 for name in aggregated_cols]
if non_zero_checks:
    # OR the per-column checks together, left to right.
    non_zero_cond = non_zero_checks[0]
    for extra_check in non_zero_checks[1:]:
        non_zero_cond = non_zero_cond | extra_check
    df = df.filter(non_zero_cond)


# 2. Export

In [18]:
# ===== STEP 8 — Write output =====
# Collapse to one partition so a single part file is produced (drop the
# coalesce for large data). Spark writes OUTPUT_PATH as a directory that
# contains the part file, replacing any previous output at that path.
single_partition_df = df.coalesce(1)
single_partition_df.write.csv(OUTPUT_PATH, mode="overwrite", header=True)
print(f"Wrote: {OUTPUT_PATH}")

Wrote: ace_report_enriched_sparke.csv
