<a href="https://colab.research.google.com/github/LucasMirandaVS/estudos_python/blob/main/Data_Wrangling_Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. Dependencies

In [93]:
pip install pyspark==3.5.1



In [94]:
import glob, os
from google.colab import files
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
from pyspark.sql.window import Window
from pyspark.sql import functions as F

In [95]:
# Reuse the active Spark session if there is one; otherwise start a new one
# registered under this application name.
spark = (
    SparkSession.builder
    .appName("ace-tariffs-cleaning")
    .getOrCreate()
)

# 1. Defining the Data

In [96]:
# Location of the raw CSV export that feeds the rest of the notebook.
path = "/content/raw_data.csv/raw_data.csv"

# The first row supplies the column names; column types are inferred from the data.
df = spark.read.csv(path, header=True, inferSchema=True)

In [99]:
# Key and tariff column names used by the cells below. These constants were
# referenced but never defined, so a fresh "Restart & Run All" raised NameError.
# The values match the column headers printed by final.show() further down.
# NOTE(review): confirm them against the raw CSV header.
COL_ENTRY = "Entry Summary Number"  # first component of entry_type_key
COL_TYPE = "Entry Type Code"        # second component of entry_type_key

# Amount columns that are parsed with to_double() and summed per entry_type_key.
TARIFF_COLS = [
    "Line Tariff Duty Amount",
    "Line MPF Amount",
    "Line HMF Amount",
    "Antidumping Duty Amount",
    "Countervailing Duty Amount",
]

# Columns removed from the aggregated table before it is displayed and exported.
DROP_COLS = [
    "Line Tariff Quantity (2)",
    "Line Tariff UOM (2) Code",
    "Line Tariff Quantity (3)",
    "Line Tariff UOM (3) Code",
    "Line Tariff Quantity",
    "Line Tariff UOM (1) Code",
    "Line Tariff Goods Value Amount",
    "Line Tariff Quantity (1)"
]

In [100]:
def to_double(colname: str):
    """Build a Column expression that parses a formatted amount into a double.

    The column is cast to string and normalised in this order:

    1. all whitespace is removed;
    2. ``$`` signs are removed;
    3. ``,`` separators are removed;
    4. a value fully wrapped in parentheses is rewritten with a leading
       minus sign, e.g. ``(12.50)`` becomes ``-12.50``;
    5. a value that is only ``-`` becomes ``0``.

    The normalised string is then cast to ``double``.

    Args:
        colname: Name of the column to parse.

    Returns:
        A Spark Column expression yielding the parsed value as ``double``.
    """
    s = F.col(colname).cast("string")
    s = F.regexp_replace(s, r"\s+", "")
    s = F.regexp_replace(s, r"\$", "")
    s = F.regexp_replace(s, r",", "")
    # Spark applies Java regex replacement rules: a capture group is written
    # "$1". A backslash only escapes the following character, so "-\1" would
    # turn every parenthesised amount into the literal text "-1".
    s = F.regexp_replace(s, r"^\((.*)\)$", "-$1")
    s = F.regexp_replace(s, r"^-$", "0")
    return s.cast("double")

# 2. Adding a New Column

In [102]:
# Grouping key: the COL_ENTRY and COL_TYPE values rendered as strings and
# joined with "-".
df = df.withColumn(
    "entry_type_key",
    F.concat_ws("-", *[F.col(part).cast("string") for part in (COL_ENTRY, COL_TYPE)]),
)

# 3. Aggregating Tariff Columns

In [103]:
# One SUM aggregate per tariff column, keyed by the source column name.
# Values are parsed with to_double() first and each result column is named
# "<source column> (Aggregated)".
agg_exprs = dict(
    (tariff_col, F.sum(to_double(tariff_col)).alias(tariff_col + " (Aggregated)"))
    for tariff_col in TARIFF_COLS
)

# 4. Cleaning Unnecessary Records

In [105]:
# Columns that are neither summed nor part of the grouping key. Each one is
# carried through the aggregation by taking its first non-null value.
_summed_or_key = {*TARIFF_COLS, COL_ENTRY, COL_TYPE, "entry_type_key"}
other_cols = [name for name in df.columns if name not in _summed_or_key]

# Output column order: entry number, entry type, tariff sums, remaining columns.
# NOTE: first() without an explicit ordering returns whichever row Spark
# encounters first in the group, so the pass-through values are not guaranteed
# to be stable between runs.
_aggregates = [
    *(F.first(F.col(name), ignorenulls=True).alias(name) for name in (COL_ENTRY, COL_TYPE)),
    *(agg_exprs[name] for name in TARIFF_COLS),
    *(F.first(F.col(name), ignorenulls=True).alias(name) for name in other_cols),
]

final = df.groupBy("entry_type_key").agg(*_aggregates)

In [107]:
# Remove the DROP_COLS columns; names not present in `final` are skipped.
_present = set(final.columns)
final = final.drop(*(name for name in DROP_COLS if name in _present))

In [108]:
# Preview the first 20 aggregated rows without truncating cell contents.
final.show(n=20, truncate=False)

+--------------+--------------------+---------------+------------------------------------+----------------------------+----------------------------+------------------------------------+---------------------------------------+---------------+------------------+----------+------------------+---------------------------------+-------------------------+----------------------+----------------------+---------------+-------------------+-------------+---------------------+-----------------+------------------------------------------------------------+
|entry_type_key|Entry Summary Number|Entry Type Code|Line Tariff Duty Amount (Aggregated)|Line MPF Amount (Aggregated)|Line HMF Amount (Aggregated)|Antidumping Duty Amount (Aggregated)|Countervailing Duty Amount (Aggregated)|Importer Number|Port of Entry Code|Entry Date|Entry Summary Date|Initial Entry Summary Create Date|Entry Summary Line Number|Country of Origin Code|Country of Export Code|Manufacturer ID|Foreign Exporter ID|Line SPI Code|Tariff

# 5. Exporting the Final Table

In [109]:
# Output location handed to the CSV writer in the export cell.
out_dir = "/content/ace_final"

In [110]:
# Collapse to a single partition, then write the table as CSV with a header
# row, replacing any previous export at out_dir.
final.coalesce(1).write.csv(out_dir, mode="overwrite", header=True)
print("Resultado salvo em:", out_dir)

Resultado salvo em: /content/ace_final
