<a href="https://colab.research.google.com/github/LucasMirandaVS/estudos_python/blob/main/Data_Wrangling_Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. Dependencies

In [33]:
pip install pyspark==3.5.1



In [34]:
import glob, os
from google.colab import files
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
from pyspark.sql.window import Window
from pyspark.sql import functions as F

In [35]:
# Reuse the active Spark session if one exists, otherwise start one for this job.
spark = (
    SparkSession.builder
    .appName("ace-tariffs-cleaning")
    .getOrCreate()
)

# 1. Defining the Data

In [37]:
# Location of the raw CSV export on the Colab filesystem.
path = "/content/raw_data.csv/raw_data.csv"

# The first row holds the column names; column types are inferred by Spark.
df = spark.read.csv(path, header=True, inferSchema=True)

In [38]:
# Columns whose concatenation forms the grouping key.
COL_ENTRY, COL_TYPE = "Entry Summary Number", "Entry Type Code"

# Monetary columns that are parsed to double and summed per grouping key.
TARIFF_COLS = (
    ["Line Tariff Duty Amount", "Line MPF Amount", "Line HMF Amount"]
    + ["Antidumping Duty Amount", "Countervailing Duty Amount"]
)

In [40]:
def to_double(colname: str):
    """Build a Column expression that parses a currency-formatted column as a double.

    The column is cast to string, then whitespace, dollar signs and commas are
    stripped. A value fully wrapped in parentheses (accounting notation for a
    negative amount, e.g. ``(1,234.50)``) is rewritten with a leading minus
    sign before the final cast.

    Args:
        colname: Name of the column to convert.

    Returns:
        A Column expression cast to ``double``. Text that is still not numeric
        after cleaning is expected to become null under Spark's default
        (non-ANSI) cast behaviour.
    """
    s = F.col(colname).cast("string")
    s = F.regexp_replace(s, r"\s+", "")
    s = F.regexp_replace(s, r"\$", "")
    s = F.regexp_replace(s, r",", "")
    # Spark applies Java regex replacement rules: capture groups are written
    # as $1. A Python-style "\1" is read as an escaped literal "1", which would
    # turn every parenthesised amount into -1.
    s = F.regexp_replace(s, r"^\((.*)\)$", "-$1")
    return s.cast("double")

# 2. Adding new column

In [41]:
# Grouping key: entry summary number and entry type code, both rendered as
# strings and joined with a hyphen.
df = df.withColumn(
    "entry_type_key",
    F.concat_ws(
        "-",
        *[F.col(part).cast("string") for part in (COL_ENTRY, COL_TYPE)],
    ),
)

# 3. Aggregating Tariff Columns

In [42]:
# One SUM expression per tariff column, aliased back to the original column
# name so the aggregated table keeps the input headers.
agg_exprs = dict(
    (tariff_col, F.sum(to_double(tariff_col)).alias(tariff_col))
    for tariff_col in TARIFF_COLS
)

# 4. Cleaning Unnecessary Records

In [43]:
# Every remaining column: not a tariff amount and not part of the grouping key.
key_and_tariff_cols = set(TARIFF_COLS) | {COL_ENTRY, COL_TYPE, "entry_type_key"}
other_cols = [name for name in df.columns if name not in key_and_tariff_cols]

# Collapse to one row per entry_type_key: tariff amounts are summed, all other
# columns keep the first non-null value seen in the group.
# NOTE(review): no ordering is applied before F.first, so which row supplies
# the value is up to Spark — confirm that is acceptable.
leading_aggs = [
    F.first(F.col(name), ignorenulls=True).alias(name)
    for name in (COL_ENTRY, COL_TYPE)
]
tariff_aggs = [agg_exprs[name] for name in TARIFF_COLS]
trailing_aggs = [
    F.first(F.col(name), ignorenulls=True).alias(name)
    for name in other_cols
]

final = df.groupBy("entry_type_key").agg(*leading_aggs, *tariff_aggs, *trailing_aggs)

# 5. Exporting the Final Table

In [44]:
# Preview the first 20 aggregated rows with full cell contents, in the order:
# key, key parts, summed tariffs, remaining columns.
preview_cols = ["entry_type_key", COL_ENTRY, COL_TYPE] + TARIFF_COLS + other_cols
final.select(preview_cols).show(n=20, truncate=False)

+--------------+--------------------+---------------+-----------------------+---------------+---------------+-----------------------+--------------------------+---------------+------------------+----------+------------------+---------------------------------+-------------------------+----------------------+----------------------+---------------+-------------------+-------------+---------------------+-----------------+------------------------------------------------------------+------------------------+------------------------+------------------------+------------------------+------------------------+------------------------+------------------------------+
|entry_type_key|Entry Summary Number|Entry Type Code|Line Tariff Duty Amount|Line MPF Amount|Line HMF Amount|Antidumping Duty Amount|Countervailing Duty Amount|Importer Number|Port of Entry Code|Entry Date|Entry Summary Date|Initial Entry Summary Create Date|Entry Summary Line Number|Country of Origin Code|Country of Export Code|Manuf

In [45]:
# Collapse to a single partition so the output directory contains one CSV part
# file, replacing any previous export at the same location.
final.coalesce(1).write.csv("/content/ace_final", mode="overwrite", header=True)