<a href="https://colab.research.google.com/github/LucasMirandaVS/estudos_python/blob/main/Data_Wrangling_Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependencies

In [4]:
pip install pyspark==3.5.1



In [19]:
import glob, os
from google.colab import files
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
from pyspark.sql import functions as F

In [6]:
spark = SparkSession.builder.appName("ace-tariffs-cleaning").getOrCreate()

# Defining the Data

In [7]:
up = files.upload('raw_data.csv')
csv_path = next(iter(up.keys()))
print("Arquivo:", csv_path)

Saving raw_data.csv to raw_data.csv/raw_data.csv
Arquivo: raw_data.csv/raw_data.csv


In [9]:
# Defining the DF
def read_csv_with_fallback(path):
    try:
        return (spark.read
                .option("header", True)
                .option("inferSchema", True)
                .option("sep", ",")
                .option("quote", '"')
                .option("escape", '"')
                .csv(path))
    except Exception:
        # tenta ";"
        return (spark.read
                .option("header", True)
                .option("inferSchema", True)
                .option("sep", ";")
                .option("quote", '"')
                .option("escape", '"')
                .csv(path))

df_raw = read_csv_with_fallback(csv_path)
print("Linhas:", df_raw.count()); df_raw.printSchema()

Linhas: 73334
root
 |-- Entry Summary Number: string (nullable = true)
 |-- Entry Type Code: integer (nullable = true)
 |-- Importer Number: string (nullable = true)
 |-- Port of Entry Code: integer (nullable = true)
 |-- Entry Date: string (nullable = true)
 |-- Entry Summary Date: string (nullable = true)
 |-- Initial Entry Summary Create Date: string (nullable = true)
 |-- Entry Summary Line Number: integer (nullable = true)
 |-- Country of Origin Code: string (nullable = true)
 |-- Country of Export Code: string (nullable = true)
 |-- Manufacturer ID: string (nullable = true)
 |-- Foreign Exporter ID: string (nullable = true)
 |-- Line SPI Code: string (nullable = true)
 |-- Tariff Ordinal Number: integer (nullable = true)
 |-- HTS Number - Full: long (nullable = true)
 |-- HTS Long Description: string (nullable = true)
 |-- Line Tariff Quantity (1): string (nullable = true)
 |-- Line Tariff UOM (1) Code: string (nullable = true)
 |-- Line Tariff Quantity (2): string (nullable = tr

In [11]:
# normalizing table names
def norm(c):
    return (c.strip()
              .lower()
              .replace("\n"," ")
              .replace("\t"," ")
              .replace("  "," ")
              .replace(" ", "_")
              .replace("-", "_")
              .replace("/", "_")
              .replace(".", "")
           )

df = df_raw.toDF(*[norm(c) for c in df_raw.columns])

In [12]:
# detecting fields
def find_col(candidates):
    cols = df.columns
    for cand in candidates:
        for c in cols:
            if cand in c:
                return c
    return None

In [14]:
col_hts = find_col(["hts", "hts_number", "harmonized", "htsnum"])
col_line_tariff = find_col(["line_tariff_duty", "line_tariff", "tariff_duty", "duty_amount"])
col_antidumping = find_col(["antidumping", "anti_dumping"])
col_countervail = find_col(["countervailing", "counter_vailing", "cvd"])

print("Detected ->",
      "\n HTS:", col_hts,
      "\n Line Tariff Duty:", col_line_tariff,
      "\n Anti-dumping:", col_antidumping,
      "\n Countervailing:", col_countervail)

Detected -> 
 HTS: hts_number___full 
 Line Tariff Duty: line_tariff_duty_amount 
 Anti-dumping: antidumping_duty_amount 
 Countervailing: countervailing_duty_amount


In [15]:
# Cleaning the numbers
num_cols = [c for c in [col_line_tariff, col_antidumping, col_countervail] if c]
for c in num_cols:
    df = (df.withColumn(c, F.regexp_replace(F.col(c), r"\.", ""))
            .withColumn(c, F.regexp_replace(F.col(c), r",", "."))
            .withColumn(c, F.col(c).cast("double")))

In [16]:
if col_hts:
    df = df.withColumn(col_hts, F.trim(F.col(col_hts)).cast("string"))

In [17]:
# Filter out invalid lines
if not col_hts:
    raise ValueError("Não encontrei a coluna HTS. Ajuste a detecção no passo 3.")

agg_exprs = []
if col_line_tariff: agg_exprs.append(F.sum(F.col(col_line_tariff)).alias("Total Line Tariff Duty Amount"))
if col_antidumping: agg_exprs.append(F.sum(F.col(col_antidumping)).alias("Total Antidumping Duty Amount"))
if col_countervail: agg_exprs.append(F.sum(F.col(col_countervail)).alias("Total Countervailing Duty Amount"))

final = (
    df.filter(F.col(col_hts).isNotNull() & (F.length(F.col(col_hts)) > 0))
      .groupBy(F.col(col_hts).alias("HTS Number"))
      .agg(*agg_exprs)
)

final.show(20, truncate=False)
final.printSchema()

+----------+-----------------------------+-----------------------------+--------------------------------+
|HTS Number|Total Line Tariff Duty Amount|Total Antidumping Duty Amount|Total Countervailing Duty Amount|
+----------+-----------------------------+-----------------------------+--------------------------------+
|4011801010|NULL                         |NULL                         |NULL                            |
|99038803  |NULL                         |NULL                         |NULL                            |
|4013901000|NULL                         |NULL                         |NULL                            |
|99030125  |NULL                         |NULL                         |NULL                            |
|4011201005|NULL                         |NULL                         |NULL                            |
|4002190016|NULL                         |NULL                         |NULL                            |
|4011902050|NULL                         |NULL

In [18]:
out_dir = "/content/ace_cleaned"
final.coalesce(1).write.mode("overwrite").option("header", True).csv(out_dir)
print("CSV escrito em:", out_dir)

CSV escrito em: /content/ace_cleaned


In [21]:
parts = glob.glob(os.path.join(out_dir, "part-*.csv"))
if parts:
    from google.colab import files
    files.download(parts[0])
else:
    print("FILE NOT FOUND")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>