In [4]:
from pathlib import Path
from pyspark.sql import functions as F, types as T

RAW_DIR = Path("/Users/kalebmacedo/formula1-analytics-1/bronze/dados_originais")
SILVER_ROOT = Path("/Users/kalebmacedo/formula1-analytics-1/silver")
SILVER_ROOT.mkdir(parents=True, exist_ok=True)

TABLE = "constructors.csv"
path = RAW_DIR / TABLE
assert path.exists(), f"Arquivo não encontrado: {path}"

cons = spark.read.csv(str(path), header=True, inferSchema=True)
cons.show(5, truncate=False)
cons.printSchema()


+-------------+--------------+----------+-----------+------------------------------------------------------------+
|constructorId|constructorRef|name      |nationality|url                                                         |
+-------------+--------------+----------+-----------+------------------------------------------------------------+
|1            |mclaren       |McLaren   |British    |http://en.wikipedia.org/wiki/McLaren                        |
|2            |bmw_sauber    |BMW Sauber|German     |http://en.wikipedia.org/wiki/BMW_Sauber                     |
|3            |williams      |Williams  |British    |http://en.wikipedia.org/wiki/Williams_Grand_Prix_Engineering|
|4            |renault       |Renault   |French     |http://en.wikipedia.org/wiki/Renault_in_Formula_One         |
|5            |toro_rosso    |Toro Rosso|Italian    |http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso            |
+-------------+--------------+----------+-----------+---------------------------

In [5]:
total = cons.count()
print("Total de registros:", total)

miss = [(c, cons.filter(F.col(c).isNull() | (F.col(c) == "\\N")).count()) for c in cons.columns]
miss = [(c, m, round(100*m/total,2)) for c,m in miss]
spark.createDataFrame(miss, ["coluna","missing","pct"]).orderBy(F.desc("pct")).show(truncate=False)

dup_rows = cons.groupBy(cons.columns).count().filter("count > 1").count()
print("Duplicatas (linhas idênticas):", dup_rows)

dup_pk = cons.groupBy("constructorId").count().filter("count > 1").count()
print("Duplicatas de PK (constructorId):", dup_pk)


Total de registros: 212
+--------------+-------+---+
|coluna        |missing|pct|
+--------------+-------+---+
|nationality   |0      |0.0|
|constructorId |0      |0.0|
|name          |0      |0.0|
|url           |0      |0.0|
|constructorRef|0      |0.0|
+--------------+-------+---+

Duplicatas (linhas idênticas): 0
Duplicatas de PK (constructorId): 0


In [6]:
if "nationality" in cons.columns:
    cons.groupBy("nationality").count().orderBy(F.desc("count")).show(20, truncate=False)

if "constructorRef" in cons.columns and "name" in cons.columns:
    from pyspark.sql.functions import lower, regexp_replace
    slug_from_name = lower(regexp_replace(F.col("name"), r"[^a-zA-Z0-9]+", "-"))
    diffs = cons.withColumn("slug_from_name", slug_from_name)\
                .filter(F.col("constructorRef") != F.col("slug_from_name"))
    print("Diferenças entre constructorRef e slug derivado do name:")
    diffs.select("constructorId","constructorRef","slug_from_name","name").show(20, truncate=False)


+-------------+-----+
|nationality  |count|
+-------------+-----+
|British      |86   |
|American     |39   |
|Italian      |30   |
|French       |13   |
|German       |10   |
|Swiss        |5    |
|Japanese     |5    |
|South African|3    |
|Dutch        |3    |
|Russian      |2    |
|Malaysian    |2    |
|Canadian     |2    |
|Mexican      |1    |
|Rhodesian    |1    |
|Indian       |1    |
|Irish        |1    |
|Australian   |1    |
|Spanish      |1    |
|Austrian     |1    |
|New Zealander|1    |
+-------------+-----+
only showing top 20 rows

Diferenças entre constructorRef e slug derivado do name:
+-------------+--------------+-----------------+-----------------+
|constructorId|constructorRef|slug_from_name   |name             |
+-------------+--------------+-----------------+-----------------+
|2            |bmw_sauber    |bmw-sauber       |BMW Sauber       |
|5            |toro_rosso    |toro-rosso       |Toro Rosso       |
|8            |super_aguri   |super-aguri      |Super 

In [8]:
from pyspark.sql import Window
import os, glob, shutil

SILVER_DIR = SILVER_ROOT / "constructors"
SILVER_DIR.mkdir(parents=True, exist_ok=True)

clean = cons

for c,t in clean.dtypes:
    if t == "string":
        clean = clean.withColumn(c, F.when(F.col(c)=="\\N", None).otherwise(F.col(c)))
        clean = clean.withColumn(c, F.trim(F.col(c)))

if "constructorRef" in clean.columns:
    clean = clean.withColumn("constructorRef",
                             F.lower(F.regexp_replace(F.col("constructorRef"), r"\s+", "")))
if "name" in clean.columns:
    clean = clean.withColumn("name", F.initcap(F.col("name")))

clean = clean.withColumn("constructorId", F.col("constructorId").cast("int"))
w = Window.partitionBy("constructorId").orderBy(F.when(F.col("name").isNull(), 1).otherwise(0))
clean = clean.withColumn("rn", F.row_number().over(w)).filter(F.col("rn")==1).drop("rn")

out_dir = (SILVER_DIR / "constructors_silver.csv").as_posix()
clean.coalesce(1).write.mode("overwrite").option("header", True).csv(out_dir)
print("Silver (Spark) criada em:", out_dir)

parts = [p for p in glob.glob(os.path.join(out_dir, "part-*.csv")) if not os.path.basename(p).startswith(".")]
if parts:
    flat_dst = SILVER_DIR / "constructors_silver_flat.csv"
    shutil.copy(parts[0], flat_dst)
    print("CSV único salvo em:", flat_dst)
else:
    print("Não encontrei part-*.csv dentro de", out_dir)


Silver (Spark) criada em: /Users/kalebmacedo/formula1-analytics-1/silver/constructors/constructors_silver.csv
CSV único salvo em: /Users/kalebmacedo/formula1-analytics-1/silver/constructors/constructors_silver_flat.csv


In [9]:
spark.read.csv(
    "/Users/kalebmacedo/formula1-analytics-1/silver/constructors/constructors_silver.csv",
    header=True, inferSchema=True
).show(10, truncate=False)


+-------------+--------------+-----------+-----------+------------------------------------------------------------+
|constructorId|constructorRef|name       |nationality|url                                                         |
+-------------+--------------+-----------+-----------+------------------------------------------------------------+
|1            |mclaren       |Mclaren    |British    |http://en.wikipedia.org/wiki/McLaren                        |
|2            |bmw_sauber    |Bmw Sauber |German     |http://en.wikipedia.org/wiki/BMW_Sauber                     |
|3            |williams      |Williams   |British    |http://en.wikipedia.org/wiki/Williams_Grand_Prix_Engineering|
|4            |renault       |Renault    |French     |http://en.wikipedia.org/wiki/Renault_in_Formula_One         |
|5            |toro_rosso    |Toro Rosso |Italian    |http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso            |
|6            |ferrari       |Ferrari    |Italian    |http://en.wikipedi