In [2]:
from pathlib import Path
from pyspark.sql import functions as F, types as T

RAW_DIR    = Path("/Users/kalebmacedo/formula1-analytics-1/bronze/dados_originais")
SILVER_DIR = Path("./silver"); SILVER_DIR.mkdir(parents=True, exist_ok=True)

TABLE = "seasons.csv"
path = RAW_DIR / TABLE
assert path.exists(), f"Arquivo não encontrado: {path}"

seasons = spark.read.csv(str(path), header=True, inferSchema=True)
seasons.show(5, truncate=False)
seasons.printSchema()


+----+----------------------------------------------------+
|year|url                                                 |
+----+----------------------------------------------------+
|2009|http://en.wikipedia.org/wiki/2009_Formula_One_season|
|2008|http://en.wikipedia.org/wiki/2008_Formula_One_season|
|2007|http://en.wikipedia.org/wiki/2007_Formula_One_season|
|2006|http://en.wikipedia.org/wiki/2006_Formula_One_season|
|2005|http://en.wikipedia.org/wiki/2005_Formula_One_season|
+----+----------------------------------------------------+
only showing top 5 rows

root
 |-- year: integer (nullable = true)
 |-- url: string (nullable = true)



In [9]:
# 2) RE-LEITURA do seasons.csv (1.1 de novo, agora na sessão ATUAL)
from pathlib import Path
from pyspark.sql import functions as F, types as T

RAW_DIR    = Path("/Users/kalebmacedo/formula1-analytics-1/bronze/dados_originais")
SILVER_DIR = Path("./silver"); SILVER_DIR.mkdir(parents=True, exist_ok=True)

TABLE = "seasons.csv"
path = RAW_DIR / TABLE
assert path.exists(), f"Arquivo não encontrado: {path}"

seasons = spark.read.csv(str(path), header=True, inferSchema=True)
seasons.show(3, truncate=False)   # sanity rápido
seasons.printSchema()


+----+----------------------------------------------------+
|year|url                                                 |
+----+----------------------------------------------------+
|2009|http://en.wikipedia.org/wiki/2009_Formula_One_season|
|2008|http://en.wikipedia.org/wiki/2008_Formula_One_season|
|2007|http://en.wikipedia.org/wiki/2007_Formula_One_season|
+----+----------------------------------------------------+
only showing top 3 rows

root
 |-- year: integer (nullable = true)
 |-- url: string (nullable = true)



In [10]:
# 3) Agora sim: 1.2 (nulos e duplicatas) usando o seasons RECÉM-LIDO
total = seasons.count()
print("Total de registros:", total)

miss = [(c, seasons.filter(F.col(c).isNull() | (F.col(c) == "\\N")).count()) for c in seasons.columns]
miss = [(c, m, round(100*m/total,2)) for c,m in miss]
spark.createDataFrame(miss, ["coluna","missing","pct"]).orderBy(F.desc("pct")).show(truncate=False)

dup_rows = seasons.groupBy(seasons.columns).count().filter("count > 1").count()
print("Duplicatas (linhas idênticas):", dup_rows)

dup_pk = seasons.groupBy("year").count().filter("count > 1").count()
print("Duplicatas de PK (year):", dup_pk)


Total de registros: 75
+------+-------+---+
|coluna|missing|pct|
+------+-------+---+
|url   |0      |0.0|
|year  |0      |0.0|
+------+-------+---+

Duplicatas (linhas idênticas): 0
Duplicatas de PK (year): 0


In [11]:
# intervalo de anos
seasons.select(F.min("year").alias("min_year"),
               F.max("year").alias("max_year")).show()

# anos faltando na sequência
miny, maxy = seasons.select(F.min("year"), F.max("year")).first()
years = spark.range(miny, maxy + 1).withColumnRenamed("id","year")
faltando = years.join(seasons.select("year").distinct(), on="year", how="left_anti")
print("Anos ausentes:")
faltando.orderBy("year").show(200, truncate=False)


+--------+--------+
|min_year|max_year|
+--------+--------+
|    1950|    2024|
+--------+--------+

Anos ausentes:
+----+
|year|
+----+
+----+



In [12]:
from pyspark.sql import Window

clean = seasons

# trim strings e cast
for c,t in clean.dtypes:
    if t == "string":
        clean = clean.withColumn(c, F.trim(F.col(c)))
clean = clean.withColumn("year", F.col("year").cast("int"))

# faixa “segura” do dataset
clean = clean.filter((F.col("year") >= 1950) & (F.col("year") <= 2025))

# garantir 1 por ano (se um dia vier duplicado)
w = Window.partitionBy("year").orderBy(F.when(F.col("url").isNull(), 1).otherwise(0))
clean = clean.withColumn("rn", F.row_number().over(w)).filter(F.col("rn")==1).drop("rn")

out = (SILVER_DIR / "seasons_silver.csv").as_posix()
clean.coalesce(1).write.mode("overwrite").option("header", True).csv(out)
print("Silver salva em:", out)


Silver salva em: silver/seasons_silver.csv


In [14]:
from pathlib import Path

# caminho absoluto para sua pasta silver
SILVER_DIR = Path("/Users/kalebmacedo/formula1-analytics-1/silver")

# garante que a pasta existe
SILVER_DIR.mkdir(parents=True, exist_ok=True)

# saída final em formato "um único CSV"
out = SILVER_DIR / "seasons_silver.csv"

# salva consolidado em 1 arquivo
(
    seasons
    .coalesce(1)  # junta tudo em 1 partição
    .write
    .mode("overwrite")
    .option("header", True)
    .csv(str(out))
)

print("Silver salva em:", out)


Silver salva em: /Users/kalebmacedo/formula1-analytics-1/silver/seasons_silver.csv


In [15]:
spark.read.csv(
    "/Users/kalebmacedo/formula1-analytics-1/silver/seasons_silver.csv",
    header=True, inferSchema=True
).show(10, truncate=False)


+----+----------------------------------------------------+
|year|url                                                 |
+----+----------------------------------------------------+
|2009|http://en.wikipedia.org/wiki/2009_Formula_One_season|
|2008|http://en.wikipedia.org/wiki/2008_Formula_One_season|
|2007|http://en.wikipedia.org/wiki/2007_Formula_One_season|
|2006|http://en.wikipedia.org/wiki/2006_Formula_One_season|
|2005|http://en.wikipedia.org/wiki/2005_Formula_One_season|
|2004|http://en.wikipedia.org/wiki/2004_Formula_One_season|
|2003|http://en.wikipedia.org/wiki/2003_Formula_One_season|
|2002|http://en.wikipedia.org/wiki/2002_Formula_One_season|
|2001|http://en.wikipedia.org/wiki/2001_Formula_One_season|
|2000|http://en.wikipedia.org/wiki/2000_Formula_One_season|
+----+----------------------------------------------------+
only showing top 10 rows



In [16]:
from glob import glob
from pathlib import Path
import shutil, os

dir_out = Path("/Users/kalebmacedo/formula1-analytics-1/silver/seasons_silver.csv")
# pega o part "de verdade" (ignora ocultos começando com ponto)
parts = [p for p in glob(str(dir_out / "part-*.csv")) if not os.path.basename(p).startswith(".")]
assert parts, "Nenhum part CSV encontrado."
src_part = parts[0]

dst_file = dir_out.with_suffix("")  # -> /.../silver/seasons_silver
dst_file = Path(str(dst_file) + "_flat.csv")  # seasons_silver_flat.csv
shutil.copy(src_part, dst_file)

print("CSV único salvo em:", dst_file)


CSV único salvo em: /Users/kalebmacedo/formula1-analytics-1/silver/seasons_silver_flat.csv
