In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [0]:
# Root of the Unity Catalog volume; every raw/processed path in this notebook
# is built from this prefix.
volume_dir = "/Volumes/formula1/default/f1_volume"

In [0]:
# Explicit schema for races.csv, in file column order:
#   raceId, year, round, circuitId, name, date, time, url
# `time` is kept as a string here and parsed later in the notebook.
race_schema = StructType([
    StructField("raceId", IntegerType(), nullable=False),
    StructField("year", IntegerType(), nullable=True),
    StructField("round", IntegerType(), nullable=True),
    StructField("circuitId", IntegerType(), nullable=False),
    StructField("name", StringType(), nullable=True),
    StructField("date", DateType(), nullable=True),
    StructField("time", StringType(), nullable=True),
    StructField("url", StringType(), nullable=False),
])

In [0]:
# Load the raw races file: the first line is a header row, and the explicit
# schema is applied instead of letting Spark infer column types.
race_df = spark.read.csv(
    f'{volume_dir}/raw/races.csv',
    schema=race_schema,
    header=True,
)

In [0]:
# Preview the races data exactly as it was read from the CSV file.
display(race_df)

In [0]:
# Print the column names and types of the loaded DataFrame as a sanity check
# on the schema that was supplied to the reader.
race_df.printSchema()

In [0]:
# Rename the camelCase source columns to snake_case. The chain is wrapped in
# parentheses rather than joined with trailing backslashes, so the statement
# has an explicit end and cannot swallow whatever line follows it.
race_renamed_df = (
    race_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("year", "race_year")
    .withColumnRenamed("circuitId", "circuit_id")
)



In [0]:
# Preview the DataFrame after the column renames.
display(race_renamed_df)

In [0]:
from pyspark.sql.functions import col, lit, trim, regexp_replace, concat, current_timestamp, when, to_timestamp

In [0]:
# Build a single `race_timestamp` from the separate `date` and `time` columns
# and stamp each row with the ingestion time.
#
# The source marks a missing start time with the literal text \N; those rows
# fall back to midnight. The check runs on the trimmed value and also covers
# NULL and empty strings: a NULL compared with "\\N" evaluates to NULL (not
# true), so without the explicit isNull() such rows would skip the fallback
# and end up with a NULL race_timestamp.
trimmed_time = trim(col("time"))
time_is_missing = (
    trimmed_time.isNull()
    | (trimmed_time == "")
    | (trimmed_time == "\\N")
)

race_trans_df = (
    race_renamed_df
    .withColumn(
        "clean_time",
        when(time_is_missing, lit("00:00:00")).otherwise(trimmed_time),
    )
    .withColumn(
        "race_timestamp",
        # `date` is a DateType column; concat() casts it to its string form
        # before joining it with the cleaned time.
        to_timestamp(
            concat(col("date"), lit(" "), col("clean_time")),
            "yyyy-MM-dd HH:mm:ss",
        ),
    )
    # Audit column holding the timestamp at which this load ran.
    .withColumn("ingestion_date", current_timestamp())
)

display(race_trans_df)


In [0]:
# Keep only the columns that belong in the processed output; the remaining
# columns of race_trans_df are dropped here.
race_selected_df = race_trans_df.select(
    "race_id",
    "race_year",
    "round",
    "circuit_id",
    "name",
    "race_timestamp",
    "ingestion_date",
)

In [0]:
# Preview the final column set before it is written out.
display(race_selected_df)

In [0]:
# Persist the processed races as Parquet, replacing any output left by a
# previous run so the cell can be re-executed safely.
(
    race_selected_df.write
    .mode('overwrite')
    .parquet(f'{volume_dir}/processed/races')
)

In [0]:
# List the files in the processed races directory to confirm the write
# produced output. The path is derived from volume_dir instead of being
# hard-coded, so it always points at the same location the write cell uses.
display(dbutils.fs.ls(f'{volume_dir}/processed/races'))