In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [0]:
# Explicit schema for races.csv.
# Columns: raceId, year, round, circuitId, name, date, time, url.
# Each tuple below is (column name, Spark data type, nullable).
race_schema = StructType(fields=[
    StructField(column_name, spark_type, nullable)
    for column_name, spark_type, nullable in (
        ("raceId", IntegerType(), False),
        ("year", IntegerType(), True),
        ("round", IntegerType(), True),
        ("circuitId", IntegerType(), False),
        ("name", StringType(), True),
        ("date", DateType(), True),
        ("time", StringType(), True),
        ("url", StringType(), False),
    )
])

In [0]:
# Read the raw races CSV, treating the first row as a header and applying the
# explicit race_schema.
race_df = (
    spark.read
    .option("header", True)
    .schema(race_schema)
    .csv('/mnt/formula19533dl/raw/races.csv')
)

In [0]:
# Render the loaded DataFrame in the notebook for a visual check of the read.
display(race_df)

In [0]:
# Print the column names, data types and nullability of race_df as a tree.
race_df.printSchema()

In [0]:
# Rename the source columns raceId, year and circuitId to race_id, race_year
# and circuit_id. The chain is wrapped in parentheses rather than joined with
# backslashes, so no continuation character is left dangling after the final
# call (a trailing backslash breaks as soon as a statement follows it or the
# cell ends on that line).
race_renamed_df = (
    race_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("year", "race_year")
    .withColumnRenamed("circuitId", "circuit_id")
)



In [0]:
# Render the DataFrame after the column renames to confirm the new names.
display(race_renamed_df)

In [0]:
from pyspark.sql.functions import to_timestamp, concat, lit, current_timestamp, col

In [0]:
# Add two derived columns:
#   race_timestamp - the date and time columns joined with a single space and
#                    parsed with the pattern "yyyy-MM-dd HH:mm:ss"
#   ingestion_date - the current timestamp at the time the job runs
race_trans_df = (
    race_renamed_df
    .withColumn(
        "race_timestamp",
        to_timestamp(
            concat(col("date"), lit(' '), col("time")),
            "yyyy-MM-dd HH:mm:ss",
        ),
    )
    .withColumn("ingestion_date", current_timestamp())
)

display(race_trans_df)

In [0]:
# Keep only the columns written to the processed dataset, in output order.
race_selected_df = race_trans_df.select(
    "race_id",
    "race_year",
    "round",
    "circuit_id",
    "name",
    "race_timestamp",
    "ingestion_date",
)

In [0]:
# Render the final column selection before it is written out.
display(race_selected_df)

In [0]:
# Write the selected columns as Parquet, overwriting any existing output at
# the target path.
(
    race_selected_df.write
    .mode('overwrite')
    .parquet('/mnt/formula19533dl/processed/races')
)

In [0]:
# List the files written to the processed races folder (Python equivalent of
# the `%fs ls` magic, which is shorthand for dbutils.fs.ls).
display(dbutils.fs.ls('/mnt/formula19533dl/processed/races'))