In [0]:
# Explicit schema for the qualifying JSON files, so Spark does not have to infer it.
# Every column is an integer except q1/q2/q3, which are kept as strings.
# Only qualifyId is declared non-nullable.
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType(fields=[
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in [
        # (column name, Spark type, nullable)
        ("qualifyId", IntegerType(), False),
        ("raceId", IntegerType(), True),
        ("driverId", IntegerType(), True),
        ("constructorId", IntegerType(), True),
        ("number", IntegerType(), True),
        ("position", IntegerType(), True),
        ("q1", StringType(), True),
        ("q2", StringType(), True),
        ("q3", StringType(), True),
    ]
])

In [0]:
# Load every qualifying split file matched by the glob into a single DataFrame.
# The files hold JSON arrays, so multi-line parsing is switched on, and the
# explicit schema defined earlier in the notebook is applied instead of inference.
qualifying_df = (
    spark.read
    .schema(schema)
    .option("multiLine", True)
    .json("/mnt/formula1dataplatformdl/raw/qualifying/qualifying_split*.json")
)

display(qualifying_df)

In [0]:
# Standardise the column names and stamp each row with the ingestion time.
from pyspark.sql.functions import current_timestamp

# Source (camelCase) -> target (snake_case) column names.
# Columns not listed here (number, position, q1, q2, q3) keep their names.
renamed_columns = {
    "qualifyId": "qualifying_id",
    "raceId": "race_id",
    "driverId": "driver_id",
    "constructorId": "constructor_id",
}

# Always start from the raw frame so re-running this cell gives the same result.
qualifying_processed = qualifying_df
for source_name, target_name in renamed_columns.items():
    qualifying_processed = qualifying_processed.withColumnRenamed(source_name, target_name)

# Audit column recording when the rows were ingested.
qualifying_processed = qualifying_processed.withColumn("ingestion_date", current_timestamp())

display(qualifying_processed)

In [0]:
# Persist the processed DataFrame as Parquet.
# "overwrite" mode replaces whatever is already stored at the target path.
(
    qualifying_processed.write
    .mode("overwrite")
    .parquet("/mnt/formula1dataplatformdl/processed/qualifying")
)

In [0]:
# Read the data back to check that it has been written properly.
qualifying_written = spark.read.parquet("/mnt/formula1dataplatformdl/processed/qualifying")
display(qualifying_written)