In [0]:
# Explicit schema for the pit stops JSON source.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Fields are added in source order; raceId is the only column declared non-nullable.
schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("driverId", IntegerType(), True)
    .add("stop", StringType(), True)
    .add("lap", IntegerType(), True)
    .add("time", StringType(), True)
    .add("duration", StringType(), True)
    .add("milliseconds", IntegerType(), True)
)

In [0]:
# Load the raw pit stops file. The multiLine option is enabled so that JSON
# records spanning several lines are parsed, and the explicit schema is
# applied instead of letting Spark infer one.
pitstops_df = (
    spark.read
    .schema(schema)
    .option("multiLine", True)
    .json("/mnt/formula1dataplatformdl/raw/pit_stops.json")
)
display(pitstops_df)

In [0]:
# Rename the camelCase id columns to snake_case and add an ingestion timestamp column
from pyspark.sql.functions import current_timestamp

pitstops_processed = (
    pitstops_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("ingestion_date", current_timestamp())
)

display(pitstops_processed)

In [0]:
# Persist the processed DataFrame as parquet, replacing any existing output at the target path
pitstops_processed.write.mode("overwrite").parquet("/mnt/formula1dataplatformdl/processed/pit_stops")

In [0]:
# Read the data back to check that it has been written properly
display(spark.read.format("parquet").load("/mnt/formula1dataplatformdl/processed/pit_stops"))