In [0]:
from pyspark.sql.types import StructField, StructType, IntegerType, StringType

In [0]:
# Explicit schema applied when reading the pit stop JSON file.
# The first five fields are declared non-nullable; `duration` and
# `milliseconds` are declared nullable. `time` and `duration` are kept
# as strings rather than parsed into temporal/numeric types.
pitstop_schema = (
    StructType()
    .add('raceId', IntegerType(), False)
    .add('driverId', IntegerType(), False)
    .add('stop', IntegerType(), False)
    .add('lap', IntegerType(), False)
    .add('time', StringType(), False)
    .add('duration', StringType(), True)
    .add('milliseconds', IntegerType(), True)
)

In [0]:
# Base directory used by this notebook for both the raw input
# (`raw/`) and the processed output (`processed/`).
volume_dir = '/Volumes/formula1/default/f1_volume'

In [0]:
# Load the raw pit stop JSON with the explicit schema.
# `multiLine` is enabled so a JSON record may span several lines.
pitstop_raw_df = (
    spark.read
    .option('multiLine', True)
    .schema(pitstop_schema)
    .json(f'{volume_dir}/raw/pit_stops.json')
)

In [0]:
# Preview the raw DataFrame as loaded, before any renaming.
display(pitstop_raw_df)

#### Rename `raceId`/`driverId` to snake_case and add an `ingestion_date` column

In [0]:
from pyspark.sql.functions import current_timestamp

#### `withColumnsRenamed` (Spark 3.4+) is similar to `withColumnRenamed`, but renames multiple columns in a single call

In [0]:
# Rename the two camelCase id columns to snake_case in one call, then
# append an `ingestion_date` column holding the current timestamp.
pitstop_final_df = (
    pitstop_raw_df
    .withColumnsRenamed({'driverId': 'driver_id', 'raceId': 'race_id'})
    .withColumn('ingestion_date', current_timestamp())
)

# Show the transformed DataFrame.
display(pitstop_final_df)



In [0]:
# Persist the transformed data as Parquet; 'overwrite' mode replaces
# any existing output at the target path.
(
    pitstop_final_df.write
    .mode('overwrite')
    .parquet(f'{volume_dir}/processed/pit_stops')
)

In [0]:
# NOTE(review): this disabled listing targets a /mnt path rather than the
# `volume_dir` location this notebook writes to — confirm whether it is
# still needed or should be removed.
# %fs
# ls /mnt/formula19533dl/processed/pit_stops

In [0]:
# Read the Parquet output back from the processed path and display it
# to check what was written.
display(spark.read.parquet(f'{volume_dir}/processed/pit_stops'))