### Ingest Drivers file

- Step 1: Read the drivers.json file
- Step 2: Rename columns and add new columns
        - a. driverId - driver_id, 
        - b. driverRef - driver_ref, 
        - c. add Ingestion date, 
        - d. concatenate forename and surname
- Step 3: Drop url, name.forename and name.surname fields
- Step 4: Write the DataFrame out as Parquet

In [0]:
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, DateType

In [0]:
# Schema of the nested `name` object: two nullable string fields.
name_schema = StructType([
    StructField('forename', StringType(), nullable=True),
    StructField('surname', StringType(), nullable=True),
])

In [0]:
# Explicit schema for drivers.json; the nested `name` struct reuses name_schema.
drivers_schema = StructType([
    StructField('driverId', IntegerType(), nullable=False),
    StructField('driverRef', StringType(), nullable=True),
    StructField('number', IntegerType(), nullable=True),
    StructField('code', StringType(), nullable=True),
    StructField('name', name_schema, nullable=True),
    StructField('dob', DateType(), nullable=True),
    StructField('nationality', StringType(), nullable=True),
    StructField('url', StringType(), nullable=True),
])

In [0]:
# Base directory of the volume; the raw input and processed output paths
# used in this notebook are built relative to it.
volume_dir = '/Volumes/formula1/default/f1_volume'

In [0]:
# Load the raw drivers JSON file, applying the explicit schema rather than
# letting Spark infer one.
driver_df = spark.read.json(
    f'{volume_dir}/raw/drivers.json',
    schema=drivers_schema,
)

In [0]:
# Preview the DataFrame read from drivers.json.
display(driver_df)

In [0]:
from pyspark.sql.functions import current_timestamp, concat, col, lit

In [0]:
# Overwriting the existing `name` struct column with the concatenated
# "forename surname" string replaces it in place, so the nested fields do not
# need to be dropped explicitly afterwards.
driver_renamed_df = (
    driver_df
    .withColumn('name', concat(col('name.forename'), lit(' '), col('name.surname')))
    .withColumnRenamed('driverId', 'driver_id')
    .withColumnRenamed('driverRef', 'driver_ref')
    .withColumn('ingestion_date', current_timestamp())
)

In [0]:
# Preview the DataFrame after the renames and added columns.
display(driver_renamed_df)

In [0]:
# Remove the `url` column; every other column is kept.
drivers_final_df = driver_renamed_df.drop('url')

In [0]:
# Preview the final DataFrame before it is written out.
display(drivers_final_df)

In [0]:
# Write the result as Parquet, overwriting any existing data at the target path.
drivers_final_df.write.parquet(
    f'{volume_dir}/processed/drivers',
    mode='overwrite',
)

In [0]:
# Read the written Parquet output back and display it.
display(
    spark.read.format('parquet').load(f'{volume_dir}/processed/drivers')
)