
### Ingest drivers.json file



##### Step 1 - Read the JSON file using the Spark DataFrame reader API

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [0]:
# Schema of the nested "name" object: two nullable string parts.
names_schema = (
    StructType()
    .add("forename", StringType(), True)
    .add("surname", StringType(), True)
)

# Schema of one driver record. Only driverId is declared non-nullable;
# "name" reuses names_schema so the nested object is read as a struct.
drivers_schema = (
    StructType()
    .add("driverId", IntegerType(), False)
    .add("driverRef", StringType(), True)
    .add("number", IntegerType(), True)
    .add("code", StringType(), True)
    .add("name", names_schema, True)
    .add("dob", DateType(), True)
    .add("nationality", StringType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Read the raw drivers file with the explicit drivers_schema instead of
# relying on schema inference.
driver_df = (
    spark.read
    .schema(drivers_schema)
    .json("/mnt/jumayelformula1dl/raw/drivers.json")
)


##### Step 2 - Rename columns, add the ingestion_date column, and replace the nested name struct with the concatenated forename and surname

In [0]:
from pyspark.sql.functions import col, lit, concat, current_timestamp

In [0]:
# Derive the renamed/enriched frame from driver_df in a single chain:
#   * driverId / driverRef are renamed to driver_id / driver_ref
#   * ingestion_date is populated from current_timestamp()
#   * the nested "name" struct is overwritten with "<forename> <surname>"
# NOTE(review): concat() is expected to yield NULL when either name part is
# NULL - confirm that is acceptable for this dataset.
drivers_with_columns_df = (
    driver_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn(
        "name",
        concat(col("name.forename"), lit(" "), col("name.surname")),
    )
)



##### Step 3 - Drop the unwanted columns

In [0]:
# "url" is the only column removed; every other column is carried through.
drivers_final_df = (
    drivers_with_columns_df
    .drop("url")
)


##### Step 4 - Write the output to the processed container in parquet format

In [0]:
# Persist the final frame as parquet, replacing any previous output at the
# target path (save mode "overwrite").
(
    drivers_final_df.write
    .mode("overwrite")
    .parquet("/mnt/jumayelformula1dl/processed/drivers/")
)

In [0]:
# Read the written parquet data back and render it, to confirm the output.
display(
    spark.read
    .format("parquet")
    .load("/mnt/jumayelformula1dl/processed/drivers/")
)

driver_id,driver_ref,number,code,name,dob,nationality,ingestion_date
1,hamilton,44.0,HAM,Lewis Hamilton,1985-01-07,British,2023-09-23T20:22:29.129+0000
2,heidfeld,,HEI,Nick Heidfeld,1977-05-10,German,2023-09-23T20:22:29.129+0000
3,rosberg,6.0,ROS,Nico Rosberg,1985-06-27,German,2023-09-23T20:22:29.129+0000
4,alonso,14.0,ALO,Fernando Alonso,1981-07-29,Spanish,2023-09-23T20:22:29.129+0000
5,kovalainen,,KOV,Heikki Kovalainen,1981-10-19,Finnish,2023-09-23T20:22:29.129+0000
6,nakajima,,NAK,Kazuki Nakajima,1985-01-11,Japanese,2023-09-23T20:22:29.129+0000
7,bourdais,,BOU,Sébastien Bourdais,1979-02-28,French,2023-09-23T20:22:29.129+0000
8,raikkonen,7.0,RAI,Kimi Räikkönen,1979-10-17,Finnish,2023-09-23T20:22:29.129+0000
9,kubica,88.0,KUB,Robert Kubica,1984-12-07,Polish,2023-09-23T20:22:29.129+0000
10,glock,,GLO,Timo Glock,1982-03-18,German,2023-09-23T20:22:29.129+0000
