### Ingest lap_times CSV data

#### Step 1 - Read the CSV file using the Spark DataFrame reader API

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, FloatType

In [0]:
# Explicit schema applied when reading the lap_times CSV input.
# Each StructField is (column name, Spark type, nullable flag).
lap_times_schema = StructType(
    fields=[
        StructField("raceId", IntegerType(), False),
        StructField("driverId", IntegerType(), True),
        StructField("lap", IntegerType(), True),
        StructField("position", IntegerType(), True),
        StructField("time", StringType(), True),
        StructField("milliseconds", IntegerType(), True),
    ]
)


In [0]:
# Load the raw lap_times CSV data, applying lap_times_schema to the columns.
lap_times_df = (
    spark.read
    .schema(lap_times_schema)
    .csv("/mnt/formula1courseds/raw/lap_times")
)

In [0]:
# Preview the loaded DataFrame. `display` is not imported in this notebook;
# presumably it is the notebook runtime's built-in renderer — confirm.
display(lap_times_df)

In [0]:
# Show the number of rows read from the raw input as the cell output.
lap_times_df.count()

#### Step 2 - Rename columns and add new columns
1. Rename raceId to race_id and driverId to driver_id
2. Add ingestion_date with current timestamp

In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
# Rename the camelCase id columns to snake_case and stamp each row with the
# timestamp at which this ingestion ran.
lap_time_final_df = (
    lap_times_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("ingestion_date", current_timestamp())
)

In [0]:
# Preview the transformed DataFrame (renamed id columns plus ingestion_date).
display(lap_time_final_df)

#### Step 3 - Write the output to the processed container in Parquet format

In [0]:
# Persist the final DataFrame as parquet; "overwrite" mode replaces any data
# already present at the target path.
(
    lap_time_final_df.write
    .mode("overwrite")
    .parquet("/mnt/formula1courseds/processed/lap_times")
)

In [0]:
# Optional sanity check: uncomment to read the written parquet data back and display it.
#display(spark.read.parquet("/mnt/formula1courseds/processed/lap_times"))