### Ingest lap_times CSV data

#### Step 1 - Read the lap_times CSV data using the Spark DataFrame reader API

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Declare a text widget named "p_data_source" (empty default) and read its current value.
# v_data_source is later written into the "data_source" column of the output.
dbutils.widgets.text("p_data_source","")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Declare a text widget named "p_file_date" (default "2021-03-28") and read its current value.
# v_file_date selects the sub-folder under raw_folder_path that is read below,
# and is also stored in the "file_date" column of the output.
dbutils.widgets.text("p_file_date","2021-03-28")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
# Imported in this cell so the schema definition does not depend on these names
# leaking in from the notebooks pulled in via %run (harmless if they already do).
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Explicit schema handed to the reader in the next cell: six columns, all integers
# except "time", which is kept as a string. "raceId" is the only field declared
# non-nullable.
lap_times_schema = StructType(fields=[
    StructField("raceId", IntegerType(), False),
    StructField("driverId", IntegerType(), True),
    StructField("lap", IntegerType(), True),
    StructField("position", IntegerType(), True),
    StructField("time", StringType(), True),
    StructField("milliseconds", IntegerType(), True),
])


In [0]:
# Load the lap_times CSV data for the selected file date, applying the explicit
# schema defined above instead of letting the reader pick column names/types.
# raw_folder_path is not defined in this notebook (presumably supplied by the
# included configuration notebook).
lap_times_df = (
    spark.read
    .schema(lap_times_schema)
    .csv(f"{raw_folder_path}/{v_file_date}/lap_times")
)

In [0]:
#display(lap_times_df)

In [0]:
#lap_times_df.count()

#### Step 2 - Rename columns and add new columns
1. Rename raceId to race_id and driverId to driver_id
2. Add ingestion_date with current timestamp
3. Add data_source and file_date columns from the notebook widget values

In [0]:
# Imported in this cell so it does not rely on `lit` being provided by one of
# the notebooks pulled in via %run (harmless if it already is).
from pyspark.sql.functions import lit

# Build the output dataframe:
#   - add_ingestion_date is defined outside this notebook (presumably in
#     ../includes/common_functions); it is applied to the raw dataframe first
#   - raceId / driverId are renamed to snake_case
#   - the two widget values are attached to every row as literal columns
lap_time_final_df = (
    add_ingestion_date(lap_times_df)
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
#display(lap_time_final_df)

#### Step 3 - Write the output to the processed container in parquet format

In [0]:
# Persist the final dataframe through the overwrite_partition helper, which is
# defined outside this notebook (presumably in ../includes/common_functions).
# NOTE(review): the string arguments look like database name, table name and
# partition column, in that order — confirm against the helper's signature.
overwrite_partition(lap_time_final_df, "f1_processed", "lap_times", "race_id")

In [0]:
# Read the lap_times data back from the processed folder as parquet and render it.
# NOTE(review): the write step goes through overwrite_partition with
# "f1_processed" / "lap_times", while this reads from processed_folder_path —
# presumably both point at the same storage location; confirm.
display(spark.read.parquet(f"{processed_folder_path}/lap_times"))

In [0]:
# End the notebook run here and pass the string "Success" back as its exit value.
dbutils.notebook.exit("Success")