### Ingest pit_stops.json file

#### Step 1 - Read the JSON file using the Spark DataFrame reader

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Notebook parameter: name of the data source. Declared as a text widget with
# an empty default, then read into a variable that is later written to the
# `data_source` column.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Notebook parameter: file date, defaulting to "2021-03-28". The value is used
# both as the sub-folder name when building the input path and as the content
# of the `file_date` column.
dbutils.widgets.text("p_file_date", "2021-03-28")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
# Import the schema types explicitly so this cell does not depend on names
# leaking in from the `%run` includes or from earlier kernel state.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Explicit schema for pit_stops.json. Each entry is (column name, type, nullable).
# Only raceId is declared non-nullable; stop, time and duration are kept as strings.
pit_stops_schema = StructType(fields=[
    StructField("raceId", IntegerType(), False),
    StructField("driverId", IntegerType(), True),
    StructField("stop", StringType(), True),
    StructField("lap", IntegerType(), True),
    StructField("time", StringType(), True),
    StructField("duration", StringType(), True),
    StructField("milliseconds", IntegerType(), True),
])


In [0]:
# Load the pit stops JSON for the requested file date using the explicit schema.
# The "multiline" option is enabled so that JSON records spanning several lines
# are parsed.
# NOTE(review): raw_folder_path is not defined in this notebook; presumably it
# comes from ../includes/configuration — confirm.
pit_stops_df = (
    spark.read
    .schema(pit_stops_schema)
    .option("multiline", True)
    .json(f"{raw_folder_path}/{v_file_date}/pit_stops.json")
)

In [0]:
# Preview the raw DataFrame exactly as read, before any renaming or added columns.
display(pit_stops_df)

#### Step 2 - Rename columns and add new columns
1. Rename raceId to race_id
2. Rename driverId to driver_id
3. Add ingestion_date with current timestamp
4. Add data_source from the p_data_source widget
5. Add file_date from the p_file_date widget

In [0]:
# Import `lit` explicitly so this cell does not depend on names leaking in from
# the `%run` includes or from earlier kernel state.
from pyspark.sql.functions import lit

# Build the output DataFrame:
#   - add_ingestion_date(...) is not defined in this notebook; presumably it comes
#     from ../includes/common_functions and adds an ingestion timestamp column —
#     confirm against the helper's definition.
#   - raceId / driverId are renamed to snake_case.
#   - data_source and file_date are constant columns taken from the widget values.
pit_stops_with_columns_df = (
    add_ingestion_date(pit_stops_df)
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# Optional preview of the transformed DataFrame; left commented out, so this
# cell does nothing when run.
#display(pit_stops_with_columns_df)

#### Step 3 - Write the output to the processed container in parquet format

In [0]:
# Persist the transformed DataFrame via the shared helper.
# NOTE(review): overwrite_partition is not defined in this notebook; presumably it
# comes from ../includes/common_functions. The arguments look like
# (DataFrame, database name, table name, partition column) — verify against the
# helper's definition.
overwrite_partition(pit_stops_with_columns_df, "f1_processed", "pit_stops", "race_id")

In [0]:
# Read the parquet data under {processed_folder_path}/pit_stops and display it
# as a check on the write.
# NOTE(review): assumes the write above lands at this path, and that
# processed_folder_path comes from ../includes/configuration — confirm both.
display(spark.read.parquet(f"{processed_folder_path}/pit_stops"))

In [0]:
# End the notebook run and hand the string "Success" back to whatever invoked it.
dbutils.notebook.exit("Success")