### Ingest pit_stops.json file

#### Step 1 - Read the JSON file using the Spark DataFrame reader

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, FloatType

In [0]:
# Explicit schema for pit_stops.json; raceId is the only field declared non-nullable.
pit_stops_schema = StructType(
    fields=[
        StructField("raceId", IntegerType(), False),
        StructField("driverId", IntegerType(), True),
        StructField("stop", StringType(), True),
        StructField("lap", IntegerType(), True),
        StructField("time", StringType(), True),
        StructField("duration", StringType(), True),
        StructField("milliseconds", IntegerType(), True),
    ]
)


In [0]:
# Read the multi-line pit stops JSON with the explicit schema rather than inferring one.
# raw_folder_path is presumably defined by the %run of ../includes/configuration — confirm.
# The path must be an f-string so that {raw_folder_path} is interpolated.
pit_stops_df = spark.read \
.schema(pit_stops_schema) \
.option("multiline", True) \
.json(f"{raw_folder_path}/pit_stops.json")

In [0]:
# Preview the DataFrame as read from the raw JSON file.
display(pit_stops_df)

#### Step 2 - Rename columns and add new columns
1. Rename raceId to race_id
2. Rename driverId to driver_id
3. Add ingestion_date with current timestamp

In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
# Rename the camelCase source columns to snake_case, then add the ingestion date.
# add_ingestion_date is presumably provided by the %run of ../includes/common_functions
# and is expected to return the input DataFrame with an ingestion_date column — confirm.
# It is a plain function, not a DataFrame method, so it wraps the renamed frame
# instead of being chained after it.
pit_stops_with_columns_df = add_ingestion_date(
    pit_stops_df.withColumnRenamed("raceId", "race_id")
                .withColumnRenamed("driverId", "driver_id")
)

In [0]:
# Preview the DataFrame after the Step 2 renames and the added ingestion date.
display(pit_stops_with_columns_df)

#### Step 3 - Drop unwanted columns

In [0]:
# Drop the "url" column to produce the final DataFrame.
# NOTE(review): pit_stops_schema declares no "url" field, and DataFrame.drop is a no-op
# for names that are not in the schema, so this likely changes nothing — confirm whether
# this step is still needed.
pit_stops_final_df = pit_stops_with_columns_df.drop("url")

In [0]:
# Preview the final DataFrame before it is written out.
display(pit_stops_final_df)

#### Step 4 - Write the output to the processed container in Parquet format

In [0]:
# Overwrite the pit_stops Parquet output under the processed folder.
# processed_folder_path is presumably defined by ../includes/configuration — confirm.
# The f prefix must sit outside the quotes; with it inside, the data is written to a
# literal path named "f{processed_folder_path}/pit_stops".
pit_stops_final_df.write.mode("overwrite").parquet(f"{processed_folder_path}/pit_stops")

In [0]:
#display(spark.read.parquet(f"{processed_folder_path}/pit_stops"))