### Ingest qualifying_split*.json files

#### Step 1 - Read multiple JSON files using the Spark DataFrame reader

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Notebook parameter: declare a text widget called "p_data_source" with an
# empty default, then capture whatever value the caller supplied.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Explicit schema for the qualifying JSON files. Only qualifyId is declared
# non-nullable; every other field is nullable. The q1/q2/q3 fields are kept
# as strings.
# NOTE(review): StructType/StructField/IntegerType/StringType are not imported
# in this notebook — presumably they come in via the %run includes; confirm.
qualifying_schema = StructType(
    fields=[
        StructField("qualifyId", IntegerType(), False),
        StructField("raceId", IntegerType(), True),
        StructField("driverId", IntegerType(), True),
        StructField("constructorId", IntegerType(), True),
        StructField("number", IntegerType(), True),
        StructField("position", IntegerType(), True),
        StructField("q1", StringType(), True),
        StructField("q2", StringType(), True),
        StructField("q3", StringType(), True),
    ]
)


In [0]:
# Load everything under the raw "qualifying" folder using the explicit schema.
# The multiline option is enabled so each JSON document may span several lines.
qualfying_df = (
    spark.read
    .schema(qualifying_schema)
    .option("multiline", True)
    .json(f"{raw_folder_path}/qualifying")
)

In [0]:
# Print the schema of the loaded DataFrame to check it against qualifying_schema.
qualfying_df.printSchema()

In [0]:
# Render the loaded rows for a quick visual check.
display(qualfying_df)

#### Step 2 - Rename columns and add new columns
1. qualifyId renamed to qualify_id
2. raceId renamed to race_id
3. driverId renamed to driver_id
4. constructorId renamed to constructor_id
5. ingestion date added
6. data_source added (value of the p_data_source widget)


In [0]:
# Build the final DataFrame: add the ingestion date, rename the camelCase id
# columns to snake_case, and tag every row with the data source passed in via
# the notebook widget.
# add_ingestion_date is defined outside this notebook (presumably in
# ../includes/common_functions — confirm).
# The source column names must match the schema exactly ("raceId", not
# "raceID"): withColumnRenamed is a silent no-op when the column is not found,
# so a case mismatch would skip the rename if spark.sql.caseSensitive is on.
qualifying_final_df = (
    add_ingestion_date(qualfying_df)
    .withColumnRenamed("qualifyId", "qualify_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumn("data_source", lit(v_data_source))
)

In [0]:
# Render the transformed DataFrame to check the renamed and added columns.
display(qualifying_final_df)

#### Step 3 - Write the output to the processed container in Parquet format

In [0]:
# Write the final DataFrame as parquet under the processed folder, replacing
# any existing output. The f prefix must sit outside the quotes so that
# processed_folder_path is interpolated; inside the quotes it is just a
# literal "f{processed_folder_path}" path segment.
qualifying_final_df.write.mode("overwrite").parquet(f"{processed_folder_path}/qualifying")

In [0]:
# Read the parquet data back from the processed folder and render it.
display(spark.read.parquet(f"{processed_folder_path}/qualifying"))