### Ingest results.json file

#### Step 1 - Read the JSON file using the Spark DataFrame reader

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Declare the "p_data_source" text widget (empty default) so the notebook can be
# parameterised, then read the current value for tagging the ingested rows later.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
from pyspark.sql.functions import lit
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, FloatType

In [0]:
# Explicit schema for results.json, given as (column name, Spark type, nullable).
# Only resultId is declared non-nullable; every other column may be null.
result_schema = StructType(fields=[
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in [
        ("resultId", IntegerType(), False),
        ("raceId", IntegerType(), True),
        ("driverId", IntegerType(), True),
        ("constructorId", IntegerType(), True),
        ("number", IntegerType(), True),
        ("grid", IntegerType(), True),
        ("position", IntegerType(), True),
        ("positionText", StringType(), True),
        ("positionOrder", IntegerType(), True),
        ("points", FloatType(), True),
        ("laps", IntegerType(), True),
        ("time", StringType(), True),
        ("milliseconds", IntegerType(), True),
        ("fastestLap", IntegerType(), True),
        ("rank", IntegerType(), True),
        ("fastestLapTime", StringType(), True),
        ("fastestLapSpeed", StringType(), True),
        ("statusId", IntegerType(), True),
    ]
])

In [0]:
# Load results.json from the raw folder, enforcing the explicit schema instead of
# letting Spark infer one.
results_df = (
    spark.read
    .schema(result_schema)
    .json(f"{raw_folder_path}/results.json")
)

In [0]:
# Print the schema of results_df for inspection.
results_df.printSchema()

In [0]:
# Render results_df in the notebook for a visual check of the loaded rows.
display(results_df)

#### Step 2 - Rename columns and add new columns
1. resultId renamed to result_id
2. raceId renamed to race_id
3. driverId renamed to driver_id
4. constructorId renamed to constructor_id
5. positionText renamed to position_text
6. positionOrder renamed to position_order
7. fastestLap renamed to fastest_lap
8. fastestLapTime renamed to fastest_lap_time
9. fastestLapSpeed renamed to fastest_lap_speed
10. add ingestion date column
11. add data_source column (value of the p_data_source widget)

In [0]:
# Build the processed-layer frame:
#   1. add_ingestion_date(...) adds the ingestion date column (helper is presumably
#      defined by the %run'd common_functions notebook - confirm).
#   2. camelCase source columns are renamed to snake_case.
#   3. data_source records the value of the p_data_source widget on every row.
# `lit` is pyspark.sql.functions.lit; it is imported explicitly in the imports cell so
# this cell does not depend on whatever names the %run'd notebooks happen to expose.
results_with_columns_df = (
    add_ingestion_date(results_df)
    .withColumnRenamed("resultId", "result_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("positionText", "position_text")
    .withColumnRenamed("positionOrder", "position_order")
    .withColumnRenamed("fastestLap", "fastest_lap")
    .withColumnRenamed("fastestLapTime", "fastest_lap_time")
    .withColumnRenamed("fastestLapSpeed", "fastest_lap_speed")
    .withColumn("data_source", lit(v_data_source))
)



In [0]:
# Render the renamed/enriched frame to verify the new column names and added columns.
display(results_with_columns_df)

#### Step 3 - Write the output to the processed container in parquet format

In [0]:
# Write the processed results as parquet, one directory per race_id, replacing any
# previous output. The path must be an f-string (prefix outside the quotes); with the
# `f` inside the quotes the data was written to a literal path named
# "f{processed_folder_path}/results" instead of under processed_folder_path.
results_with_columns_df.write.mode("overwrite").partitionBy("race_id").parquet(f"{processed_folder_path}/results")

In [0]:
#display(spark.read.parquet(f"{processed_folder_path}/results"))