# Using widgets

In [0]:
# Declare a text widget named "data_source_param" with an empty default value,
# then read its current value. The value is attached to every row later in this
# notebook as the "data_source" column.
dbutils.widgets.text("data_source_param", "")
data_source_val = dbutils.widgets.get("data_source_param")

# Run helper notebooks

In [0]:
%run "../helpers/configuration"

In [0]:
%run "../helpers/functions"

# Ingest and process raw data

In [0]:
# Explicit schema for the qualifying JSON files. Columns keep the camelCase
# names used in the raw data; they are renamed after the read.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

schema = StructType(
    [
        # Identifier columns
        StructField("qualifyId", IntegerType(), nullable=False),
        StructField("raceId", IntegerType(), nullable=True),
        StructField("driverId", IntegerType(), nullable=True),
        StructField("constructorId", IntegerType(), nullable=True),
        # Remaining integer columns
        StructField("number", IntegerType(), nullable=True),
        StructField("position", IntegerType(), nullable=True),
        # q1/q2/q3 are read as plain strings
        StructField("q1", StringType(), nullable=True),
        StructField("q2", StringType(), nullable=True),
        StructField("q3", StringType(), nullable=True),
    ]
)

In [0]:
# Load every qualifying_split*.json file from the raw folder. Each file holds a
# JSON array spanning several lines, hence the multiLine option. The explicit
# schema is applied instead of letting Spark infer one.
qualifying_df = (
    spark.read
    .schema(schema)
    .option("multiLine", True)
    .json(f"{raw_folder_path}/qualifying/qualifying_split*.json")
)

In [0]:
from pyspark.sql.functions import lit

# Rename the camelCase id columns to snake_case and tag each row with the
# data source value read from the notebook widget.
qualifying_processed = (
    qualifying_df
    .withColumnRenamed("qualifyId", "qualifying_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumn("data_source", lit(data_source_val))
)

In [0]:
# Add ingestion date
# NOTE(review): add_ingestion_date is not defined in this notebook. It presumably
# comes from the helper notebooks loaded via %run and returns the DataFrame with
# an ingestion-date column appended — TODO confirm against ../helpers/functions.
qualifying_final = add_ingestion_date(qualifying_processed)

In [0]:
# Persist the final DataFrame as parquet under the processed folder,
# replacing whatever a previous run wrote to the same location.
qualifying_final.write.mode("overwrite").parquet(f"{processed_folder_path}/qualifying")

In [0]:
# Read the data back to check that it has been written properly
display(spark.read.format("parquet").load(f"{processed_folder_path}/qualifying"))