# Using widgets

In [0]:
# Declare a text widget for the data source (empty string by default) and
# read its current value so later cells can tag rows with it.
dbutils.widgets.text(name="data_source_param", defaultValue="")
data_source_val = dbutils.widgets.get("data_source_param")

# Run helper notebooks

In [0]:
%run "../helpers/configuration"

In [0]:
%run "../helpers/functions"

# Ingest and process raw data

In [0]:
# Explicit schema applied when reading the lap_times CSV data.
# Only raceId is declared non-nullable; every other column is nullable.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

schema = StructType(
    fields=[
        StructField("raceId", IntegerType(), nullable=False),
        StructField("driverId", IntegerType(), nullable=True),
        StructField("lap", IntegerType(), nullable=True),
        StructField("position", IntegerType(), nullable=True),
        StructField("time", StringType(), nullable=True),
        StructField("milliseconds", IntegerType(), nullable=True),
    ]
)

In [0]:
# Load the lap_times CSV data from the raw folder using the explicit schema.
# NOTE(review): raw_folder_path is not defined in this notebook; presumably it
# comes from the helper configuration notebook run earlier - confirm.
laptimes_df = spark.read.schema(schema).csv(f"{raw_folder_path}/lap_times")

# display(laptimes_df)  # uncomment to preview the raw rows

In [0]:
from pyspark.sql.functions import lit

# Rename the camelCase ID columns to snake_case and add a constant
# data_source column holding the value read from the notebook widget.
laptimes_processed = (
    laptimes_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumn("data_source", lit(data_source_val))
)

# display(laptimes_processed)  # uncomment to preview the transformed rows

In [0]:
# Add ingestion date
# NOTE(review): add_ingestion_date is not defined in this notebook; presumably
# it is provided by the helper functions notebook run earlier and returns the
# input DataFrame with an ingestion-date column appended - confirm.
laptimes_final = add_ingestion_date(laptimes_processed)

In [0]:
# Persist the final DataFrame as parquet under the processed folder,
# replacing whatever was written there by a previous run.
laptimes_final.write.mode("overwrite").parquet(f"{processed_folder_path}/lap_times")

In [0]:
# Read the data back to check that it has been written properly
display(spark.read.format("parquet").load(f"{processed_folder_path}/lap_times"))