# Using widgets

In [0]:
# Declare a text widget (empty default) so callers can pass in the data
# source, then read back whatever value it currently holds.
data_source_widget_name = "data_source_param"
dbutils.widgets.text(data_source_widget_name, "")
data_source_val = dbutils.widgets.get(data_source_widget_name)

# Run helper notebooks

In [0]:
%run "../helpers/configuration"

In [0]:
%run "../helpers/functions"

# Ingest and process raw data

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# Explicit schema for races.csv, built one field at a time as
# (column name, Spark type, nullable).
races_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("year", IntegerType(), True)
    .add("round", IntegerType(), True)
    .add("circuitId", IntegerType(), True)
    .add("name", StringType(), True)
    .add("date", DateType(), True)
    .add("time", StringType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Load the raw races file, treating the first line as a header and applying
# the explicit schema instead of letting Spark infer the column types.
races_csv_path = f"{raw_folder_path}/races.csv"
races_df = spark.read.csv(races_csv_path, schema=races_schema, header=True)
# display(races_df)

In [0]:
# Remove the url column. drop() is used instead of select() because select()
# would require listing every column that should be kept. drop() accepts the
# column name directly, so no Column object (and no import) is needed here.
races_filtered = races_df.drop("url")
# display(races_filtered)

In [0]:
from pyspark.sql.functions import col, concat, to_timestamp, lit

# Rename the camelCase columns to snake_case, merge the separate date and
# time columns into a single race_timestamp, and tag every row with the
# data source value.
races_processed = (
    races_filtered
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("year", "race_year")
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumn(
        "race_timestamp",
        to_timestamp(
            concat(col("date"), lit(" "), col("time")),
            "yyyy-MM-dd HH:mm:ss",
        ),
    )
    .withColumn("data_source", lit(data_source_val))
)
# display(races_processed)

In [0]:
# Add the ingestion date via add_ingestion_date (not defined in this
# notebook; presumably provided by the helper notebooks run above -- verify),
# then remove the raw date and time columns, which have already been combined
# into race_timestamp. drop() accepts several column names in one call, so
# no Column objects or extra imports are required.
races_final = add_ingestion_date(races_processed).drop("date", "time")
# display(races_final)

In [0]:
# Persist the final DataFrame to the data lake as parquet, overwriting any
# existing output and partitioning the files by race_year.
(
    races_final.write
    .mode("overwrite")
    .partitionBy("race_year")
    .parquet(f"{processed_folder_path}/races")
)

In [0]:
# Read the parquet output back from the processed folder and render it.
races_written_df = spark.read.parquet(f"{processed_folder_path}/races")
display(races_written_df)