In [0]:
# Notebook parameter: declare the text widget "p_data_source" (second argument
# "" is passed as its default) and read its current value into v_data_source.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Notebook parameter: declare the text widget "p_file_date" (second argument
# "" is passed as its default) and read its current value into v_file_date.
# v_file_date is later used as a folder name in the raw input path.
dbutils.widgets.text("p_file_date", "")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/paths"

In [0]:
%run "../includes/common_functions"

In [0]:
from pyspark.sql.types import *

# Explicit schema for races.csv, listed in file column order.
# "date" and "time" are read as plain strings; they are combined into a
# timestamp further down in the notebook.
races_schema = StructType(
    [
        StructField(field_name, field_type)
        for field_name, field_type in [
            ("raceId", IntegerType()),
            ("year", IntegerType()),
            ("round", IntegerType()),
            ("circuitId", IntegerType()),
            ("name", StringType()),
            ("date", StringType()),
            ("time", StringType()),
            ("url", StringType()),
        ]
    ]
)

In [0]:
# Load the races CSV for the requested file date, applying the explicit schema
# and treating the first line of the file as a header.
races_csv_path = f"{raw_container_folder_path}/{v_file_date}/races.csv"
races_df = (
    spark.read
    .format("csv")
    .schema(races_schema)
    .option("header", "true")
    .load(races_csv_path)
)

In [0]:
from pyspark.sql.functions import col

# Discard the "url" column; every other column is kept as read.
races_df = races_df.drop("url")

In [0]:
# Rename the camelCase source columns to snake_case; "year" becomes "race_year".
races_renamed_df = (
    races_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("year", "race_year")
    .withColumnRenamed("circuitId", "circuit_id")
)

In [0]:
from pyspark.sql.functions import to_timestamp, current_timestamp, lit, concat, col, coalesce

# Build the final races DataFrame in ONE chain so that every derived column
# (race_timestamp plus the three audit columns) is present in the result.
# Previously a second assignment restarted from races_renamed_df, which threw
# away ingestion_date, data_source and file_date, and it relied on `when`,
# which was never imported in this notebook.
#
# concat() yields null as soon as one of its inputs is null, so a row with a
# valid date but a null time would get a null race_timestamp. Substitute
# "00:00:00" for a null time before concatenating.
# NOTE(review): this only covers real nulls; if the raw file marks a missing
# time with a placeholder string, the CSV reader needs a matching nullValue
# option - confirm against the data.
race_time = coalesce(col("time"), lit("00:00:00"))

races_final_df = (
    races_renamed_df
    .withColumn(
        "race_timestamp",
        to_timestamp(concat(col("date"), lit(" "), race_time), "yyyy-MM-dd HH:mm:ss"),
    )
    .withColumn("ingestion_date", current_timestamp())  # time this run was evaluated
    .withColumn("data_source", lit(v_data_source))      # value read from the p_data_source widget
    .withColumn("file_date", lit(v_file_date))          # value read from the p_file_date widget
)

In [0]:
# Earlier variant, kept for reference, wrote Parquet files to the processed folder:
#   races_final_df.write.mode("overwrite").partitionBy("race_year").parquet(f"{processed_container_folder_path}/races")

# Overwrite the Delta table f1_processed_db.races, partitioned by race_year.
(
    races_final_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("race_year")
    .saveAsTable("f1_processed_db.races")
)

In [0]:
# End the notebook run, passing the string "success" as its exit value.
dbutils.notebook.exit("success")