In [0]:
%run "../includes/paths"

In [0]:
%run "../includes/common_functions"

In [0]:
# Notebook parameter: the file date whose results should be processed.
# The widget is registered with an empty default and its current value is read
# straight back into v_file_date for use by the cells below.
file_date_widget_name = "p_file_date"
dbutils.widgets.text(file_date_widget_name, "")
v_file_date = dbutils.widgets.get(file_date_widget_name)

In [0]:
# Load the Delta tables from the processed layer that are joined to the
# results further down. These are read in full (no file-date filter).
races_df = (
    spark.read
    .format("delta")
    .load(f"{processed_container_folder_path}/races")
)
circuits_df = (
    spark.read
    .format("delta")
    .load(f"{processed_container_folder_path}/circuits")
)
drivers_df = (
    spark.read
    .format("delta")
    .load(f"{processed_container_folder_path}/drivers")
)
constructors_df = (
    spark.read
    .format("delta")
    .load(f"{processed_container_folder_path}/constructors")
)

# Filter the results on the file date so that only the data belonging to the
# requested load is picked up; this is what lets the notebook support
# incremental loads as well as full ones.
#
# The predicate is a Column comparison instead of an interpolated SQL string:
# the widget value is then always handled as a plain literal, so a quote
# character (or any other SQL fragment) in p_file_date can neither break nor
# change the filter.
results_source_df = spark.read.format("delta").load(f"{processed_container_folder_path}/results")
results_df = (
    results_source_df
    .filter(results_source_df["file_date"] == v_file_date)
    # race_id and file_date are renamed to avoid ambiguity once the results are
    # joined to the other tables (the join below also uses races_df.race_id).
    .withColumnRenamed("race_id", "results_race_id")
    .withColumnRenamed("file_date", "results_file_date")
)

In [0]:
from pyspark.sql.functions import col
# Assemble one row per result by inner-joining races -> circuits -> results ->
# drivers -> constructors, then keep only the columns needed downstream,
# aliased to descriptive names where the source name is generic.
joined_df = (
    races_df
    .join(circuits_df, races_df.circuit_id == circuits_df.circuit_id, "inner")
    .join(results_df, races_df.race_id == results_df.results_race_id, "inner")
    .join(drivers_df, results_df.driver_id == drivers_df.driver_id, "inner")
    .join(constructors_df, results_df.constructor_id == constructors_df.constructor_id, "inner")
    .select(
        # race and circuit attributes
        races_df.race_year,
        races_df.name.alias("race_name"),
        races_df.date.alias("race_date"),
        circuits_df.location.alias("circuit_location"),
        # driver and constructor attributes
        drivers_df.name.alias("driver_name"),
        drivers_df.number.alias("driver_number"),
        drivers_df.nationality.alias("driver_nationality"),
        constructors_df.name.alias("team"),
        # result measures
        results_df.grid.alias("grid"),
        results_df.fastest_lap,
        results_df.time.alias("race_time"),
        results_df.position,
        results_df.points,
        # still carrying the results_ prefix given to them before the join
        results_df.results_race_id,
        results_df.results_file_date,
    )
)

In [0]:
from pyspark.sql.functions import current_timestamp
# Add a created_date audit column and drop the results_ prefix that was put on
# race_id / file_date before the join, so the output uses the plain names.
final_df = (
    joined_df
    .withColumn("created_date", current_timestamp())
    .withColumnRenamed("results_race_id", "race_id")
    .withColumnRenamed("results_file_date", "file_date")
)

In [0]:
# Write the data to the presentation layer.
#
# Earlier versions of this cell did a full overwrite (parquet files under
# presentation_container_folder_path, then the managed table
# f1_presentation_db.race_results) and later called load_incremental_data;
# the merge call below is the approach currently in use.

# Rows are matched on driver name + race id.
# NOTE(review): the target_tab / source_tab aliases are presumably the ones
# merge_incremental_data assigns internally - confirm against the helper.
merge_condition = (
    "target_tab.driver_name = source_tab.driver_name"
    " AND target_tab.race_id = source_tab.race_id"
)

# NOTE(review): merge_incremental_data is not defined in this notebook
# (presumably it comes from the common_functions include). From the arguments
# it looks like race_id is the partition column and
# f1_presentation_db.race_results the target table - confirm the parameter
# order against the helper's definition.
merge_incremental_data(
    final_df,
    presentation_container_folder_path,
    merge_condition,
    "race_id",
    "f1_presentation_db",
    "race_results",
)


In [0]:
%sql
-- Check after the write: number of rows per race in the presentation table,
-- highest race_id first.
SELECT race_id, COUNT(*)
FROM f1_presentation_db.race_results
GROUP BY race_id
ORDER BY race_id DESC