# Join ingested data into a single table

In [0]:
from pyspark.sql.functions import col, current_timestamp

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Load each processed dataset and rename the columns whose names would
# otherwise clash once the tables are joined. Spark has no equivalent of the
# pandas merge `suffixes` option, so the renaming is done up front.

drivers = (
    spark.read.parquet(f"{processed_folder_path}/drivers")
    .withColumnRenamed("number", "driver_number")
    .withColumnRenamed("name", "driver_name")
    .withColumnRenamed("nationality", "driver_nationality")
)

# The constructor's name is exposed downstream as "team".
constructors = (
    spark.read.parquet(f"{processed_folder_path}/constructors")
    .withColumnRenamed("name", "team")
)

circuits = (
    spark.read.parquet(f"{processed_folder_path}/circuits")
    .withColumnRenamed("location", "circuit_location")
)

# The race timestamp is exposed downstream as "race_date".
races = (
    spark.read.parquet(f"{processed_folder_path}/races")
    .withColumnRenamed("name", "race_name")
    .withColumnRenamed("race_timestamp", "race_date")
)

results = (
    spark.read.parquet(f"{processed_folder_path}/results")
    .withColumnRenamed("time", "race_time")
)

In [0]:
# Pair every race with its circuit (inner join on circuit_id) and keep only
# the race descriptors plus the circuit location for the later joins.
race_circuits = (
    races
    .join(circuits, on=races.circuit_id == circuits.circuit_id, how="inner")
    .select(
        races.race_id,
        races.race_year,
        races.race_name,
        races.race_date,
        circuits.circuit_location,
    )
)

In [0]:
# Enrich each result row with its race/circuit, driver and constructor
# details. All three joins are inner joins keyed on the matching *_id column.
race_results = (
    results
    .join(race_circuits, on=results.race_id == race_circuits.race_id, how="inner")
    .join(drivers, on=results.driver_id == drivers.driver_id, how="inner")
    .join(
        constructors,
        on=results.constructor_id == constructors.constructor_id,
        how="inner",
    )
)

In [0]:
# Project the joined data down to the presentation columns and stamp every
# row with the time this table was produced.
final = (
    race_results
    .select(
        "race_year",
        "race_name",
        "race_date",
        "circuit_location",
        "driver_name",
        "driver_number",
        "driver_nationality",
        "team",
        "fastest_lap",
        "race_time",
        "points",
        "position",
    )
    .withColumn("created_date", current_timestamp())
)


In [0]:
# Highest-scoring rows first.
final = final.sort(final["points"].desc())

In [0]:
# Persist the result to the presentation layer, replacing any previous output.
final.write.parquet(
    f"{presentation_folder_path}/race_results",
    mode="overwrite",
)