In [0]:
%run ../includes/common_functions

In [0]:
%run ../includes/configuration

**Read all necessary files from ADL-container into dataframes**

In [0]:
results_df = spark.read.parquet(f"{processed_folder_path}/results") \
.withColumnRenamed("time", "race_time")

In [0]:
races_df = spark.read.parquet(f"{processed_folder_path}/races") \
.withColumnRenamed("year", "race_year") \
.withColumnRenamed("name", "race_name") \
.withColumnRenamed("date", "race_date")

circuits_df = spark.read.parquet(f"{processed_folder_path}/circuits") \
.withColumnRenamed("location", "circuit_location")

In [0]:
constructors_df = spark.read.parquet(f"{processed_folder_path}/constructors") \
.withColumnRenamed("nationality", "team")

In [0]:
drivers_df = spark.read.parquet(f"{processed_folder_path}/drivers") \
.withColumnRenamed("nationality", "driver_nationality") \
.withColumnRenamed("name", "driver_name") \
.withColumnRenamed("number", "driver_number")

**Join into single dataframe with selected columns**

In [0]:
join_df = races_df.join(circuits_df, on=circuits_df.circuit_id == races_df.circuit_id) \
.join(results_df, on=results_df.race_id == races_df.race_id) \
.join(drivers_df, on=drivers_df.driver_id == results_df.driver_id) \
.join(constructors_df, on=constructors_df.constructor_id == results_df.constructor_id) \
.select(
  races_df.race_year, races_df.race_name, races_df.race_date, circuits_df.circuit_location, drivers_df.driver_name, drivers_df.driver_nationality, drivers_df.driver_number, constructors_df.team, results_df.grid, results_df.fastest_lap, results_df.race_time, results_df.points) \
.withColumn("created_date", current_timestamp())

In [0]:
display(join_df)

race_year,race_name,race_date,circuit_location,driver_name,driver_nationality,driver_number,team,grid,fastest_lap,race_time,points,created_date
2018,Australian Grand Prix,2018-03-25,Melbourne,Sergey Sirotkin,Russian,35,British,19,3.0,\N,0.0,2021-09-07T09:15:02.824+0000
2018,Australian Grand Prix,2018-03-25,Melbourne,Marcus Ericsson,Swedish,9,Swiss,17,4.0,\N,0.0,2021-09-07T09:15:02.824+0000
2018,Australian Grand Prix,2018-03-25,Melbourne,Pierre Gasly,French,10,Italian,20,13.0,\N,0.0,2021-09-07T09:15:02.824+0000
2018,Australian Grand Prix,2018-03-25,Melbourne,Kevin Magnussen,Danish,20,American,5,21.0,\N,0.0,2021-09-07T09:15:02.824+0000
2018,Australian Grand Prix,2018-03-25,Melbourne,Romain Grosjean,French,8,American,6,23.0,\N,0.0,2021-09-07T09:15:02.824+0000
2018,Australian Grand Prix,2018-03-25,Melbourne,Brendon Hartley,New Zealander,28,Italian,16,57.0,\N,0.0,2021-09-07T09:15:02.824+0000
2018,Australian Grand Prix,2018-03-25,Melbourne,Lance Stroll,Canadian,18,British,13,55.0,+1:18.288,0.0,2021-09-07T09:15:02.824+0000
2018,Australian Grand Prix,2018-03-25,Melbourne,Charles Leclerc,Monegasque,16,Swiss,18,56.0,+1:15.759,0.0,2021-09-07T09:15:02.824+0000
2018,Australian Grand Prix,2018-03-25,Melbourne,Esteban Ocon,French,31,Indian,14,57.0,+1:00.278,0.0,2021-09-07T09:15:02.824+0000
2018,Australian Grand Prix,2018-03-25,Melbourne,Sergio Pérez,Mexican,11,Indian,12,51.0,+46.817,0.0,2021-09-07T09:15:02.824+0000


In [0]:
display(join_df.filter("race_year == 2020 and race_name == 'Abu Dhabi Grand Prix'").orderBy(join_df.points.desc()))

race_year,race_name,race_date,circuit_location,driver_name,driver_nationality,driver_number,team,grid,fastest_lap,race_time,points,created_date
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Max Verstappen,Dutch,33,Austrian,1,14,1:36:28.645,25.0,2021-09-07T09:35:06.682+0000
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Valtteri Bottas,Finnish,77,German,2,40,+15.976,18.0,2021-09-07T09:35:06.682+0000
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Lewis Hamilton,British,44,German,3,37,+18.415,15.0,2021-09-07T09:35:06.682+0000
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Alexander Albon,Thai,23,Austrian,5,42,+19.987,12.0,2021-09-07T09:35:06.682+0000
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Lando Norris,British,4,British,4,53,+1:00.729,10.0,2021-09-07T09:35:06.682+0000
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Carlos Sainz,Spanish,55,British,6,48,+1:05.662,8.0,2021-09-07T09:35:06.682+0000
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Daniel Ricciardo,Australian,3,French,11,55,+1:13.748,7.0,2021-09-07T09:35:06.682+0000
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Pierre Gasly,French,10,Italian,9,53,+1:29.718,4.0,2021-09-07T09:35:06.682+0000
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Esteban Ocon,French,31,French,10,47,+1:41.069,2.0,2021-09-07T09:35:06.682+0000
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Lance Stroll,Canadian,18,British,8,41,+1:42.738,1.0,2021-09-07T09:35:06.682+0000


In [0]:
join_df.write.mode("overwrite").partitionBy("race_year").parquet("/mnt/formuladlgen/presentation/race_results")