In [0]:
%run "../includes/configuration"

### Built-in aggregate functions

In [0]:
# Read the race_results dataset and keep only the 2020 rows.
# `spark` and `presentation_folder_path` are not defined in this cell —
# presumably provided by the notebook included via %run above; TODO confirm.
data = (
    spark.read
    .parquet(f"{presentation_folder_path}/race_results")
    .filter("race_year = 2020")
)

display(data)

race_year,race_name,race_date,circuit_location,driver_name,driver_number,driver_nationality,team,fastest_lap,race_time,points,created_date
2020,Hungarian Grand Prix,2020-07-19,Budapest,Lewis Hamilton,44,British,Mercedes,70.0,1:36:12.473,26.0,2023-08-05T05:13:08.813+0000
2020,Tuscan Grand Prix,2020-09-13,Mugello,Lewis Hamilton,44,British,Mercedes,58.0,2:19:35.060,26.0,2023-08-05T05:13:08.813+0000
2020,Russian Grand Prix,2020-09-27,Sochi,Valtteri Bottas,77,Finnish,Mercedes,51.0,1:34:00.364,26.0,2023-08-05T05:13:08.813+0000
2020,Portuguese Grand Prix,2020-10-25,Portimão,Lewis Hamilton,44,British,Mercedes,63.0,1:29:56.828,26.0,2023-08-05T05:13:08.813+0000
2020,Emilia Romagna Grand Prix,2020-11-01,Imola,Lewis Hamilton,44,British,Mercedes,63.0,1:28:32.430,26.0,2023-08-05T05:13:08.813+0000
2020,Austrian Grand Prix,2020-07-05,Spielburg,Valtteri Bottas,77,Finnish,Mercedes,68.0,1:30:55.739,25.0,2023-08-05T05:13:08.813+0000
2020,Styrian Grand Prix,2020-07-12,Spielburg,Lewis Hamilton,44,British,Mercedes,68.0,1:22:50.683,25.0,2023-08-05T05:13:08.813+0000
2020,British Grand Prix,2020-08-02,Silverstone,Lewis Hamilton,44,British,Mercedes,45.0,1:28:01.283,25.0,2023-08-05T05:13:08.813+0000
2020,70th Anniversary Grand Prix,2020-08-09,Silverstone,Max Verstappen,33,Dutch,Red Bull,46.0,1:19:41.993,25.0,2023-08-05T05:13:08.813+0000
2020,Spanish Grand Prix,2020-08-16,Montmeló,Lewis Hamilton,44,British,Mercedes,63.0,1:31:45.279,25.0,2023-08-05T05:13:08.813+0000


In [0]:
# NOTE: importing `sum` here rebinds the name `sum` in the notebook namespace,
# so every later cell gets Spark's column aggregate instead of Python's
# built-in sum().
from pyspark.sql.functions import count, countDistinct, sum

# Count all rows in the 2020 result set.
(
    data
    .select(count("*"))
    .show()
)

+--------+
|count(1)|
+--------+
|     340|
+--------+



In [0]:
# Count the distinct race names present in the 2020 result set.
(
    data
    .select(countDistinct("race_name"))
    .show()
)

+-------------------------+
|count(DISTINCT race_name)|
+-------------------------+
|                       17|
+-------------------------+



In [0]:
# Add up the points column across every 2020 row.
# `sum` is the pyspark.sql.functions aggregate imported in an earlier cell.
(
    data
    .select(sum("points"))
    .show()
)

+-----------+
|sum(points)|
+-----------+
|     1734.0|
+-----------+



In [0]:
# Summary for one driver: total points and number of distinct race names.
#
# The output columns are named with .alias() at aggregation time instead of
# renaming Spark's auto-generated column names afterwards. withColumnRenamed
# is a silent no-op when the given source name is not in the schema, so a
# mismatch with the generated name (e.g. "count(DISTINCT race_name)") would
# leave the column unrenamed without any error. This also matches the
# .agg(... .alias(...)) style used by the groupBy cells in this notebook.
(
    data
    .filter("driver_name = 'Lewis Hamilton'")
    .select(
        sum("points").alias("total_points"),
        countDistinct("race_name").alias("number_of_races"),
    )
    .show()
)

+------------+---------------+
|total_points|number_of_races|
+------------+---------------+
|       347.0|             16|
+------------+---------------+



### GroupBy

In [0]:
# Points total per driver, using the grouped-data sum() shortcut; the result
# column keeps the default name shown in the output below.
(
    data
    .groupBy("driver_name")
    .sum("points")
    .show()
)

+------------------+-----------+
|       driver_name|sum(points)|
+------------------+-----------+
|       Jack Aitken|        0.0|
|      Daniil Kvyat|       32.0|
|   Kevin Magnussen|        1.0|
|      Sergio Pérez|      125.0|
|      Carlos Sainz|      105.0|
|    Kimi Räikkönen|        4.0|
|   Romain Grosjean|        2.0|
|   Charles Leclerc|       98.0|
|   Alexander Albon|      105.0|
|      Lance Stroll|       75.0|
|      Pierre Gasly|       75.0|
|    Lewis Hamilton|      347.0|
|   Nico Hülkenberg|       10.0|
|  Daniel Ricciardo|      119.0|
|   Valtteri Bottas|      223.0|
|Antonio Giovinazzi|        4.0|
|      Lando Norris|       97.0|
|    Max Verstappen|      214.0|
|    George Russell|        3.0|
|  Sebastian Vettel|       33.0|
+------------------+-----------+
only showing top 20 rows



In [0]:
# Several aggregates per driver in one pass: agg() accepts multiple column
# expressions, each given an explicit output name via alias().
(
    data
    .groupBy("driver_name")
    .agg(
        sum("points").alias("total_points"),
        countDistinct("race_name").alias("number_of_races"),
    )
    .show()
)

+------------------+------------+---------------+
|       driver_name|total_points|number_of_races|
+------------------+------------+---------------+
|       Jack Aitken|         0.0|              1|
|      Daniil Kvyat|        32.0|             17|
|   Kevin Magnussen|         1.0|             17|
|      Sergio Pérez|       125.0|             15|
|      Carlos Sainz|       105.0|             17|
|    Kimi Räikkönen|         4.0|             17|
|   Romain Grosjean|         2.0|             15|
|   Charles Leclerc|        98.0|             17|
|   Alexander Albon|       105.0|             17|
|      Lance Stroll|        75.0|             16|
|      Pierre Gasly|        75.0|             17|
|    Lewis Hamilton|       347.0|             16|
|   Nico Hülkenberg|        10.0|              3|
|  Daniel Ricciardo|       119.0|             17|
|   Valtteri Bottas|       223.0|             17|
|Antonio Giovinazzi|         4.0|             17|
|      Lando Norris|        97.0|             17|


## Window Functions

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc, rank

In [0]:
# Read the race_results dataset again, this time without a year filter.
df = (
    spark.read
    .parquet(f"{presentation_folder_path}/race_results")
)

# Restrict the window-function demo to the 2019 and 2020 rows.
demo_df = (
    df
    .filter("race_year in (2019, 2020)")
)

In [0]:
# One row per (race_year, driver_name) with the summed points and the number
# of distinct race names for that pair.
demo_grouped_df = (
    demo_df
    .groupBy("race_year", "driver_name")
    .agg(
        sum("points").alias("total_points"),
        countDistinct("race_name").alias("number_of_races"),
    )
)

In [0]:
# Window definition: one partition per race_year, rows ordered by
# total_points from highest to lowest.
driverRankSpec = (
    Window
    .partitionBy("race_year")
    .orderBy(desc("total_points"))
)

# Attach each driver's rank within their race_year partition. The frame is
# reassigned to the same name, so later cells see the added "rank" column.
demo_grouped_df = demo_grouped_df.withColumn(
    "rank",
    rank().over(driverRankSpec),
)

In [0]:
# Render the grouped frame, which now includes the "rank" column added above.
display(demo_grouped_df)

race_year,driver_name,total_points,number_of_races,rank
2019,Lewis Hamilton,413.0,21,1
2019,Valtteri Bottas,326.0,21,2
2019,Max Verstappen,278.0,21,3
2019,Charles Leclerc,264.0,21,4
2019,Sebastian Vettel,240.0,21,5
2019,Carlos Sainz,96.0,21,6
2019,Pierre Gasly,95.0,21,7
2019,Alexander Albon,92.0,21,8
2019,Daniel Ricciardo,54.0,21,9
2019,Sergio Pérez,52.0,21,10
