### В конце каждого месяца компания выдает премию для своих курьеров, средняя скорость доставки за прошедший месяц которых больше средней скорости среди всех курьеров. Сколько курьеров получили премию за июнь 2021 года?

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import year, month, col, round, avg, lit

In [3]:
# Configure a Spark session for this notebook: application name, a
# "local[*]" master and one extra JVM option for the driver.
# NOTE(review): "-Djava.security.manager=allow" is presumably a workaround for
# running on a recent JDK — confirm it is still needed in this environment.
session_builder = (
    SparkSession.builder
    .appName("CourierBonusPySpark")
    .master("local[*]")
    .config("spark.driver.extraJavaOptions", "-Djava.security.manager=allow")
)

# getOrCreate() hands back an already-running session if one exists,
# otherwise it starts a new one with the settings above.
spark = session_builder.getOrCreate()

print(f"Spark version: {spark.version}")

Spark version: 4.0.1


#### Step 1. Loading and previewing the data

In [4]:
# Imported here because this is the only cell that needs it.
from pyspark.errors import AnalysisException

# Load the courier orders dataset.
# Spark reports a missing or unreadable path as AnalysisException (raised from
# the JVM side), not as Python's FileNotFoundError, so that is the exception
# that has to be caught. FileNotFoundError is still listed so the handler keeps
# covering everything the previous version did.
try:
    df = spark.read.parquet("data/couriers_orders.parquet")
except (AnalysisException, FileNotFoundError) as e:
    print(f"{e}")
    spark.stop()
    # Re-raise: without the data there is nothing to analyse, and continuing
    # would fail on the next line with an unrelated NameError for `df`.
    raise

# Quick look at the column types and the first few rows.
df.printSchema()
df.show(5)

root
 |-- date: timestamp_ntz (nullable = true)
 |-- courier_id: long (nullable = true)
 |-- order_id: long (nullable = true)
 |-- distance: double (nullable = true)
 |-- travel_time: double (nullable = true)

+-------------------+----------+--------+--------+-----------+
|               date|courier_id|order_id|distance|travel_time|
+-------------------+----------+--------+--------+-----------+
|2021-07-12 00:00:00|        10|       1|     1.9|      36.17|
|2021-07-02 00:00:00|         3|       2|    3.98|      21.34|
|2021-04-15 00:00:00|         6|       3|    3.98|      43.33|
|2021-07-16 00:00:00|        10|       4|    2.85|      14.01|
|2021-06-11 00:00:00|        10|       5|    4.89|      32.09|
+-------------------+----------+--------+--------+-----------+
only showing top 5 rows


#### Step 2. Filtering June 2021 records and adding the "speed_km/h" column

In [5]:
# Row predicates, named so the filtering step below reads as a sentence.
order_date = col("date")
is_june_2021 = (year(order_date) == 2021) & (month(order_date) == 6)
# Rows with a non-positive distance or travel time are dropped; a zero
# travel time would also make the speed division below meaningless.
has_positive_measurements = (col("distance") > 0) & (col("travel_time") > 0)

# Orders dated June 2021 that carry usable distance/time values.
june_df = df.where(is_june_2021).where(has_positive_measurements)

# Per-order speed = distance / (travel_time / 60), rounded to 2 decimals.
# NOTE(review): the division by 60 and the column name imply travel_time is in
# minutes and distance in kilometres — confirm against the data source.
# NOTE: `round` is pyspark.sql.functions.round (imported above), not the
# builtin; the stored speeds are already rounded, so every later average is
# taken over rounded per-order values.
travel_time_hours = col("travel_time") / 60
june_with_speed_df = june_df.withColumn(
    "speed_km/h",
    round(col("distance") / travel_time_hours, 2),
)

june_with_speed_df.show(5)

+-------------------+----------+--------+--------+-----------+----------+
|               date|courier_id|order_id|distance|travel_time|speed_km/h|
+-------------------+----------+--------+--------+-----------+----------+
|2021-06-11 00:00:00|        10|       5|    4.89|      32.09|      9.14|
|2021-06-14 00:00:00|         4|       9|    4.13|      29.34|      8.45|
|2021-06-27 00:00:00|         8|      10|    1.04|      12.56|      4.97|
|2021-06-27 00:00:00|         1|      19|    1.85|      13.56|      8.19|
|2021-06-28 00:00:00|         2|      25|    4.02|      12.43|      19.4|
+-------------------+----------+--------+--------+-----------+----------+
only showing top 5 rows


#### Step 3. Overall average speed across all couriers

In [6]:
# Mean of the per-order speeds over every June 2021 order, regardless of
# courier. The aggregation produces a single-row DataFrame.
overall_speed_expr = avg(col("speed_km/h")).alias("whole_avg_speed")
overall_speed_row = june_with_speed_df.agg(overall_speed_expr).first()

# Despite the `_df` suffix this is a plain Python value pulled out of that
# row, not a DataFrame; the name is kept because later cells refer to it.
whole_avg_speed_df = overall_speed_row["whole_avg_speed"]

# Bare expression so the notebook echoes the value.
whole_avg_speed_df

6.2771724137930995

#### Step 4. Average speed of each courier

In [7]:
# Mean of the per-order speeds for each courier in June 2021.
#
# The mean is deliberately kept at full precision. This column is later
# compared with the unrounded overall average, and rounding it first can flip
# the outcome for a courier close to the threshold: a true average of 6.276
# would round up to 6.28 and wrongly beat an overall average of 6.2772.
courier_avg_speed_df = june_with_speed_df.groupBy("courier_id").agg(
    avg(col("speed_km/h")).alias("courier_avg_speed")
)

# Round only the copy that is printed; `round` here is
# pyspark.sql.functions.round (imported above), not the builtin.
courier_avg_speed_df.withColumn(
    "courier_avg_speed", round(col("courier_avg_speed"), 2)
).show()

+----------+-----------------+
|courier_id|courier_avg_speed|
+----------+-----------------+
|         7|              5.8|
|         6|             6.79|
|         9|             4.97|
|         5|             5.33|
|         1|             6.75|
|        10|             6.58|
|         3|             6.22|
|         8|             6.83|
|         2|             6.53|
|         4|             7.69|
+----------+-----------------+



#### Step 5. Results

In [8]:
# A courier earns the bonus when their own June average speed is strictly
# greater than the overall June average (`whole_avg_speed_df`, a plain number
# despite its name).
courier_with_bonus_df = courier_avg_speed_df.filter(
    col("courier_avg_speed") > lit(whole_avg_speed_df)
)

bonus_count = courier_with_bonus_df.count()

# An average over zero rows comes back as None; formatting None with ":.2f"
# would raise TypeError before the session is stopped, so fall back to a
# placeholder in that case.
overall_speed_text = (
    "n/a" if whole_avg_speed_df is None else f"{whole_avg_speed_df:.2f}"
)

print("=" * 40)
print(f"    {bonus_count} couriers with bonus")
# The figure printed here is the threshold (average over all couriers), not
# the average of the bonus recipients, so the message says so explicitly.
print(f"    Average speed across all couriers is {overall_speed_text} km/h")
print("    List of these couriers:")
(
    courier_with_bonus_df
    .orderBy(col("courier_id"))
    # Two-decimal formatting for display only (a no-op if the column is
    # already rounded); the filter above is unaffected.
    .withColumn("courier_avg_speed", round(col("courier_avg_speed"), 2))
    .show()
)
print("=" * 40)

# Release the local Spark resources once the report has been printed.
spark.stop()

    6 couriers with bonus
    Their average speed is 6.28 km/h
    List of these couriers:
+----------+-----------------+
|courier_id|courier_avg_speed|
+----------+-----------------+
|         1|             6.75|
|         2|             6.53|
|         4|             7.69|
|         6|             6.79|
|         8|             6.83|
|        10|             6.58|
+----------+-----------------+

