In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, countDistinct, avg, count, collect_list, collect_set
spark = SparkSession.builder.appName("AggregationsTask").getOrCreate()
matches_df = spark.read.option("header", True).option("inferSchema", True).csv("/FileStore/tables/matches.csv")
deliveries_df = spark.read.option("header", True).option("inferSchema", True).csv("/FileStore/tables/deliveries.csv")
teams_each_year = matches_df.select("season", col("team1").alias("team")) \
    .union(matches_df.select("season", col("team2").alias("team"))) \
    .groupBy("season") \
    .agg(countDistinct("team").alias("number_of_teams"))

print("1. Number of teams each year:")
teams_each_year.show()
matches_result = matches_df.groupBy("result").count()
print("2. Number of matches abandoned, tied, and completed:")
matches_result.show()
balls_per_match_inning = deliveries_df.groupBy("match_id", "inning").count()
avg_balls = balls_per_match_inning.groupBy("inning").agg(avg("count").alias("avg_balls_per_match"))
print("3. Average number of balls per match per inning:")
avg_balls.show()
team_wins_per_year = matches_df.groupBy("season", "winner").count().orderBy("season", col("count").desc())
print("4. Number of matches each team has won in each year:")
team_wins_per_year.show()
batsmen_collect_list = deliveries_df.groupBy("match_id", "batting_team").agg(collect_list("batter").alias("batsmen_list"))
batsmen_collect_set = deliveries_df.groupBy("match_id", "batting_team").agg(collect_set("batter").alias("batsmen_set"))

print("5. Batsmen list using collect_list:")
batsmen_collect_list.show(truncate=False)

print("5. Batsmen set using collect_set:")
batsmen_collect_set.show(truncate=False)

1. Number of teams each year:
+-------+---------------+
| season|number_of_teams|
+-------+---------------+
|2009/10|              8|
|   2016|              8|
|   2012|              9|
|   2019|              8|
|   2017|              8|
|   2014|              8|
|   2013|              9|
|2020/21|              8|
|   2018|              8|
|   2009|              8|
|   2011|             10|
|   2022|             10|
|2007/08|              8|
|   2023|             10|
|   2021|              8|
|   2024|             10|
|   2015|              8|
+-------+---------------+

2. Number of matches abandoned, tied, and completed:
+---------+-----+
|   result|count|
+---------+-----+
|  wickets|  578|
|     runs|  498|
|      tie|   14|
|no result|    5|
+---------+-----+

3. Average number of balls per match per inning:
+------+-------------------+
|inning|avg_balls_per_match|
+------+-------------------+
|     1| 123.30410958904109|
|     6|                4.0|
|     3|                5.5|
| 