In [0]:
ipl_matches_df = spark.read.option("header", "true").csv("dbfs:/FileStore/tables/ipl_2021_matches.csv")


In [0]:
from pyspark.sql.functions import col

teams_each_year = ipl_matches_df.select("year", "match_team1", "match_team2") \
    .withColumn("team", col("match_team1")) \
    .union(
        ipl_matches_df.select("year", "match_team1", "match_team2")
        .withColumn("team", col("match_team2"))
    ) \
    .select("year", "team") \
    .distinct() \
    .groupBy("year").count().withColumnRenamed("count", "num_teams")

teams_each_year.show()


+----+---------+
|year|num_teams|
+----+---------+
|2021|        1|
+----+---------+



In [0]:
ipl_matches_df.columns


Out[4]: ['year',
 'series_type',
 'series_name',
 'match_no',
 'match_type',
 'match_name',
 'match_href',
 'match_team1',
 'match_team2',
 'match_datetime_start',
 'match_date_end',
 'match_venue']

In [0]:
ipl_matches_df.groupBy("match_type").count().show()


+----------+-----+
|match_type|count|
+----------+-----+
|    League|   62|
+----------+-----+



In [0]:
from pyspark.sql import Row
sample_data = [
    Row(match_id="M1", inning=1, ball_no=1.1),
    Row(match_id="M1", inning=1, ball_no=1.2),
    Row(match_id="M1", inning=1, ball_no=2.1),
    Row(match_id="M1", inning=2, ball_no=1.1),
    Row(match_id="M1", inning=2, ball_no=1.2),
    Row(match_id="M2", inning=1, ball_no=1.1),
    Row(match_id="M2", inning=1, ball_no=1.2),
    Row(match_id="M2", inning=1, ball_no=1.3),
    Row(match_id="M2", inning=2, ball_no=1.1),
    Row(match_id="M2", inning=2, ball_no=1.2),
]

ball_df = spark.createDataFrame(sample_data)
ball_df.show()


+--------+------+-------+
|match_id|inning|ball_no|
+--------+------+-------+
|      M1|     1|    1.1|
|      M1|     1|    1.2|
|      M1|     1|    2.1|
|      M1|     2|    1.1|
|      M1|     2|    1.2|
|      M2|     1|    1.1|
|      M2|     1|    1.2|
|      M2|     1|    1.3|
|      M2|     2|    1.1|
|      M2|     2|    1.2|
+--------+------+-------+



In [0]:
from pyspark.sql.functions import count, avg

balls_per_match_inning = ball_df.groupBy("match_id", "inning") \
    .agg(count("ball_no").alias("total_balls"))

avg_balls_per_inning = balls_per_match_inning.groupBy("inning") \
    .agg(avg("total_balls").alias("avg_balls_per_match"))

avg_balls_per_inning.show()


+------+-------------------+
|inning|avg_balls_per_match|
+------+-------------------+
|     1|                3.0|
|     2|                2.0|
+------+-------------------+



In [0]:
from pyspark.sql import Row
sample_matches = [
    Row(year=2021, match_team1="MI", match_team2="RCB", winner="MI"),
    Row(year=2021, match_team1="CSK", match_team2="KKR", winner="CSK"),
    Row(year=2022, match_team1="MI", match_team2="CSK", winner="CSK"),
    Row(year=2022, match_team1="RCB", match_team2="KKR", winner="KKR"),
    Row(year=2023, match_team1="GT", match_team2="RR", winner="GT"),
    Row(year=2023, match_team1="MI", match_team2="RCB", winner="RCB"),
]

matches_df = spark.createDataFrame(sample_matches)
matches_df.show()


+----+-----------+-----------+------+
|year|match_team1|match_team2|winner|
+----+-----------+-----------+------+
|2021|         MI|        RCB|    MI|
|2021|        CSK|        KKR|   CSK|
|2022|         MI|        CSK|   CSK|
|2022|        RCB|        KKR|   KKR|
|2023|         GT|         RR|    GT|
|2023|         MI|        RCB|   RCB|
+----+-----------+-----------+------+



In [0]:
from pyspark.sql.functions import col

team_wins_per_year = matches_df.groupBy("year", "winner") \
    .count().withColumnRenamed("count", "matches_won") \
    .orderBy("year", col("matches_won").desc())

team_wins_per_year.show()


+----+------+-----------+
|year|winner|matches_won|
+----+------+-----------+
|2021|    MI|          1|
|2021|   CSK|          1|
|2022|   CSK|          1|
|2022|   KKR|          1|
|2023|    GT|          1|
|2023|   RCB|          1|
+----+------+-----------+



In [0]:
from pyspark.sql import Row
sample_ball_data = [
    Row(match_id="M1", batting_team="MI", batsman="Rohit Sharma"),
    Row(match_id="M1", batting_team="MI", batsman="Suryakumar Yadav"),
    Row(match_id="M1", batting_team="RCB", batsman="Virat Kohli"),
    Row(match_id="M1", batting_team="RCB", batsman="AB de Villiers"),
    Row(match_id="M2", batting_team="CSK", batsman="MS Dhoni"),
    Row(match_id="M2", batting_team="CSK", batsman="Ruturaj Gaikwad"),
    Row(match_id="M2", batting_team="KKR", batsman="Shubman Gill"),
    Row(match_id="M2", batting_team="KKR", batsman="Andre Russell")
]
ball_df = spark.createDataFrame(sample_ball_data)
ball_df.show()


+--------+------------+----------------+
|match_id|batting_team|         batsman|
+--------+------------+----------------+
|      M1|          MI|    Rohit Sharma|
|      M1|          MI|Suryakumar Yadav|
|      M1|         RCB|     Virat Kohli|
|      M1|         RCB|  AB de Villiers|
|      M2|         CSK|        MS Dhoni|
|      M2|         CSK| Ruturaj Gaikwad|
|      M2|         KKR|    Shubman Gill|
|      M2|         KKR|   Andre Russell|
+--------+------------+----------------+



In [0]:
from pyspark.sql.functions import collect_list, collect_set
batsmen_comparison = ball_df.groupBy("match_id", "batting_team") \
    .agg(
        collect_list("batsman").alias("batsmen_list"),
        collect_set("batsman").alias("batsmen_set")
    )

batsmen_comparison.show(truncate=False)


+--------+------------+--------------------------------+--------------------------------+
|match_id|batting_team|batsmen_list                    |batsmen_set                     |
+--------+------------+--------------------------------+--------------------------------+
|M1      |MI          |[Rohit Sharma, Suryakumar Yadav]|[Suryakumar Yadav, Rohit Sharma]|
|M1      |RCB         |[Virat Kohli, AB de Villiers]   |[Virat Kohli, AB de Villiers]   |
|M2      |CSK         |[MS Dhoni, Ruturaj Gaikwad]     |[MS Dhoni, Ruturaj Gaikwad]     |
|M2      |KKR         |[Shubman Gill, Andre Russell]   |[Shubman Gill, Andre Russell]   |
+--------+------------+--------------------------------+--------------------------------+

