In [0]:
ipl_matches_df = spark.read.option("header", "true").csv("dbfs:/FileStore/tables/ipl_2021_matches.csv")


In [0]:
ipl_matches_df.columns


Out[7]: ['year',
 'series_type',
 'series_name',
 'match_no',
 'match_type',
 'match_name',
 'match_href',
 'match_team1',
 'match_team2',
 'match_datetime_start',
 'match_date_end',
 'match_venue']

In [0]:
ipl_matches_df = spark.read.option("header", "true").csv("dbfs:/FileStore/tables/ipl_2021_matches.csv")
ipl_matches_df.show(5, truncate=False)


+----+-----------+--------------------------+--------+----------+--------------------------------------------------------+---------------------------------------------------------------------------------------------+-----------+-----------+--------------------+--------------+-----------+
|year|series_type|series_name               |match_no|match_type|match_name                                              |match_href                                                                                   |match_team1|match_team2|match_datetime_start|match_date_end|match_venue|
+----+-----------+--------------------------+--------+----------+--------------------------------------------------------+---------------------------------------------------------------------------------------------+-----------+-----------+--------------------+--------------+-----------+
|2021|T20        |Indian Premier League 2021|null    |League    |MUMBAI INDIANS vs ROYAL CHALLENGERS BENGALURU, 1st Match|https://www

In [0]:

team_match_count = ipl_matches_df.groupBy("year", "match_team1").count() \
    .withColumnRenamed("count", "matches_played") \
    .orderBy("year", "match_team1")

team_match_count.show(truncate=False)


+----+-----------+--------------+
|year|match_team1|matches_played|
+----+-----------+--------------+
|2021|null       |62            |
+----+-----------+--------------+



In [0]:
team_match_count_team2 = ipl_matches_df.groupBy("year", "match_team2").count() \
    .withColumnRenamed("count", "matches_played") \
    .orderBy("year", "match_team2")

team_match_count_team2.show(truncate=False)

+----+-----------+--------------+
|year|match_team2|matches_played|
+----+-----------+--------------+
|2021|null       |62            |
+----+-----------+--------------+



In [0]:
from pyspark.sql.functions import col, regexp_extract, lit, sum as _sum

data = [
    ("Bumrah to Dhawan, FOUR! Cracking shot through covers", "Bumrah", "Dhawan", "FOUR"),
    ("Chahal to Raina, 1 run, nudged to midwicket", "Chahal", "Raina", "1 run"),
    ("Narine to Kohli, no run, defended solidly", "Narine", "Kohli", "no run"),
    ("Rabada to Rohit, SIX! Smashed over long-on", "Rabada", "Rohit", "SIX"),
]

ball_df = spark.createDataFrame(data, ["commentary", "bowler", "batsman", "runs_raw"])

ball_df = ball_df.withColumn(
    "runs",
    regexp_extract("runs_raw", r"(\d+)", 1).cast("int")
).fillna(0)

ball_df = ball_df.withColumn("match_id", col("bowler")) \
                 .withColumn("team", col("bowler")) \
                 .withColumn("year", lit(2021))

ball_df.show()

+--------------------+------+-------+--------+----+--------+------+----+
|          commentary|bowler|batsman|runs_raw|runs|match_id|  team|year|
+--------------------+------+-------+--------+----+--------+------+----+
|Bumrah to Dhawan,...|Bumrah| Dhawan|    FOUR|   0|  Bumrah|Bumrah|2021|
|Chahal to Raina, ...|Chahal|  Raina|   1 run|   1|  Chahal|Chahal|2021|
|Narine to Kohli, ...|Narine|  Kohli|  no run|   0|  Narine|Narine|2021|
|Rabada to Rohit, ...|Rabada|  Rohit|     SIX|   0|  Rabada|Rabada|2021|
+--------------------+------+-------+--------+----+--------+------+----+



In [0]:
from pyspark.sql import functions as F
team_scores_df = ball_df.groupBy("year", "match_id", "team") \
                        .agg(F.sum("runs").alias("total_score"))

team_scores_df.show(truncate=False)


+----+--------+------+-----------+
|year|match_id|team  |total_score|
+----+--------+------+-----------+
|2021|Bumrah  |Bumrah|0          |
|2021|Chahal  |Chahal|1          |
|2021|Narine  |Narine|0          |
|2021|Rabada  |Rabada|0          |
+----+--------+------+-----------+



In [0]:
avg_score_pivot = team_scores_df.groupBy("team") \
    .pivot("year") \
    .avg("total_score") \
    .orderBy("team")

avg_score_pivot.show()


+------+----+
|  team|2021|
+------+----+
|Bumrah| 0.0|
|Chahal| 1.0|
|Narine| 0.0|
|Rabada| 0.0|
+------+----+



In [0]:
max_score_pivot = team_scores_df.groupBy("team") \
    .pivot("year") \
    .max("total_score") \
    .orderBy("team")

max_score_pivot.show()

+------+----+
|  team|2021|
+------+----+
|Bumrah|   0|
|Chahal|   1|
|Narine|   0|
|Rabada|   0|
+------+----+

