In [0]:
df = spark.read.option("header", True).csv("/FileStore/tables/ipl_commentary_data.csv")


In [0]:
from pyspark.sql.functions import col, avg, coalesce

df_scores = df.select("year", "team1_name", "team1_score", "team2_name", "team2_score") \
    .withColumnRenamed("team1_name", "team") \
    .withColumn("team1_score", col("team1_score").cast("double")) \
    .withColumn("team2_score", col("team2_score").cast("double"))

df_scores = df_scores.withColumn("score", coalesce(col("team1_score"), col("team2_score")))

df_scores_unified = df_scores.select("year", "team", "score") \
    .union(
        df_scores.select("year", col("team2_name").alias("team"), "team2_score")
    )

df_avg_scores = df_scores_unified.groupBy("year", "team") \
    .agg(
        avg("score").alias("avg_score")
    )

df_avg_pivot = df_avg_scores.groupBy("team") \
    .pivot("year") \
    .agg(
        avg("avg_score")
    )

df_avg_pivot.show()


+--------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------------------+--------------------------------------+----------------------------------------------------------------+
|                team|. Some real confused umpiring here. The same line a couple of overs go wasn't a wide. Samson's not impressed; and he's right in the umpire's ear"|              2017|              2018|              2019|              2020|              2021|              2022|              2023|              2024|              2025|Best: 3/7 vs RR Mumbai WS 202"|Binny st Karthik b Kuldeep Yadav 1(4)"|Rashid Khan's most expensive figures in five matches in IPL 2021|
+--------------------+----------------------------

In [0]:
from pyspark.sql.functions import col, isnull
df_cleaned = df.filter(
    (col("team1_score").cast("double").isNotNull()) & 
    (col("team2_score").cast("double").isNotNull())
)



In [0]:

df_scores = df_cleaned.select("year", "team1_name", "team1_score", "team2_name", "team2_score") \
    .withColumnRenamed("team1_name", "team") \
    .withColumn("team1_score", col("team1_score").cast("double")) \
    .withColumn("team2_score", col("team2_score").cast("double"))

df_scores = df_scores.withColumn("score", coalesce(col("team1_score"), col("team2_score")))

df_scores_unified = df_scores.select("year", "team", "score") \
    .union(
        df_scores.select("year", col("team2_name").alias("team"), "team2_score")
    )

df_avg_scores = df_scores_unified.groupBy("year", "team") \
    .agg(
        avg("score").alias("avg_score")
    )

df_avg_pivot = df_avg_scores.groupBy("team") \
    .pivot("year") \
    .agg(
        avg("avg_score")
    )

df_avg_pivot.show()


+----+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|team|              2017|              2018|              2019|              2020|              2021|              2022|              2023|              2024|              2025|
+----+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
| KKR|158.36411101951086|166.20812182741116|177.26514697060588| 159.5689349112426|152.60365853658536|159.89002403846155|176.45701624815362|201.31263858093126| 163.2472647702407|
|  DC|161.33690330477356| 170.4295010845987|157.98445193055196|163.99326923076924|155.34796076406815|168.33185971117007|155.61205653302568| 186.4934872705743|             211.0|
|  GL|172.35221078134464|              null|              null|              null|              null|         

In [0]:
from pyspark.sql import functions as F

team1_avg = df.select(
    F.col("team1_name").alias("team"),
    F.col("team1_score").alias("score"),
    "year"
)

team2_avg = df.select(
    F.col("team2_name").alias("team"),
    F.col("team2_score").alias("score"),
    "year"
)

team_scores_avg = team1_avg.union(team2_avg)

avg_scores = team_scores_avg.groupBy("team").pivot("year").agg(F.avg("score"))

avg_scores.show()


+--------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------------------+--------------------------------------+----------------------------------------------------------------+
|                team|. Some real confused umpiring here. The same line a couple of overs go wasn't a wide. Samson's not impressed; and he's right in the umpire's ear"|              2017|              2018|              2019|              2020|              2021|              2022|              2023|              2024|              2025|Best: 3/7 vs RR Mumbai WS 202"|Binny st Karthik b Kuldeep Yadav 1(4)"|Rashid Khan's most expensive figures in five matches in IPL 2021|
+--------------------+----------------------------

In [0]:

team1_max = df.select(
    F.col("team1_name").alias("team"),
    F.col("team1_score").alias("score"),
    "year"
)

team2_max = df.select(
    F.col("team2_name").alias("team"),
    F.col("team2_score").alias("score"),
    "year"
)
team_scores_max = team1_max.union(team2_max)

max_scores = team_scores_max.groupBy("team").pivot("year").agg(F.max("score"))
max_scores.show()


+--------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+----+----+----+----+----+----+----+----+----+------------------------------+--------------------------------------+----------------------------------------------------------------+
|                team|. Some real confused umpiring here. The same line a couple of overs go wasn't a wide. Samson's not impressed; and he's right in the umpire's ear"|2017|2018|2019|2020|2021|2022|2023|2024|2025|Best: 3/7 vs RR Mumbai WS 202"|Binny st Karthik b Kuldeep Yadav 1(4)"|Rashid Khan's most expensive figures in five matches in IPL 2021|
+--------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+----+----+----+----+----+----+----+----+----+------------------------------+--------------------------------------+-------------------