In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, countDistinct

In [0]:
spark = SparkSession.builder.appName("IPL EDA").getOrCreate()

In [0]:
df = spark.read.option("header", True).option("inferSchema", True).csv("/FileStore/tables/matches.csv")

In [0]:
df.show(5)

+------+-------+----------+----------+----------+---------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+-------------+-----------+------------+----------+------+---------+--------------+
|    id| season|      city|      date|match_type|player_of_match|               venue|               team1|               team2|         toss_winner|toss_decision|              winner| result|result_margin|target_runs|target_overs|super_over|method|  umpire1|       umpire2|
+------+-------+----------+----------+----------+---------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+-------------+-----------+------------+----------+------+---------+--------------+
|335982|2007/08| Bangalore|2008-04-18|    League|    BB McCullum|M Chinnaswamy Sta...|Royal Challengers...|Kolkata Knight Ri...|Royal Challengers...|        field|Kolkata Knig

In [0]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- season: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- match_type: string (nullable = true)
 |-- player_of_match: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- team1: string (nullable = true)
 |-- team2: string (nullable = true)
 |-- toss_winner: string (nullable = true)
 |-- toss_decision: string (nullable = true)
 |-- winner: string (nullable = true)
 |-- result: string (nullable = true)
 |-- result_margin: string (nullable = true)
 |-- target_runs: string (nullable = true)
 |-- target_overs: string (nullable = true)
 |-- super_over: string (nullable = true)
 |-- method: string (nullable = true)
 |-- umpire1: string (nullable = true)
 |-- umpire2: string (nullable = true)



In [0]:
teams_df = df.select("team1").union(df.select("team2")).distinct()
teams_df.show(truncate=False)
print(f" Total number of unique teams: {teams_df.count()}")



+---------------------------+
|team1                      |
+---------------------------+
|Sunrisers Hyderabad        |
|Lucknow Super Giants       |
|Chennai Super Kings        |
|Gujarat Titans             |
|Royal Challengers Bengaluru|
|Rising Pune Supergiant     |
|Deccan Chargers            |
|Kochi Tuskers Kerala       |
|Rajasthan Royals           |
|Gujarat Lions              |
|Royal Challengers Bangalore|
|Kolkata Knight Riders      |
|Rising Pune Supergiants    |
|Kings XI Punjab            |
|Punjab Kings               |
|Pune Warriors              |
|Delhi Daredevils           |
|Delhi Capitals             |
|Mumbai Indians             |
+---------------------------+

✅ Total number of unique teams: 19


In [0]:
print(" Tie-breaker techniques used when result is 'tie':")
df.filter(col("result") == "tie").select("season", "team1", "team2", "super_over", "method").distinct().show()


✅ Tie-breaker techniques used when result is 'tie':
+-------+--------------------+--------------------+----------+------+
| season|               team1|               team2|super_over|method|
+-------+--------------------+--------------------+----------+------+
|   2021|      Delhi Capitals| Sunrisers Hyderabad|         Y|    NA|
|2009/10| Chennai Super Kings|     Kings XI Punjab|         Y|    NA|
|   2009|Kolkata Knight Ri...|    Rajasthan Royals|         Y|    NA|
|2020/21|      Delhi Capitals|     Kings XI Punjab|         Y|    NA|
|   2019|Kolkata Knight Ri...|      Delhi Capitals|         Y|    NA|
|2020/21|      Mumbai Indians|     Kings XI Punjab|         Y|    NA|
|   2014|Kolkata Knight Ri...|    Rajasthan Royals|         Y|    NA|
|   2019|      Mumbai Indians| Sunrisers Hyderabad|         Y|    NA|
|   2013|Royal Challengers...|    Delhi Daredevils|         Y|    NA|
|   2017|       Gujarat Lions|      Mumbai Indians|         Y|    NA|
|2020/21|Royal Challengers...|      Mu

In [0]:
completed_matches = df.filter((col("result").isNotNull()) & (col("result") != "no result"))
print("Completed matches:")
completed_matches.select("season", "team1", "team2", "winner", "result", "result_margin").show(5)


Completed matches:
+-------+--------------------+--------------------+--------------------+-------+-------------+
| season|               team1|               team2|              winner| result|result_margin|
+-------+--------------------+--------------------+--------------------+-------+-------------+
|2007/08|Royal Challengers...|Kolkata Knight Ri...|Kolkata Knight Ri...|   runs|          140|
|2007/08|     Kings XI Punjab| Chennai Super Kings| Chennai Super Kings|   runs|           33|
|2007/08|    Delhi Daredevils|    Rajasthan Royals|    Delhi Daredevils|wickets|            9|
|2007/08|      Mumbai Indians|Royal Challengers...|Royal Challengers...|wickets|            5|
|2007/08|Kolkata Knight Ri...|     Deccan Chargers|Kolkata Knight Ri...|wickets|            5|
+-------+--------------------+--------------------+--------------------+-------+-------------+
only showing top 5 rows



In [0]:
columns_to_drop = ["umpire1", "umpire2", "id"]
cleaned_df = completed_matches.drop(*columns_to_drop)
print(" Cleaned DataFrame:")
cleaned_df.show(5)

 Cleaned DataFrame:
+-------+----------+----------+----------+---------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+-------------+-----------+------------+----------+------+
| season|      city|      date|match_type|player_of_match|               venue|               team1|               team2|         toss_winner|toss_decision|              winner| result|result_margin|target_runs|target_overs|super_over|method|
+-------+----------+----------+----------+---------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+-------------+-----------+------------+----------+------+
|2007/08| Bangalore|2008-04-18|    League|    BB McCullum|M Chinnaswamy Sta...|Royal Challengers...|Kolkata Knight Ri...|Royal Challengers...|        field|Kolkata Knight Ri...|   runs|          140|        223|          20|         N|    NA|
|2007/08

In [0]:
print(f" Total matches played: {cleaned_df.count()}")

 Total matches played: 1090


In [0]:
print(" Matches per season:")
cleaned_df.groupBy("season").count().orderBy("season").show()


 Matches per season:
+-------+-----+
| season|count|
+-------+-----+
|2007/08|   58|
|   2009|   57|
|2009/10|   60|
|   2011|   72|
|   2012|   74|
|   2013|   76|
|   2014|   60|
|   2015|   57|
|   2016|   60|
|   2017|   59|
|   2018|   60|
|   2019|   59|
|2020/21|   60|
|   2021|   60|
|   2022|   74|
|   2023|   73|
|   2024|   71|
+-------+-----+



In [0]:
print(" Top 5 winning teams:")
cleaned_df.groupBy("winner").count().orderBy(col("count").desc()).show(5)


 Top 5 winning teams:
+--------------------+-----+
|              winner|count|
+--------------------+-----+
|      Mumbai Indians|  144|
| Chennai Super Kings|  138|
|Kolkata Knight Ri...|  131|
|Royal Challengers...|  116|
|    Rajasthan Royals|  112|
+--------------------+-----+
only showing top 5 rows



In [0]:
print(" Top 5 most used venues:")
cleaned_df.groupBy("venue").count().orderBy(col("count").desc()).show(5)

 Top 5 most used venues:
+--------------------+-----+
|               venue|count|
+--------------------+-----+
|        Eden Gardens|   77|
|    Wankhede Stadium|   73|
|M Chinnaswamy Sta...|   63|
|    Feroz Shah Kotla|   59|
|Rajiv Gandhi Inte...|   49|
+--------------------+-----+
only showing top 5 rows



In [0]:
tie_breakers = df.select("super_over", "method", "result") \
    .where((col("result") == "tie") | (col("super_over") == "Y") | (col("method").isNotNull())) \
    .distinct()

print(" Tie-breaker techniques used (e.g., Super Over or Method like DLS):")
tie_breakers.show(truncate=False)

 Tie-breaker techniques used (e.g., Super Over or Method like DLS):
+----------+------+---------+
|super_over|method|result   |
+----------+------+---------+
|N         |D/L   |runs     |
|N         |D/L   |wickets  |
|N         |NA    |no result|
|N         |NA    |runs     |
|N         |NA    |wickets  |
|Y         |NA    |tie      |
+----------+------+---------+



In [0]:
cleaned_df = df.filter((col("result").isNotNull()) & (col("result") != "no result")) \
               .drop("id", "umpire1", "umpire2")

In [0]:
cleaned_df.write.format("delta").mode("overwrite").saveAsTable("ipl_completed_matches_delta")

In [0]:
spark.sql("SHOW TABLES").show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| default|ipl_completed_mat...|      false|
+--------+--------------------+-----------+



In [0]:
spark.sql("SELECT * FROM ipl_completed_matches_delta LIMIT 5").show()

+-------+----------+----------+----------+---------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+-------------+-----------+------------+----------+------+
| season|      city|      date|match_type|player_of_match|               venue|               team1|               team2|         toss_winner|toss_decision|              winner| result|result_margin|target_runs|target_overs|super_over|method|
+-------+----------+----------+----------+---------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+-------------+-----------+------------+----------+------+
|2007/08| Bangalore|2008-04-18|    League|    BB McCullum|M Chinnaswamy Sta...|Royal Challengers...|Kolkata Knight Ri...|Royal Challengers...|        field|Kolkata Knight Ri...|   runs|          140|        223|          20|         N|    NA|
|2007/08|Chandigarh|2008-04-