In [0]:
# Load the IPL 2021 match list from DBFS. The first CSV row is treated as the
# header; no schema is supplied, so the columns are read as strings.
matches_csv_path = "dbfs:/FileStore/tables/ipl_2021_matches.csv"
ipl_matches_df = (
    spark.read
    .option("header", "true")
    .csv(matches_csv_path)
)


In [0]:
# Sanity-check the load: print the column names/types, then the first five
# rows with full (untruncated) cell contents.
ipl_matches_df.printSchema()
ipl_matches_df.show(n=5, truncate=False)


root
 |-- year: string (nullable = true)
 |-- series_type: string (nullable = true)
 |-- series_name: string (nullable = true)
 |-- match_no: string (nullable = true)
 |-- match_type: string (nullable = true)
 |-- match_name: string (nullable = true)
 |-- match_href: string (nullable = true)
 |-- match_team1: string (nullable = true)
 |-- match_team2: string (nullable = true)
 |-- match_datetime_start: string (nullable = true)
 |-- match_date_end: string (nullable = true)
 |-- match_venue: string (nullable = true)

+----+-----------+--------------------------+--------+----------+--------------------------------------------------------+---------------------------------------------------------------------------------------------+-----------+-----------+--------------------+--------------+-----------+
|year|series_type|series_name               |match_no|match_type|match_name                                              |match_href                                                          

In [0]:
from pyspark.sql.functions import regexp_extract

# NOTE(review): `df` is not defined in any cell visible in this notebook --
# presumably a DataFrame with a string `commentary` column; confirm where it
# is created so the notebook survives Restart & Run All.
#
# Bowler = the shortest leading text that is followed by " to"
# (e.g. "Jasprit Bumrah to Virat Kohli, 1 run" -> "Jasprit Bumrah").
bowler_expr = regexp_extract("commentary", r"^(.+?) to", 1)
df_bowler = df.withColumn("Bowler", bowler_expr)

df_bowler.select("commentary", "Bowler").show(truncate=False)


+--------------------------------------+----------------+
|commentary                            |Bowler          |
+--------------------------------------+----------------+
|Jasprit Bumrah to Virat Kohli, 1 run  |Jasprit Bumrah  |
|Rashid Khan to MS Dhoni, SIX runs     |Rashid Khan     |
|Trent Boult to Ruturaj Gaikwad, no run|Trent Boult     |
|Sunil Narine to Rohit Sharma, FOUR    |Sunil Narine    |
|Yuzvendra Chahal to KL Rahul, OUT     |Yuzvendra Chahal|
+--------------------------------------+----------------+



In [0]:
# Batsman = the text between the first "to " and the next comma
# (e.g. "Rashid Khan to MS Dhoni, SIX runs" -> "MS Dhoni").
batsman_expr = regexp_extract("commentary", r"to (.+?),", 1)
df_batsman = df.withColumn("Batsman", batsman_expr)

df_batsman.select("commentary", "Batsman").show(truncate=False)


+--------------------------------------+---------------+
|commentary                            |Batsman        |
+--------------------------------------+---------------+
|Jasprit Bumrah to Virat Kohli, 1 run  |Virat Kohli    |
|Rashid Khan to MS Dhoni, SIX runs     |MS Dhoni       |
|Trent Boult to Ruturaj Gaikwad, no run|Ruturaj Gaikwad|
|Sunil Narine to Rohit Sharma, FOUR    |Rohit Sharma   |
|Yuzvendra Chahal to KL Rahul, OUT     |KL Rahul       |
+--------------------------------------+---------------+



In [0]:
# Runs = everything after the first ", " in the commentary, kept as raw text
# (e.g. "Sunil Narine to Rohit Sharma, FOUR" -> "FOUR").
runs_expr = regexp_extract("commentary", r", (.+)", 1)
df_runs = df.withColumn("Runs", runs_expr)

df_runs.select("commentary", "Runs").show(truncate=False)


+--------------------------------------+--------+
|commentary                            |Runs    |
+--------------------------------------+--------+
|Jasprit Bumrah to Virat Kohli, 1 run  |1 run   |
|Rashid Khan to MS Dhoni, SIX runs     |SIX runs|
|Trent Boult to Ruturaj Gaikwad, no run|no run  |
|Sunil Narine to Rohit Sharma, FOUR    |FOUR    |
|Yuzvendra Chahal to KL Rahul, OUT     |OUT     |
+--------------------------------------+--------+



In [0]:
from pyspark.sql.functions import when, col

# "Runs" is the raw outcome text after the first ", " of the commentary.
runs_text_expr = regexp_extract("commentary", r", (.+)", 1)

# "Runs_Scored" maps that text to a number. Branches are tried top to bottom
# and the first rlike() hit wins; text matching none of them (e.g. "OUT")
# becomes null.
runs_scored_expr = (
    when(col("Runs").rlike("SIX"), 6)
    .when(col("Runs").rlike("FOUR"), 4)
    .when(col("Runs").rlike("3 runs"), 3)
    .when(col("Runs").rlike("2 runs"), 2)
    .when(col("Runs").rlike("1 run"), 1)
    .when(col("Runs").rlike("no run"), 0)
    .otherwise(None)
)

df_runs_scored = (
    df
    .withColumn("Runs", runs_text_expr)
    .withColumn("Runs_Scored", runs_scored_expr)
)

df_runs_scored.select("commentary", "Runs", "Runs_Scored").show(truncate=False)


+--------------------------------------+--------+-----------+
|commentary                            |Runs    |Runs_Scored|
+--------------------------------------+--------+-----------+
|Jasprit Bumrah to Virat Kohli, 1 run  |1 run   |1          |
|Rashid Khan to MS Dhoni, SIX runs     |SIX runs|6          |
|Trent Boult to Ruturaj Gaikwad, no run|no run  |0          |
|Sunil Narine to Rohit Sharma, FOUR    |FOUR    |4          |
|Yuzvendra Chahal to KL Rahul, OUT     |OUT     |null       |
+--------------------------------------+--------+-----------+



In [0]:
from pyspark.sql.functions import regexp_extract, when, col

# Same derivation as df_runs_scored, but only the numeric column is displayed.
# Each rule is (regex searched for in "Runs", runs credited); order matters
# because the first matching rule wins.
score_rules = [
    ("SIX", 6),
    ("FOUR", 4),
    ("3 runs", 3),
    ("2 runs", 2),
    ("1 run", 1),
    ("no run", 0),
]

# Fold the rules into a single when(...).when(...) chain.
scored_expr = None
for outcome_pattern, runs_value in score_rules:
    is_outcome = col("Runs").rlike(outcome_pattern)
    if scored_expr is None:
        scored_expr = when(is_outcome, runs_value)
    else:
        scored_expr = scored_expr.when(is_outcome, runs_value)
# Anything not covered by a rule (e.g. "OUT") is left as null.
scored_expr = scored_expr.otherwise(None)

df_runscored_only = (
    df
    .withColumn("Runs", regexp_extract("commentary", r", (.+)", 1))
    .withColumn("Runs_Scored", scored_expr)
)

df_runscored_only.select("commentary", "Runs_Scored").show(truncate=False)


+--------------------------------------+-----------+
|commentary                            |Runs_Scored|
+--------------------------------------+-----------+
|Jasprit Bumrah to Virat Kohli, 1 run  |1          |
|Rashid Khan to MS Dhoni, SIX runs     |6          |
|Trent Boult to Ruturaj Gaikwad, no run|0          |
|Sunil Narine to Rohit Sharma, FOUR    |4          |
|Yuzvendra Chahal to KL Rahul, OUT     |null       |
+--------------------------------------+-----------+



In [0]:
from pyspark.sql.functions import regexp_extract, when, col

In [0]:
# Rebinds ipl_matches_df by reading the same header-row CSV that the first
# cell of this notebook loads.
ipl_matches_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("dbfs:/FileStore/tables/ipl_2021_matches.csv")
)


In [0]:
# Split each commentary line of the form "<bowler> to <batsman>, <outcome...>"
# into three text columns.
#
# NOTE(review): `df` is not defined in any cell visible in this notebook --
# presumably a DataFrame with a string `commentary` column; confirm.
#
# Both name patterns split on the FIRST " to " of the line. The repetition is
# lazy (`*?`) on purpose: a greedy `*` backtracks to the LAST " to", so a line
# such as "A B to C D, 1 run, driven to cover" would give the bowler
# "A B to C D, 1 run, driven". The batsman pattern is anchored on that same
# leading name, so a "to " occurring inside a name cannot be picked up instead.
bowler_pattern = r"^([^\s]+(?:\s[^\s]+)*?) to"
batsman_pattern = r"^[^\s]+(?:\s[^\s]+)*? to ([^,]+)"
# Outcome text: everything after the first ", ".
runs_pattern = r", (.+)"

df_extracted = (
    df
    .withColumn("Bowler", regexp_extract("commentary", bowler_pattern, 1))
    .withColumn("Batsman", regexp_extract("commentary", batsman_pattern, 1))
    .withColumn("Runs", regexp_extract("commentary", runs_pattern, 1))
)


In [0]:
# Convert the raw "Runs" text into a numeric "Runs_Scored" column. Branches
# are evaluated in order and the first rlike() match wins; text that matches
# none of them (e.g. "OUT") is left as null.
runs_text = col("Runs")
runs_scored_column = (
    when(runs_text.rlike("SIX"), 6)
    .when(runs_text.rlike("FOUR"), 4)
    .when(runs_text.rlike("3 runs"), 3)
    .when(runs_text.rlike("2 runs"), 2)
    .when(runs_text.rlike("1 run"), 1)
    .when(runs_text.rlike("no run"), 0)
    .otherwise(None)
)
df_final = df_extracted.withColumn("Runs_Scored", runs_scored_column)


In [0]:
# Show the original commentary next to every derived column (up to 20 rows,
# untruncated) so the extraction can be checked by eye.
preview_columns = ["commentary", "Bowler", "Batsman", "Runs", "Runs_Scored"]
df_final.select(*preview_columns).show(n=20, truncate=False)


+--------------------------------------+----------------+---------------+--------+-----------+
|commentary                            |Bowler          |Batsman        |Runs    |Runs_Scored|
+--------------------------------------+----------------+---------------+--------+-----------+
|Jasprit Bumrah to Virat Kohli, 1 run  |Jasprit Bumrah  |Virat Kohli    |1 run   |1          |
|Rashid Khan to MS Dhoni, SIX runs     |Rashid Khan     |MS Dhoni       |SIX runs|6          |
|Trent Boult to Ruturaj Gaikwad, no run|Trent Boult     |Ruturaj Gaikwad|no run  |0          |
|Sunil Narine to Rohit Sharma, FOUR    |Sunil Narine    |Rohit Sharma   |FOUR    |4          |
|Yuzvendra Chahal to KL Rahul, OUT     |Yuzvendra Chahal|KL Rahul       |OUT     |null       |
+--------------------------------------+----------------+---------------+--------+-----------+

