In [0]:
import re

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)


In [0]:
# Reuse the active Spark session if one already exists (e.g. on a managed
# notebook cluster); otherwise build a new one with default settings.
spark = SparkSession.builder.getOrCreate()

In [0]:
# Explicit schema for the raw ball-by-ball commentary rows.
# Every column is nullable, so the parsing helpers must tolerate missing text.
commentary_schema = (
    StructType()
    .add("match_id", StringType(), True)
    .add("ball", DoubleType(), True)
    .add("commentary", StringType(), True)
)

In [0]:
# Sample ball-by-ball commentary used to exercise the parsers.
# Each tuple is (match_id, ball, commentary), in the same column order as
# commentary_schema. The last row deliberately has no "<batsman> hits" prefix,
# so its batsman cannot be extracted and comes out as null.
commentary_data = [
    ("M1", 1.1, "Rohit Sharma hits a four off Bumrah"),
    ("M1", 1.2, "Dhoni hits a six off Chahal"),
    ("M1", 1.3, "Gill hits a dot ball off Rashid Khan"),
    ("M2", 2.1, "Kohli hits a single off Jadeja"),
    ("M2", 2.2, "Pant hits a three off Ashwin"),
    ("M2", 2.3, "No run off Axar Patel")
]

In [0]:
# Materialise the sample rows as a DataFrame with the explicit schema
# (no type inference), then expose it to Spark SQL under the name "commentary".
commentary_df = spark.createDataFrame(
    data=commentary_data,
    schema=commentary_schema,
)
commentary_df.createOrReplaceTempView("commentary")


In [0]:
# Ordered (pattern, runs) pairs. The first pattern found in the commentary wins,
# so this order is the precedence order. Word boundaries (\b) stop a keyword
# from matching inside a longer word such as a player's name ("one" in "Stone",
# "two" in "Twose"); simple plurals ("sixes", "dots") are still accepted.
_RUN_PATTERNS = (
    (re.compile(r"\bsix(?:es)?\b"), 6),
    (re.compile(r"\bfours?\b"), 4),
    (re.compile(r"\bthrees?\b"), 3),
    (re.compile(r"\btwos?\b"), 2),
    (re.compile(r"\b(?:singles?|one)\b"), 1),
    (re.compile(r"\b(?:no runs?|dots?)\b"), 0),
)


def extract_runs(text):
    """Infer the runs scored on a ball from its commentary text.

    The text is lower-cased and scanned for run keywords as whole words, in
    this precedence order: six, four, three, two, single/one, no run/dot.

    Args:
        text: Commentary string for one ball. May be None, because the
            commentary column is nullable.

    Returns:
        The number of runs (6, 4, 3, 2, 1 or 0), or None when the text is
        None/empty or contains no recognised keyword.
    """
    # A null commentary reaches a Python UDF as None; return null instead of
    # raising and failing the whole Spark job.
    if not text:
        return None

    lowered = text.lower()
    for pattern, runs in _RUN_PATTERNS:
        if pattern.search(lowered):
            return runs
    return None

def extract_batsman(text):
    """Extract the batsman's name from commentary shaped "<name> hits ...".

    The name must start at the beginning of the text and may contain letters,
    whitespace, apostrophes and hyphens (e.g. "Rohit Sharma", "O'Brien").

    Args:
        text: Commentary string for one ball. May be None, because the
            commentary column is nullable.

    Returns:
        The batsman's name with surrounding whitespace removed, or None when
        the text is None/empty, does not start with "<name> hits", or the
        captured name is blank.
    """
    # A null commentary reaches a Python UDF as None; re.match would raise
    # TypeError on it and fail the whole Spark job.
    if not text:
        return None

    # Non-greedy so the name stops at the first " hits".
    match = re.match(r"([A-Za-z\s'\-]+?) hits", text)
    if match is None:
        return None
    # Normalise a whitespace-only capture to None rather than "".
    return match.group(1).strip() or None

def extract_bowler(text):
    """Extract the bowler's name from commentary containing "off <name>".

    Takes the first standalone word "off" and captures the run of letters,
    whitespace, apostrophes and hyphens that follows it. Anything after the
    name that uses only those characters (e.g. "off Chahal over long on") is
    captured too; this helper does not try to trim trailing description.

    Args:
        text: Commentary string for one ball. May be None, because the
            commentary column is nullable.

    Returns:
        The captured name with surrounding whitespace removed, or None when
        the text is None/empty, has no "off <name>" part, or the captured
        name is blank.
    """
    # A null commentary reaches a Python UDF as None; re.search would raise
    # TypeError on it and fail the whole Spark job.
    if not text:
        return None

    # \b keeps "off" from matching inside another word (e.g. "Geoff Allott"),
    # which would otherwise capture the rest of the sentence as the bowler.
    match = re.search(r"\boff ([A-Za-z\s'\-]+)", text)
    if match is None:
        return None
    # Normalise a whitespace-only capture to None rather than "".
    return match.group(1).strip() or None


In [0]:
# Wrap the plain-Python parsers as Spark UDFs. The declared return type must
# match what each parser returns: an int (or None) for runs, a str (or None)
# for the two name extractors.
extract_runs_udf = udf(extract_runs, returnType=IntegerType())
extract_batsman_udf = udf(extract_batsman, returnType=StringType())
extract_bowler_udf = udf(extract_bowler, returnType=StringType())

In [0]:
# Derive the structured columns from the free-text commentary. The source
# DataFrame is left untouched; each withColumn returns a new DataFrame.
final_df = (
    commentary_df
    .withColumn("runs", extract_runs_udf("commentary"))
    .withColumn("batsman", extract_batsman_udf("commentary"))
    .withColumn("bowler", extract_bowler_udf("commentary"))
)


In [0]:
# truncate=False prints full cell values so the whole commentary string is
# visible next to the parsed runs, batsman and bowler columns.
final_df.show(truncate=False)

+--------+----+------------------------------------+----+------------+-----------+
|match_id|ball|commentary                          |runs|batsman     |bowler     |
+--------+----+------------------------------------+----+------------+-----------+
|M1      |1.1 |Rohit Sharma hits a four off Bumrah |4   |Rohit Sharma|Bumrah     |
|M1      |1.2 |Dhoni hits a six off Chahal         |6   |Dhoni       |Chahal     |
|M1      |1.3 |Gill hits a dot ball off Rashid Khan|0   |Gill        |Rashid Khan|
|M2      |2.1 |Kohli hits a single off Jadeja      |1   |Kohli       |Jadeja     |
|M2      |2.2 |Pant hits a three off Ashwin        |3   |Pant        |Ashwin     |
|M2      |2.3 |No run off Axar Patel               |0   |null        |Axar Patel |
+--------+----+------------------------------------+----+------------+-----------+

