In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
data = [
    (1, 1, 1, 1),
    (1, 1, 2, 0),
    (1, 1, 3, 4),
    (1, 1, 4, 2),
    (1, 2, 1, 6),
    (1, 2, 2, 1)
]


In [0]:
columns = ["match_id", "inning", "ball_no", "runs"]
df = spark.createDataFrame(data, columns)


In [0]:
windowSpec = Window.partitionBy("match_id", "inning").orderBy("ball_no")
df_with_cumulative_runs = df.withColumn(
    "cumulative_runs", F.sum("runs").over(windowSpec)
)


In [0]:
df_with_cumulative_runs.select("match_id", "inning", "ball_no", "runs", "cumulative_runs").show(truncate=False)


+--------+------+-------+----+---------------+
|match_id|inning|ball_no|runs|cumulative_runs|
+--------+------+-------+----+---------------+
|1       |1     |1      |1   |1              |
|1       |1     |2      |0   |1              |
|1       |1     |3      |4   |5              |
|1       |1     |4      |2   |7              |
|1       |2     |1      |6   |6              |
|1       |2     |2      |1   |7              |
+--------+------+-------+----+---------------+



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.types import StructType, StructField, StringType

data = [
    ("Bumrah to Dhawan, FOUR! Cracking shot through covers", "Bumrah", "Dhawan", "FOUR"),
    ("Chahal to Raina, 1 run, nudged to midwicket", "Chahal", "Raina", "1 run"),
    ("Narine to Kohli, no run, defended solidly", "Narine", "Kohli", "no run"),
    ("Rabada to Rohit, SIX! Smashed over long-on", "Rabada", "Rohit", "SIX")
]

schema = StructType([
    StructField("commentary", StringType(), True),
    StructField("bowler", StringType(), True),
    StructField("batsman", StringType(), True),
    StructField("runs", StringType(), True)
])

commentary_df = spark.createDataFrame(data, schema)

from pyspark.sql.functions import monotonically_increasing_id

commentary_df = commentary_df.withColumn("match_id", lit(1)) \
                             .withColumn("inning", lit(1)) \
                             .withColumn("ball_no", monotonically_increasing_id())

commentary_df.show(truncate=False)

+----------------------------------------------------+------+-------+------+--------+------+-----------+
|commentary                                          |bowler|batsman|runs  |match_id|inning|ball_no    |
+----------------------------------------------------+------+-------+------+--------+------+-----------+
|Bumrah to Dhawan, FOUR! Cracking shot through covers|Bumrah|Dhawan |FOUR  |1       |1     |8589934592 |
|Chahal to Raina, 1 run, nudged to midwicket         |Chahal|Raina  |1 run |1       |1     |25769803776|
|Narine to Kohli, no run, defended solidly           |Narine|Kohli  |no run|1       |1     |42949672960|
|Rabada to Rohit, SIX! Smashed over long-on          |Rabada|Rohit  |SIX   |1       |1     |60129542144|
+----------------------------------------------------+------+-------+------+--------+------+-----------+



In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
window_spec = Window.partitionBy("match_id", "inning").orderBy("ball_no")
df_with_cumulative_runs = df_extracted.withColumn(
    "cumulative_runs", 
    F.sum("runs").over(window_spec)
)
df_with_cumulative_runs.select("match_id", "inning", "ball_no", "batsman", "runs", "cumulative_runs").show(truncate=False)


+--------+------+-----------+-------+----+---------------+
|match_id|inning|ball_no    |batsman|runs|cumulative_runs|
+--------+------+-----------+-------+----+---------------+
|1       |1     |8589934592 |Dhawan |4   |4              |
|1       |1     |25769803776|Raina  |1   |5              |
|1       |1     |42949672960|Kohli  |0   |5              |
|1       |1     |60129542144|Rohit  |6   |11             |
+--------+------+-----------+-------+----+---------------+

