In [0]:
from pyspark.sql.functions import sum as _sum, lit
deliveries_df = spark.read.option("header", True).csv("/FileStore/tables/deliveries.csv")
commentary_df = deliveries_df.select(
    "match_id",
    "inning",
    "over",
    "ball",
    "batter",
    "bowler",
    "total_runs"
)

print("Commentary Data:")
commentary_df.show(5)

# Step 3: Prepare over data (over-level aggregation)
over_df = deliveries_df.groupBy(
    "match_id",
    "inning",
    "over"
).agg(
    _sum("total_runs").alias("total_runs")
).withColumn("ball", lit(None)) \
 .withColumn("batter", lit(None)) \
 .withColumn("bowler", lit(None))

print("Over Data:")
over_df.show(5)
union_df = commentary_df.union(over_df)

print("Union Data:")
union_df.show(5)
union_by_name_df = commentary_df.unionByName(over_df)

print("Union By Name Data:")
union_by_name_df.show(5)



Commentary Data:
+--------+------+----+----+-----------+-------+----------+
|match_id|inning|over|ball|     batter| bowler|total_runs|
+--------+------+----+----+-----------+-------+----------+
|  335982|     1|   0|   1| SC Ganguly|P Kumar|         1|
|  335982|     1|   0|   2|BB McCullum|P Kumar|         0|
|  335982|     1|   0|   3|BB McCullum|P Kumar|         1|
|  335982|     1|   0|   4|BB McCullum|P Kumar|         0|
|  335982|     1|   0|   5|BB McCullum|P Kumar|         0|
+--------+------+----+----+-----------+-------+----------+
only showing top 5 rows

Over Data:
+--------+------+----+----------+----+------+------+
|match_id|inning|over|total_runs|ball|batter|bowler|
+--------+------+----+----------+----+------+------+
|  335992|     2|  10|       6.0|null|  null|  null|
|  336001|     1|  16|      10.0|null|  null|  null|
|  336004|     2|  18|       1.0|null|  null|  null|
|  336007|     2|   0|       3.0|null|  null|  null|
|  336008|     1|   3|       5.0|null|  null|

In [0]:
deliveries_df = spark.read.option("header", True).csv("/FileStore/tables/deliveries.csv")
deliveries_df.printSchema()


root
 |-- match_id: string (nullable = true)
 |-- inning: string (nullable = true)
 |-- batting_team: string (nullable = true)
 |-- bowling_team: string (nullable = true)
 |-- over: string (nullable = true)
 |-- ball: string (nullable = true)
 |-- batter: string (nullable = true)
 |-- bowler: string (nullable = true)
 |-- non_striker: string (nullable = true)
 |-- batsman_runs: string (nullable = true)
 |-- extra_runs: string (nullable = true)
 |-- total_runs: string (nullable = true)
 |-- extras_type: string (nullable = true)
 |-- is_wicket: string (nullable = true)
 |-- player_dismissed: string (nullable = true)
 |-- dismissal_kind: string (nullable = true)
 |-- fielder: string (nullable = true)

