In [0]:
# Pipeline overview: load the deliveries CSV, keep the ball-level columns,
# count balls per batting team, persist the counts as a Delta table, and
# finally read that table back as a sanity check.
from pyspark.sql.functions import count

# Input/output locations -- make sure these are correct for your environment.
csv_source_path = "/FileStore/tables/deliveries.csv"
delta_target_path = "/FileStore/tables/aggregated_deliveries_delta"

# Columns kept by the projection in step 3.
ball_level_columns = [
    "match_id",
    "inning",
    "batting_team",
    "bowling_team",
    "over",
    "ball",
    "batter",
    "bowler",
    "non_striker",
]

# Step 1: load the CSV, treating its first line as the header row.
csv_reader = spark.read.option("header", True)
deliveries_df = csv_reader.csv(csv_source_path)

# Step 2: preview the first few raw rows.
print("Initial Data:")
deliveries_df.show(5)

# Step 3: narrow the DataFrame to the ball-level columns (transformation 1).
selected_df = deliveries_df.select(*ball_level_columns)

print("Selected Columns Data:")
selected_df.show(5)

# Step 4: aggregate per batting team (transformation 2). The groupBy
# introduces a shuffle, which is what creates a new stage in the DAG.
# count() tallies the non-null `ball` values within each group.
balls_per_team = count("ball").alias("total_balls")
aggregated_df = selected_df.groupBy("batting_team").agg(balls_per_team)

print("Aggregated Data:")
aggregated_df.show()

# Step 5: persist the aggregate as a Delta table (action). "overwrite"
# replaces whatever data already exists at the target path.
delta_writer = aggregated_df.write.format("delta").mode("overwrite")
delta_writer.save(delta_target_path)

print("Data written to Delta table successfully!")

# Optional step 6: read the Delta table back to confirm the write.
delta_reader = spark.read.format("delta")
delta_df = delta_reader.load(delta_target_path)
print("Delta Table Data:")
delta_df.show()


Initial Data:
+--------+------+--------------------+--------------------+----+----+-----------+-------+-----------+------------+----------+----------+-----------+---------+----------------+--------------+-------+
|match_id|inning|        batting_team|        bowling_team|over|ball|     batter| bowler|non_striker|batsman_runs|extra_runs|total_runs|extras_type|is_wicket|player_dismissed|dismissal_kind|fielder|
+--------+------+--------------------+--------------------+----+----+-----------+-------+-----------+------------+----------+----------+-----------+---------+----------------+--------------+-------+
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   0|   1| SC Ganguly|P Kumar|BB McCullum|           0|         1|         1|    legbyes|        0|              NA|            NA|     NA|
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   0|   2|BB McCullum|P Kumar| SC Ganguly|           0|         0|         0|       null|        0|              NA|            NA|