In [0]:
from pyspark.sql import Row
from pyspark.sql.functions import col

# Sample orders: one value tuple per order, positions matching order_fields.
order_fields = ("order_id", "product", "qty", "price")
order_values = [
    (1, "iPhone", 8, 14000),
    (2, "iPad", 6, 50000),
    (3, "MacBook", 2, 120000),
    (4, "iPhone", 1, 70000),
]

# Turn each tuple into a keyword-built Row; dict(zip(...)) keeps the field
# order, so the resulting columns appear in order_fields order.
raw_data = [Row(**dict(zip(order_fields, values))) for values in order_values]

# Column names and types are inferred from the Row objects.
raw_df = spark.createDataFrame(raw_data)
raw_df.show()


+--------+-------+---+------+
|order_id|product|qty| price|
+--------+-------+---+------+
|       1| iPhone|  8| 14000|
|       2|   iPad|  6| 50000|
|       3|MacBook|  2|120000|
|       4| iPhone|  1| 70000|
+--------+-------+---+------+



In [0]:
# Build silver_df from raw_df:
#   * keep only rows where both qty and price are strictly positive
#     (rows where either comparison is null are dropped as well);
#   * add total_amount = qty * price for each remaining row.
silver_df = (
    raw_df
    .where((col("qty") > 0) & (col("price") > 0))
    .withColumn("total_amount", col("qty") * col("price"))
)

silver_df.show()


+--------+-------+---+------+------------+
|order_id|product|qty| price|total_amount|
+--------+-------+---+------+------------+
|       1| iPhone|  8| 14000|      112000|
|       2|   iPad|  6| 50000|      300000|
|       3|MacBook|  2|120000|      240000|
|       4| iPhone|  1| 70000|       70000|
+--------+-------+---+------+------------+



In [0]:
# Import the functions module under an alias instead of pulling `sum` and
# `count` into the notebook namespace: a bare `sum` import would replace
# Python's builtin `sum` for every cell that runs afterwards.
from pyspark.sql import functions as F

# Build gold_df: one row per product with
#   * total_revenue - sum of total_amount over that product's rows
#   * total_orders  - number of rows for that product
gold_df = (
    silver_df
    .groupBy("product")
    .agg(
        F.sum("total_amount").alias("total_revenue"),
        F.count("*").alias("total_orders"),
    )
)

gold_df.show()


+-------+-------------+------------+
|product|total_revenue|total_orders|
+-------+-------------+------------+
| iPhone|       182000|           2|
|   iPad|       300000|           1|
|MacBook|       240000|           1|
+-------+-------------+------------+



In [0]:
# Persist gold_df as the table "gold_product_sales".
# "overwrite" mode replaces any existing data for that table, so this cell
# can be re-run without failing on an already-existing table.
(
    gold_df.write
    .mode("overwrite")
    .saveAsTable("gold_product_sales")
)


In [0]:
# Load the "gold_product_sales" table by name and print its rows.
spark.read.table("gold_product_sales").show()


+-------+-------------+------------+
|product|total_revenue|total_orders|
+-------+-------------+------------+
| iPhone|       182000|           2|
|   iPad|       300000|           1|
|MacBook|       240000|           1|
+-------+-------------+------------+

