In [0]:
from pyspark.sql.types import StructType, StringType, IntegerType

# Column layout of one order record: every field is a string except the
# integer quantity.
schema = (
    StructType()
    .add("order_id", StringType())
    .add("customer_id", StringType())
    .add("product", StringType())
    .add("quantity", IntegerType())
    .add("region", StringType())
)

# Three sample orders used to seed the CSV directory.
initial_data = [
    ("1", "C101", "Laptop", 2, "South"),
    ("2", "C102", "Chair", 6, "North"),
    ("3", "C103", "Mobile", 1, "East"),
]

df = spark.createDataFrame(data=initial_data, schema=schema)

# Replace whatever is already at the target path with CSV files that carry a
# header row.
df.write.csv("dbfs:/tmp/stream/orders", mode="overwrite", header=True)

In [0]:
# Open the orders CSV directory as a streaming source, supplying the schema
# explicitly and treating the first line of each file as a header.
orders_stream = (
    spark.readStream
    .format("csv")
    .schema(schema)
    .options(header=True)
    .load("dbfs:/tmp/stream/orders")
)

In [0]:
from pyspark.sql.functions import when

# Add a boolean flag that is True for orders of more than 5 units. The
# explicit otherwise(False) also maps a null quantity to False instead of
# leaving the flag null.
transformed_orders = orders_stream.withColumn(
    colName="bulk_order",
    col=when(orders_stream.quantity > 5, True).otherwise(False),
)

In [0]:
from pyspark.sql.functions import col

# Streaming query names must be unique among active queries. Stop any earlier
# run that still holds "rate_table" so this cell can be re-executed without
# start() failing on the name collision.
for active_query in spark.streams.active:
    if active_query.name == "rate_table":
        active_query.stop()

# Built-in "rate" test source, configured to emit one row per second.
rate_df = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 1)
    .load()
)

# Flag each generated row whose counter value is even.
transformed_df = rate_df.withColumn("is_even", (col("value") % 2 == 0))

# Write to the in-memory sink. The query name doubles as the name of the
# temp table that later cells read with spark.sql.
query = (
    transformed_df.writeStream
    .format("memory")
    .queryName("rate_table")
    .outputMode("append")
    .start()
)

In [0]:
# Print the rows currently in the "rate_table" temp table; show() displays
# at most the first 20.
spark.table("rate_table").show()

+--------------------+-----+-------+
|           timestamp|value|is_even|
+--------------------+-----+-------+
|2025-08-08 11:07:...|    0|   true|
|2025-08-08 11:07:...|    1|  false|
|2025-08-08 11:07:...|    2|   true|
|2025-08-08 11:07:...|    3|  false|
|2025-08-08 11:07:...|    4|   true|
|2025-08-08 11:07:...|    5|  false|
|2025-08-08 11:07:...|    6|   true|
|2025-08-08 11:07:...|    7|  false|
|2025-08-08 11:07:...|    8|   true|
|2025-08-08 11:07:...|    9|  false|
|2025-08-08 11:07:...|   10|   true|
|2025-08-08 11:07:...|   11|  false|
|2025-08-08 11:07:...|   12|   true|
|2025-08-08 11:07:...|   13|  false|
|2025-08-08 11:07:...|   14|   true|
|2025-08-08 11:07:...|   15|  false|
|2025-08-08 11:07:...|   16|   true|
|2025-08-08 11:07:...|   17|  false|
|2025-08-08 11:07:...|   18|   true|
|2025-08-08 11:07:...|   19|  false|
+--------------------+-----+-------+
only showing top 20 rows


In [0]:
from pyspark.sql.functions import col

# Streaming query names must be unique among active queries. An earlier cell
# starts a query under the same "rate_table" name, so stop whichever query
# still holds it; otherwise start() below fails when the notebook is run
# top to bottom.
for active_query in spark.streams.active:
    if active_query.name == "rate_table":
        active_query.stop()

# Built-in "rate" test source, configured to emit one row per second.
rate_df = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 1)
    .load()
)

# Flag each generated row whose counter value is odd.
transformed_df = rate_df.withColumn("is_odd", (col("value") % 2 != 0))

# Write to the in-memory sink. The query name doubles as the name of the
# temp table that later cells read with spark.sql.
query = (
    transformed_df.writeStream
    .format("memory")
    .queryName("rate_table")
    .outputMode("append")
    .start()
)

In [0]:
# Print the rows currently in the "rate_table" temp table; show() displays
# at most the first 20.
spark.table("rate_table").show()

+--------------------+-----+------+
|           timestamp|value|is_odd|
+--------------------+-----+------+
|2025-08-08 11:16:...|    0| false|
|2025-08-08 11:16:...|    1|  true|
|2025-08-08 11:16:...|    2| false|
|2025-08-08 11:16:...|    3|  true|
|2025-08-08 11:16:...|    4| false|
|2025-08-08 11:16:...|    5|  true|
|2025-08-08 11:16:...|    6| false|
|2025-08-08 11:16:...|    7|  true|
|2025-08-08 11:16:...|    8| false|
|2025-08-08 11:16:...|    9|  true|
|2025-08-08 11:16:...|   10| false|
|2025-08-08 11:16:...|   11|  true|
|2025-08-08 11:16:...|   12| false|
|2025-08-08 11:16:...|   13|  true|
|2025-08-08 11:16:...|   14| false|
|2025-08-08 11:16:...|   15|  true|
|2025-08-08 11:16:...|   16| false|
|2025-08-08 11:16:...|   17|  true|
|2025-08-08 11:16:...|   18| false|
|2025-08-08 11:17:...|   19|  true|
+--------------------+-----+------+
only showing top 20 rows


In [0]:
from pyspark.sql.functions import col

# Streaming query names must be unique among active queries. Earlier cells
# start queries under the same "rate_table" name, so stop whichever query
# still holds it; otherwise start() below fails when the notebook is run
# top to bottom.
for active_query in spark.streams.active:
    if active_query.name == "rate_table":
        active_query.stop()

# Built-in "rate" test source, configured to emit one row per second.
rate_df = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 1)
    .load()
)

# Flag each generated row whose counter value is divisible by 5.
transformed_df = rate_df.withColumn("is_multiple_of_5", (col("value") % 5 == 0))

# Write to the in-memory sink. The query name doubles as the name of the
# temp table that later cells read with spark.sql.
query = (
    transformed_df.writeStream
    .format("memory")
    .queryName("rate_table")
    .outputMode("append")
    .start()
)

In [0]:
# Print the rows currently in the "rate_table" temp table; show() displays
# at most the first 20.
spark.table("rate_table").show()

+--------------------+-----+----------------+
|           timestamp|value|is_multiple_of_5|
+--------------------+-----+----------------+
|2025-08-08 11:22:...|    0|            true|
|2025-08-08 11:22:...|    1|           false|
|2025-08-08 11:22:...|    2|           false|
|2025-08-08 11:22:...|    3|           false|
|2025-08-08 11:22:...|    4|           false|
|2025-08-08 11:22:...|    5|            true|
|2025-08-08 11:22:...|    6|           false|
|2025-08-08 11:22:...|    7|           false|
|2025-08-08 11:22:...|    8|           false|
|2025-08-08 11:22:...|    9|           false|
|2025-08-08 11:22:...|   10|            true|
|2025-08-08 11:22:...|   11|           false|
|2025-08-08 11:22:...|   12|           false|
|2025-08-08 11:22:...|   13|           false|
+--------------------+-----+----------------+

