In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType, DoubleType, ArrayType

In [2]:
import os
# Show the SPARK_HOME value visible to this kernel (prints None when it is unset).
print("SPARK_HOME:", os.getenv("SPARK_HOME"))

SPARK_HOME: /opt/bitnami/spark


In [3]:
import os
# Read the SPARK_JARS environment variable, falling back to an empty string
# when it is not set.
jars = os.getenv("SPARK_JARS", "")

In [4]:
from pyspark.sql import SparkSession
import os

# Get or create the Spark session for this notebook: local master "local[*]",
# with the `jars` value read earlier passed through as the spark.jars setting.
spark = (
    SparkSession.builder
    .config("spark.jars", jars)
    .appName("Check Spark Version")
    .master("local[*]")
    .getOrCreate()
)

# Confirm the session came up by reporting the Spark version it runs on.
print("✅ Spark version:", spark.version)


25/06/14 18:25:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


✅ Spark version: 3.5.0


In [5]:
# Read data from the Kafka topic 'btc-price' as a streaming DataFrame
df_price = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "broker:9092")
    .option("subscribe", "btc-price")
    .load()
)

In [6]:
# Kafka returns key and value as binary, so cast the value to a string and
# expose it as the single column `json_value`.
df_string_price = df_price.selectExpr(
    "CAST(value AS STRING) as json_value"
)

In [7]:
# The payload is JSON, so define a schema used to parse it.
# All three fields (symbol, price, timestamp) are read as strings.
schema_price = (
    StructType()
    .add("symbol", StringType())
    .add("price", StringType())
    .add("timestamp", StringType())
)

In [8]:
# Parse the JSON string with schema_price into a struct column named "data",
# then flatten that struct so each schema field becomes a top-level column.
parsed_price_df = (
    df_string_price
    .select(from_json(col("json_value"), schema_price).alias("data"))
    .select("data.*")
)

In [None]:
# Write the parsed price records to the console sink
query_price = (
    parsed_price_df.writeStream
    .outputMode("update")
    .format("console")
    .start()
)

# Block this cell until the streaming query terminates. The `finally` clause
# stops the query when the wait is interrupted (e.g. a kernel interrupt) or
# raises, so the query is not left running in the background — otherwise
# re-running this cell would start a second, duplicate query.
try:
    query_price.awaitTermination()
finally:
    query_price.stop()

In [6]:
# Read data from the Kafka topic 'btc-price-moving' as a streaming DataFrame
df_moving = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "broker:9092")
    .option("subscribe", "btc-price-moving")
    .load()
)

In [7]:
# Kafka returns key and value as binary, so cast the value to a string and
# expose it as the single column `json_value`.
df_string_moving = df_moving.selectExpr(
    "CAST(value AS STRING) as json_value"
)

In [9]:
# Schema of a single sliding-window entry: the window as a string, plus the
# average and standard deviation of the price as doubles.
schema_window = (
    StructType()
    .add("window", StringType())
    .add("avg_price", DoubleType())
    .add("std_price", DoubleType())
)

# Schema of one record: timestamp and symbol as strings, plus an array of the
# sliding-window entries defined above.
schema_moving = (
    StructType()
    .add("timestamp", StringType())
    .add("symbol", StringType())
    .add("windows", ArrayType(schema_window))
)

In [10]:
# Parse the JSON string with schema_moving into a struct column named "data",
# then flatten that struct so each schema field becomes a top-level column.
parsed_moving_df = (
    df_string_moving
    .select(from_json(col("json_value"), schema_moving).alias("data"))
    .select("data.*")
)

In [None]:
# Write the parsed moving-window records to the console sink
query_moving = (
    parsed_moving_df.writeStream
    .outputMode("update")
    .format("console")
    .start()
)

# Block this cell until the streaming query terminates. The `finally` clause
# stops the query when the wait is interrupted (e.g. a kernel interrupt) or
# raises, so the query is not left running in the background — otherwise
# re-running this cell would start a second, duplicate query.
try:
    query_moving.awaitTermination()
finally:
    query_moving.stop()