In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import Normalizer, StandardScaler
import random

import time

kafka_topic_name = "Topic 1"
kafka_bootstrap_servers = 'localhost:9092'

spark = SparkSession \
        .builder \
        .appName("Structured Streaming ") \
        .master("local[*]") \
        .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1") \
        .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

In [3]:
#  Construct a streaming DataFrame that reads from topic
# spark.readStream is used to get a data 
flower_df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
        .option("subscribe", kafka_topic_name) \
        .option("startingOffsets", "earliest") \
        .load()
# flower_df.printSchema()
# kafka dataframe always looks like the printed result


# value is the field we need to start with and it should be a string and not binary
flower_df1 = flower_df.selectExpr("CAST(value AS STRING)", "timestamp") # casts the column named "value" to a string data type.
# flower_df1.printSchema()

#we cannot apply dataframe transfrom on the string we generate from above unless we provide a schema. so we create a schema
flower_schema_string = "order_id INT,sepal_length DOUBLE,sepal_length DOUBLE,sepal_length DOUBLE,sepal_length DOUBLE,species STRING"


# select method selects columns from the DataFrame and allows for transformations to be applied.
# It's used to parse the CSV data into structured data based on the specified schema.
flower_df2 = flower_df1 \
        .select(from_csv(col("value"), flower_schema_string) \
                .alias("flower"), "timestamp")
# alias renames the resulting parsed CSV data as "flower".
flower_df2.printSchema()
# print(flower_df2)

flower_df3 = flower_df2.select("flower.*", "timestamp")
flower_df3.printSchema()

#  creates a temporary view in Spark SQL called "flower_find" based on the DataFrame flower_df3.    
flower_df3.createOrReplaceTempView("flower_find");
song_find_text = spark.sql("SELECT * FROM flower_find")
# print(song_find_text)
flower_agg_write_stream = song_find_text \
        .writeStream \
        .trigger(processingTime='5 seconds') \
        .outputMode("append") \
        .option("truncate", "false") \
        .format("memory") \
        .queryName("testeddTable") \
        .start()

flower_agg_write_stream.awaitTermination(1)


root
 |-- flower: struct (nullable = true)
 |    |-- order_id: integer (nullable = true)
 |    |-- sepal_length: double (nullable = true)
 |    |-- sepal_length: double (nullable = true)
 |    |-- sepal_length: double (nullable = true)
 |    |-- sepal_length: double (nullable = true)
 |    |-- species: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)

root
 |-- order_id: integer (nullable = true)
 |-- sepal_length: double (nullable = true)
 |-- sepal_length: double (nullable = true)
 |-- sepal_length: double (nullable = true)
 |-- sepal_length: double (nullable = true)
 |-- species: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)



False

In [4]:
df = spark.sql("SELECT * FROM testeddTable")
df.show(3)

+--------+------------+------------+------------+------------+-------+---------+
|order_id|sepal_length|sepal_length|sepal_length|sepal_length|species|timestamp|
+--------+------------+------------+------------+------------+-------+---------+
+--------+------------+------------+------------+------------+-------+---------+



In [5]:
df_count = df.count()
df_count

0