In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, DoubleType, StringType, IntegerType, StructField
from pyspark.sql.functions import col, split, trim
from pyspark.sql.functions import regexp_replace

In [12]:
# Connector artifacts requested through spark.jars.packages: the Kafka
# integration for Structured Streaming and the Elasticsearch Spark connector.
spark_packages = (
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2,"
    "org.elasticsearch:elasticsearch-spark-30_2.12:8.10.1"
)

# Reuse an already running session if there is one, otherwise create it.
session_builder = SparkSession.builder.appName("KafkaToElasticsearchHwinfoLogs")
session_builder = session_builder.config("spark.jars.packages", spark_packages)
spark = session_builder.getOrCreate()

In [13]:
# Bare expression: as the last line of the cell it makes the notebook display
# the session object, which is a quick check that the session was created.
spark

In [14]:
# Settings for the streaming Kafka source that carries the raw HWiNFO records.
kafka_source_options = {
    "kafka.bootstrap.servers": "kafka:9092",
    "subscribe": "hwinfo",
    "startingOffsets": "latest",
    "failOnDataLoss": "false",
}

# Streaming DataFrame over the "hwinfo" topic; nothing is read until a query
# is started on top of it.
df_kafka = spark.readStream.format("kafka").options(**kafka_source_options).load()


In [15]:
# (field name, Spark type) pairs describing one HWiNFO record, in column order.
# NOTE(review): this schema is not referenced by any cell visible here, and it
# disagrees with the parsing cell, which names the fourth field
# "Ring_LLC_Clock_MHz" and casts every numeric field to FLOAT - confirm which
# definition is the intended one.
_hwinfo_field_types = [
    ("Date", StringType),
    ("Time", StringType),
    ("Core_Clocks_avg_MHz", IntegerType),
    ("Bus_Clock_MHz", DoubleType),
    ("Core_Usage_avg_percent", DoubleType),
    ("Core_Temperatures_avg_C", DoubleType),
    ("CPU_Package_C", IntegerType),
    ("CPU_Package_Power_W", DoubleType),
]

# Every field is declared nullable.
hwinfo_schema = StructType(
    [
        StructField(field_name, spark_type(), True)
        for field_name, spark_type in _hwinfo_field_types
    ]
)
# hwinfo_schema

In [16]:
# Kafka hands over the message payload in binary form; cast it to text so it
# can be parsed. The resulting column is still called "value".
payload_text = df_kafka.selectExpr("CAST(value AS STRING)")

# Note: DataFrame.alias names the DataFrame itself ("strings"), not the column.
json_data = payload_text.alias("strings")

In [17]:
# `value` was already cast to a string when `json_data` was built, so it is
# selected as-is instead of being cast a second time.
df_parsed = json_data.select(col("value"))

# Layout of one record: (output column, cast target) in payload order.
# A cast target of None keeps the field as text.
hwinfo_field_casts = [
    ("Date", None),
    ("Time", None),
    ("Core_Clocks_avg_MHz", "FLOAT"),
    ("Ring_LLC_Clock_MHz", "FLOAT"),
    ("Core_Usage_avg_percent", "FLOAT"),
    ("Core_Temperatures_avg_C", "FLOAT"),
    ("CPU_Package_C", "FLOAT"),
    ("CPU_Package_Power_W", "FLOAT"),
]

# Strip the list punctuation ([ ] and single quotes) and split on commas once,
# rather than repeating the same split expression for every output column.
payload_fields = split(regexp_replace(col("value"), "[\\[\\]']", ""), ",")

# Each field is trimmed before use. The punctuation being stripped suggests the
# payload is a stringified list with ", " separators (presumably - verify
# against the producer); without the trim the text columns Date/Time would
# keep a leading space. A record with fewer fields than expected yields nulls
# for the missing positions under Spark's default (non-ANSI) settings.
df_final = df_parsed.select(
    [
        (
            trim(payload_fields[position])
            if cast_to is None
            else trim(payload_fields[position]).cast(cast_to)
        ).alias(column_name)
        for position, (column_name, cast_to) in enumerate(hwinfo_field_casts)
    ]
)

# df_final

In [18]:
# Two streaming queries (sinks) are started from here.

In [19]:
# Sink 1: write the parsed records straight to Elasticsearch so they can be
# queried later.
#   - use resource "hwinfo" for production
#   - use resource "hwinfo_test" for testing
# NOTE(review): the checkpoint directory does not depend on the resource, so
# test and production runs would share the same checkpoint - confirm that this
# is intended before switching the resource.
es_writer = (
    df_final.writeStream
    .outputMode("append")
    .format("org.elasticsearch.spark.sql")
    .option("es.nodes", "elasticsearch")
    .option("es.port", "9200")
    .option("es.resource", "hwinfo_test")
    .option("es.net.ssl", "false")
    .option("checkpointLocation", "/tmp/spark-checkpoints")
)
es_query = es_writer.start()

# Sink 2: feed the real-time dashboard through a second Kafka topic. Each row
# is packed into a single JSON string column named "value" before writing.
dashboard_messages = df_final.selectExpr("to_json(struct(*)) AS value")
kafka_writer = (
    dashboard_messages.writeStream
    .outputMode("append")
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "hwinfo_logs_RT")
    .option("checkpointLocation", "/tmp/spark-checkpoints-RT")
)
kafka_query = kafka_writer.start()

# under testing 

# Write messages to console
# query = df_final.writeStream \
#     .outputMode("append") \
#     .format("console") \
#     .start()


# Stream sink 2 (earlier attempt): serves the real-time dashboard by streaming df_final to a Kafka topic as-is.
# kafka_query = df_final.writeStream \
#     .outputMode("append") \
#     .format("kafka") \
#     .option("kafka.bootstrap.servers", "kafka:9092") \
#     .option("topic", "hwinfo_logs_RT") \
#     .option("checkpointLocation", "/tmp/spark-checkpoints-RT") \
#     .start()

# The commented-out query above is expected to fail: the Kafka sink needs a `value` column to use as the message, and df_final does not have one.

In [20]:
# Shut down both streaming queries, Elasticsearch sink first, then Kafka sink.
for running_query in (es_query, kafka_query):
    running_query.stop()

# es_query = df_final.writeStream \
#     .outputMode("append") \
#     .format("org.elasticsearch.spark.sql") \
#     .option("es.nodes", "elasticsearch") \
#     .option("es.port", "9200") \
#     .option("es.resource", "hwinfo") \
#     .option("es.net.ssl", "false") \
#     .option("checkpointLocation", "/tmp/spark-checkpoints") \
#     .start()