In [1]:
# Spark + Kafka + Elasticsearch (ES) streaming application 1
# Goal: connect Spark, Kafka and ES and transfer logs between them
# Kafka - streams the data coming from HWiNFO
# Spark - processes the data coming from Kafka
# ES    - stores the processed logs for future retrieval

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, split, trim
from pyspark.sql.types import StructType, DoubleType, StringType, IntegerType, StructField

In [2]:
# Create (or reuse) the Spark session for this notebook. The
# "spark.jars.packages" setting lists the Kafka source and Elasticsearch
# connector packages as Maven coordinates.
spark = (
    SparkSession.builder
    .appName("KafkaToElasticsearchHwinfoLogs")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2,org.elasticsearch:elasticsearch-spark-30_2.12:8.10.1",
    )
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook shows the SparkSession object as this cell's output.
spark

In [4]:
# Streaming source: subscribe to the "hwinfo" topic on the broker at
# kafka:9092, starting from the latest offsets. "failOnDataLoss" is set to
# "false" on the reader.
df_kafka = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "hwinfo")
    .option("startingOffsets", "latest")
    .option("failOnDataLoss", "false")
    .load()
)


In [5]:
# Schema for one HWiNFO log record, described as (column name, Spark type)
# pairs; every field is nullable.
# NOTE(review): no other visible cell references this schema, and its fourth
# field name ("Bus_Clock_MHz") differs from the "Ring_LLC_Clock_MHz" alias
# used when the Kafka payload is parsed — confirm which one is intended.
_hwinfo_columns = [
    ("Date", StringType()),
    ("Time", StringType()),
    ("Core_Clocks_avg_MHz", IntegerType()),
    ("Bus_Clock_MHz", DoubleType()),
    ("Core_Usage_avg_percent", DoubleType()),
    ("Core_Temperatures_avg_C", DoubleType()),
    ("CPU_Package_C", IntegerType()),
    ("CPU_Package_Power_W", DoubleType()),
]
hwinfo_schema = StructType(
    [StructField(name, dtype, True) for name, dtype in _hwinfo_columns]
)
# hwinfo_schema

In [6]:
# Select the Kafka `value` column cast to STRING; the resulting DataFrame is
# given the alias "strings".
json_data = (
    df_kafka
    .selectExpr("CAST(value AS STRING)")
    .alias("strings")
)

In [None]:
# Parse each Kafka record into one typed row of hardware metrics.
#
# The payload is handled as a comma-separated line that may be wrapped in
# list punctuation ([, ] and '); that punctuation is stripped before the
# line is split into fields.
# NOTE(review): the producer's exact message format is not visible in this
# notebook — confirm that field order matches the aliases below.

# Kafka value as a string column named "value".
df_parsed = json_data.selectExpr("CAST(value AS STRING)")

# Build the array of fields once instead of re-splitting the string for
# every output column.
fields = split(regexp_replace(col("value"), "[\\[\\]']", ""), ",")

# trim() drops whitespace left around each field (for example after ", "
# separators) so the text columns are stored clean and the numeric casts see
# bare numbers. A missing or non-numeric field yields null when Spark's
# ANSI mode is off (the default for the Spark version pinned in this
# notebook).
df_final = df_parsed.select(
    trim(fields[0]).alias("Date"),
    trim(fields[1]).alias("Time"),
    trim(fields[2]).cast("FLOAT").alias("Core_Clocks_avg_MHz"),
    trim(fields[3]).cast("FLOAT").alias("Ring_LLC_Clock_MHz"),
    trim(fields[4]).cast("FLOAT").alias("Core_Usage_avg_percent"),
    trim(fields[5]).cast("FLOAT").alias("Core_Temperatures_avg_C"),
    trim(fields[6]).cast("FLOAT").alias("CPU_Package_C"),
    trim(fields[7]).cast("FLOAT").alias("CPU_Package_Power_W"),
)

In [None]:
# Elasticsearch connector settings: target node and port, destination
# resource name, and SSL switched off.
es_sink_options = {
    "es.nodes": "elasticsearch",
    "es.port": "9200",
    "es.resource": "hwinfo",
    "es.net.ssl": "false",
}

# Start the streaming write of the parsed rows in append mode; the query
# keeps its checkpoint data under /tmp/spark-checkpoints.
es_query = (
    df_final.writeStream
    .outputMode("append")
    .format("org.elasticsearch.spark.sql")
    .options(**es_sink_options)
    .option("checkpointLocation", "/tmp/spark-checkpoints")
    .start()
)

In [None]:
# df_final.writeStream \
#     .format("org.elasticsearch.spark.sql") \
#     .option("es.nodes", "elasticsearch") \
#     .option("es.port", "9200") \
#     .option("es.resource", "hwinfo") \
#     .option("es.net.ssl", "false") \
#     .option("checkpointLocation", "/tmp/spark-checkpoints") \
#     .start()

In [11]:
# query = df_final.writeStream.outputMode("append").format("console").start()
# query = df_parsed.writeStream.outputMode("append").format("console").start()