In [1]:
# Bootstrap step: findspark.init() is called before any pyspark import below.
# NOTE(review): presumably needed because pyspark is not on sys.path by default
# in this environment — confirm; if pyspark is pip-installed this may be redundant.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_extract
import os

In [3]:
# Directory scanned for JAR files to hand to Spark (relative to the
# notebook's current working directory).
jars_dir = "./"

# Build the comma-separated string passed to the `spark.jars` setting.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the resulting value identical from run to run. Only regular files are
# kept, so a directory that happens to end in ".jar" is not passed to Spark.
# If no JAR is found the result is an empty string.
jars = ",".join(
    os.path.join(jars_dir, name)
    for name in sorted(os.listdir(jars_dir))
    if name.endswith(".jar") and os.path.isfile(os.path.join(jars_dir, name))
)


In [4]:
# Create (or reuse) the Spark session for this notebook, registering the
# JAR list collected above through the `spark.jars` setting.
spark = (
    SparkSession.builder
    .appName("KafkaLogProcessor")
    .config("spark.jars", jars)
    .getOrCreate()
)

In [5]:
# Kafka connection settings.
# The defaults are the original hard-coded values; setting KAFKA_BROKERS or
# KAFKA_TOPIC in the environment overrides them without editing the notebook.
kafka_brokers = os.environ.get("KAFKA_BROKERS", "localhost:9092")  # bootstrap server list (host:port)
kafka_topic = os.environ.get("KAFKA_TOPIC", "data1")  # topic to read from

In [6]:
# Batch read (spark.read, not readStream) of the Kafka topic, bounded by the
# "earliest" starting offsets and the "latest" ending offsets.
raw_logs_df = (
    spark.read.format("kafka")
    .options(**{
        "kafka.bootstrap.servers": kafka_brokers,
        "subscribe": kafka_topic,
        "startingOffsets": "earliest",
        "endingOffsets": "latest",
    })
    .load()
)

In [7]:
# Keep only the record value, cast to a string and exposed as column "text".
logs_df = raw_logs_df.select(col("value").cast("string").alias("text"))

In [8]:
# Options forwarded to the Elasticsearch writer below.
es_options = {
    # Elasticsearch host and port.
    "es.nodes": "localhost",
    "es.port": "9200",
    # Ask the connector to create the index if it is missing.
    "es.index.auto.create": "true",
    # Target resource the rows are written to.
    "es.resource": "logs",
}

In [9]:
# Append the extracted log text to Elasticsearch using the options in es_options.
(
    logs_df.write
    .format("org.elasticsearch.spark.sql")
    .options(**es_options)
    .mode("append")
    .save()
)

In [10]:
# Shut down the Spark session created earlier in this notebook.
# NOTE(review): this only runs if every preceding cell succeeded; after a
# failed run the session stays alive until the cell is executed manually.
spark.stop() 