In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, concat, col, lit
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType
from time import sleep

# Cluster address, application name and resource settings for this lab.
# Each SparkConf setter returns the conf object, so the calls are chained.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Lab9_9")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# Build the SparkSession from that conf (or reuse an already running one);
# the session is the entry point to the Spark SQL engine.
spark = (
    SparkSession.builder
    .config(conf=sparkConf)
    .getOrCreate()
)
# Open a streaming (not batch) read on the Kafka topic "word", starting from
# the earliest available offsets.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka1:9093")
    .option("failOnDataLoss", "false")
    .option("subscribe", "word")
    .option("startingOffsets", "earliest")
    .load()
)

# Keep only the record's `value` column, cast to a string.
lines = df.selectExpr("CAST(value AS STRING)")

# Split every message on single spaces and emit one row per resulting token.
words = lines.select(explode(split(col("value"), " ")).alias("word"))

# Number of occurrences of each distinct word.
wordCounts = words.groupBy("word").count()

# Render each aggregate row as "<word> <count>" in a single `value` column and
# start a streaming write of it to the Kafka topic "wordcount", using
# "complete" output mode and a checkpoint directory on the local filesystem.
query = (
    wordCounts
    .select(concat(col("word"), lit(" "), col("count")).alias("value"))
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka1:9093")
    .option("failOnDataLoss", "false")
    .option("checkpointLocation", "/home/jovyan/checkpoint/wordcount")
    .option("topic", "wordcount")
    .outputMode("complete")
    .start()
)

# Block until the streaming query ends. If the wait is interrupted from the
# notebook or fails, shut down both the query and the Spark session and report
# what happened.
try:
    query.awaitTermination()
except (KeyboardInterrupt, Exception) as exc:
    # KeyboardInterrupt is not an Exception subclass, so it is listed
    # explicitly; other BaseExceptions (e.g. SystemExit) are left to propagate
    # instead of being swallowed by a bare `except:`.
    try:
        query.stop()
    finally:
        # Stop the Spark session even if stopping the query itself raised.
        spark.stop()
    if not isinstance(exc, KeyboardInterrupt):
        print("Unexpected error")
        # Show the cause so the failure can be diagnosed from the cell output.
        print(f"{type(exc).__name__}: {exc}")
    print("Stoped the streaming query and the spark context")

In [17]:
# Stop the SparkSession created in the first cell. The previous cell only stops
# it when waiting on the streaming query is interrupted or raises.
spark.stop()