In [1]:
import re
import pickle

from datetime import datetime, timezone

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import explode, split, col, upper, filter, udf, from_json, base64, decode, to_json, struct, encode
from pyspark.sql.types import ArrayType, MapType, StringType, TimestampType, IntegerType, StructType, StructField

In [2]:
spark = SparkSession.builder.config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0").master("local").getOrCreate()

In [3]:
def remove_non_letters(s):
    regex = re.compile('[^a-zA-Z]')
    return regex.sub('', s)


remove_non_letters_udf = udf(remove_non_letters)

In [None]:
schema = StructType([
    StructField("id", IntegerType()),
    StructField("poster_id", IntegerType()),
    StructField("timestamp", TimestampType()),
    StructField("body", StringType()),
])


KAFKA_HOST = "kafka"
KAFKA_PORT = 9092
KAFKA_TOPIC = "posts"
RESULTS_DIRECTORY_PATH = "/home/jovyan/results"


def unique_words(df, batch_id):
    posts = df.withColumn("json", from_json(decode(col("value"), "utf-8"), schema)).select("json.*")
    result = posts\
    .distinct()\
    .select(explode(split(upper(col("body")), "\s+")))\
    .select(remove_non_letters_udf("col"))\
    .groupby("remove_non_letters(col)")\
    .count().sort(col("count").desc())
    result.show()
    timestamp = datetime.now(tz=timezone.utc).replace(microsecond=0).isoformat()
    result.write.csv(f"{RESULTS_DIRECTORY_PATH}/{timestamp}")
    
df = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", f"{KAFKA_HOST}:{KAFKA_PORT}") \
  .option("subscribe", KAFKA_TOPIC) \
  .load() 
query = df.writeStream \
    .foreachBatch(unique_words) \
    .trigger(processingTime="1 minutes") \
    .start().awaitTermination()

+-----------------------+-----+
|remove_non_letters(col)|count|
+-----------------------+-----+
+-----------------------+-----+

+-----------------------+-----+
|remove_non_letters(col)|count|
+-----------------------+-----+
|                  VISIT|   20|
|               WHATEVER|   20|
|                  ALLOW|   19|
|               PRESSURE|   19|
|               SOUTHERN|   19|
|             PRODUCTION|   19|
|                 FIGURE|   19|
|                    MAN|   18|
|               ELECTION|   18|
|               BUSINESS|   18|
|                     TV|   18|
|              RECOGNIZE|   17|
|               POLITICS|   17|
|                  PEACE|   17|
|                   TYPE|   17|
|                 GARDEN|   17|
|                  TOTAL|   17|
|                   STOP|   17|
|                  ENJOY|   17|
|                TEACHER|   17|
+-----------------------+-----+
only showing top 20 rows

