In [1]:
import re
import pickle

from datetime import datetime, timezone

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import explode, split, col, upper, filter, udf, from_json, base64, decode, to_json, struct, encode
from pyspark.sql.types import ArrayType, MapType, StringType, TimestampType, IntegerType, StructType, StructField

In [2]:
# The Kafka source/sink connector is requested as a package when the session starts.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0")
    .master("local")
    .getOrCreate()
)

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load a sentiment_tagger.pkl that comes from a trusted source.
with open("sentiment_tagger.pkl", "rb") as f:
    model = pickle.load(f)

# Keep the Broadcast handle. The UDF has to read the model through `.value`;
# if the handle is dropped the broadcast is never used and the model is
# serialized into the UDF closure instead.
model_broadcast = spark.sparkContext.broadcast(model)


def tag_sentiment(s):
    """Label one piece of text as "POSITIVE" or "NEGATIVE".

    Args:
        s: The text to classify, or None.

    Returns:
        "POSITIVE" when the model's prediction for ``s`` equals 1,
        "NEGATIVE" for any other prediction, and None when ``s`` is None.
    """
    # A null body (e.g. a message that from_json could not parse) is passed
    # through as null instead of being handed to the model.
    # NOTE(review): assumes the model cannot handle None input - confirm.
    if s is None:
        return None
    prediction = model_broadcast.value.predict([s])
    return "POSITIVE" if prediction == 1 else "NEGATIVE"


# The return type is spelled out; StringType is also what udf() defaults to.
tag_sentiment_udf = udf(tag_sentiment, StringType())

In [None]:
# (name, type) pairs for the fields read out of each JSON message.
_POST_FIELDS = (
    ("id", IntegerType()),
    ("poster_id", IntegerType()),
    ("timestamp", TimestampType()),
    ("body", StringType()),
)

schema = StructType(
    [StructField(field_name, field_type) for field_name, field_type in _POST_FIELDS]
)


# Kafka broker address and the topics this notebook reads from / writes to.
KAFKA_HOST, KAFKA_PORT = "kafka", 9092
INPUT_KAFKA_TOPIC, OUTPUT_KAFKA_TOPIC = "posts", "sentiments"

# Tunables for the streaming job, kept in one place.
KAFKA_BOOTSTRAP_SERVERS = f"{KAFKA_HOST}:{KAFKA_PORT}"
CHECKPOINT_LOCATION = "/tmp/checkpoint"
TRIGGER_INTERVAL = "1 minutes"

# Raw Kafka records from the input topic.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
    .option("subscribe", INPUT_KAFKA_TOPIC)
    .load()
)

# Decode each message value as UTF-8 JSON and expand it into the schema's columns.
posts_parsed = (
    df
    .withColumn("json", from_json(decode(col("value"), "utf-8"), schema))
    .select("json.*")
    # NOTE(review): distinct() on a stream without a watermark keeps every row
    # seen so far as deduplication state, so state grows without bound.
    # Consider withWatermark("timestamp", ...) once an acceptable lateness is
    # agreed; it is not added here because it changes which rows are emitted.
    .distinct()
)

# One (id, sentiment) row per post.
sentiments = (
    posts_parsed
    .withColumn("sentiment", tag_sentiment_udf("body"))
    .select("id", "sentiment")
)

# Shape the rows for the Kafka sink: key = post id, value = JSON of the row.
# `id` is an integer column, so it is cast to string explicitly before being
# encoded. encode() already yields binary, so no further cast is needed.
kafka_records = sentiments.select(
    encode(col("id").cast("string"), "utf-8").alias("key"),
    encode(to_json(struct(col("id"), col("sentiment"))), "utf-8").alias("value"),
)

# Bind `query` to the StreamingQuery returned by start(). Chaining
# .awaitTermination() onto it would leave `query` as None and lose the handle
# needed to stop or inspect the stream.
query = (
    kafka_records.writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
    .option("topic", OUTPUT_KAFKA_TOPIC)
    .option("checkpointLocation", CHECKPOINT_LOCATION)
    .trigger(processingTime=TRIGGER_INTERVAL)
    .start()
)

# Blocks this cell until the stream stops or fails.
query.awaitTermination()