In [1]:
import findspark
findspark.init()


In [2]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.1.1,org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1 pyspark-shell'


In [3]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, udf, avg, max, window
from pyspark.sql.types import StringType, TimestampType, FloatType

import json
from datetime import datetime

from afinn import Afinn
spark = SparkSession.builder \
                    .master("local[3]") \
                    .config('spark.executor.instances', 3) \
                    .config("spark.sql.catalogImplementation", "hive")\
                    .appName('tweets') \
                    .getOrCreate()
ssc = StreamingContext(spark.sparkContext, 1)


:: loading settings :: url = jar:file:/usr/local/Cellar/apache-spark/3.1.1/libexec/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/yhua/.ivy2/cache
The jars for the packages stored in: /Users/yhua/.ivy2/jars
org.apache.spark#spark-streaming-kafka-0-10_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5a466847-2ed8-4268-b395-319f02a380e3;1.0
	confs: [default]
	found org.apache.spark#spark-streaming-kafka-0-10_2.12;3.1.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.1.1 in central
	found org.apache.kafka#kafka-clients;2.6.0 in central
	found com.github.luben#zstd-jni;1.4.8-1 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.2 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.1.1 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
:: resolution report :: resolve 470ms :: artifacts

In [4]:
spark.sparkContext.setLogLevel("ERROR")

In [5]:
tweetsDfRaw = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "localhost:9092") \
  .option("subscribe", "test1") \
  .load()

In [6]:
tweetsDf = tweetsDfRaw.selectExpr("CAST(value AS STRING) as tweet_info")

In [7]:
def extract_tags(word: str):
    if word.lower().startswith("#"):
        return word
    else:
        return "nonTag"

In [8]:
def extract_tweet_text(text):
    tweet_info = json.loads(text)
    return tweet_info["tweet"]

def extract_timestamp(text):
    tweet_info = json.loads(text)
    created_at = tweet_info["created_at"]
    return datetime.strptime(created_at[:-5], '%Y-%m-%dT%H:%M:%S')

In [9]:
afinn = Afinn()

def add_sentiment_score(text):
    sentiment_score = afinn.score(text)
    return sentiment_score

def add_sentiment_status(text):
    sentiment_score = afinn.score(text)

    try:
        if sentiment_score < 0:
            return "negative"
        elif sentiment_score == 0:
            return "neutral"
        else:
            return "positive"

    except TypeError:
        return "error"
        

In [10]:

# extract_tags_udf = udf(extract_tags, StringType())

# resultDf = words.filter(words.word.isNotNull()).withColumn("tags", extract_tags_udf(words.word))

# hashtagCountsDf = resultDf.filter(resultDf.tags != "nonTag")\
#                             .groupBy("tags")\
#                             .count()\
#                             .orderBy("count", ascending=False)

In [11]:
extract_tweet_text_udf = udf(extract_tweet_text, StringType())
extract_timestamp_udf = udf(extract_timestamp, TimestampType())

add_sentiment_score_udf = udf(add_sentiment_score, FloatType())
add_sentiment_status_udf = udf(add_sentiment_status, StringType())

In [12]:
tweetsDf = tweetsDf.withColumn(
    "tweet",
    extract_tweet_text_udf(tweetsDf.tweet_info)
)\
    .withColumn(
        "event_time",
        extract_timestamp_udf(tweetsDf.tweet_info)
    )\
    .withColumn(
        "sentiment_status",
        add_sentiment_status_udf(tweetsDf.tweet_info)
    )\
    .withColumn(
        "sentiment_score",
        add_sentiment_score_udf(tweetsDf.tweet_info)
    )
windowedDf = tweetsDf.groupBy(window(tweetsDf.event_time, "30 seconds", "10 seconds"))\
                     .agg(avg("sentiment_score").alias("avg_sentiment_score"),
                     max("event_time").alias("max_event_time"))

orderedWindowDf = windowedDf.orderBy(windowedDf.window.end)

In [13]:
query = orderedWindowDf.writeStream \
.outputMode("complete") \
.format("console")\
.option("truncate", "false")\
.start()\
.awaitTermination()


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------+-------------------+--------------+
|window|avg_sentiment_score|max_event_time|
+------+-------------------+--------------+
+------+-------------------+--------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+-------------------+-------------------+
|window                                    |avg_sentiment_score|max_event_time     |
+------------------------------------------+-------------------+-------------------+
|{2022-09-28 20:35:20, 2022-09-28 20:35:50}|0.625              |2022-09-28 20:35:49|
|{2022-09-28 20:35:30, 2022-09-28 20:36:00}|0.8888888888888888 |2022-09-28 20:35:51|
|{2022-09-28 20:35:40, 2022-09-28 20:36:10}|0.8888888888888888 |2022-09-28 20:35:51|
|{2022-09-28 20:35:50, 2022-09-28 20:36:20}|3.0                |2022-09-28 20:35:51|
+------------------------------------------+-------------------+-------------------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------+-------------------+-------------------+
|window                                    |avg_sentiment_score|max_event_time     |
+------------------------------------------+-------------------+-------------------+
|{2022-09-28 20:35:20, 2022-09-28 20:35:50}|0.625              |2022-09-28 20:35:49|
|{2022-09-28 20:35:30, 2022-09-28 20:36:00}|0.8333333333333334 |2022-09-28 20:35:56|
|{2022-09-28 20:35:40, 2022-09-28 20:36:10}|0.8333333333333334 |2022-09-28 20:35:56|
|{2022-09-28 20:35:50, 2022-09-28 20:36:20}|0.9375             |2022-09-28 20:35:56|
+------------------------------------------+-------------------+-------------------+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+------------------------------------------+-------------------+-------------------+
|window                                    |avg_sentiment_score|max_event_time     |
+------------------------------------------+-------------------+-------------------+
|{2022-09-28 20:35:20, 2022-09-28 20:35:50}|0.625              |2022-09-28 20:35:49|
|{2022-09-28 20:35:30, 2022-09-28 20:36:00}|0.6666666666666666 |2022-09-28 20:35:59|
|{2022-09-28 20:35:40, 2022-09-28 20:36:10}|0.6                |2022-09-28 20:36:01|
|{2022-09-28 20:35:50, 2022-09-28 20:36:20}|0.5909090909090909 |2022-09-28 20:36:01|
|{2022-09-28 20:36:00, 2022-09-28 20:36:30}|0.0                |2022-09-28 20:36:01|
+------------------------------------------+-------------------+-------------------+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+------------------------------------------+---------------------+-------------------+
|window                                    |avg_sentiment_score  |max_event_time     |
+------------------------------------------+---------------------+-------------------+
|{2022-09-28 20:35:20, 2022-09-28 20:35:50}|0.625                |2022-09-28 20:35:49|
|{2022-09-28 20:35:30, 2022-09-28 20:36:00}|0.6666666666666666   |2022-09-28 20:35:59|
|{2022-09-28 20:35:40, 2022-09-28 20:36:10}|0.3541666666666667   |2022-09-28 20:36:08|
|{2022-09-28 20:35:50, 2022-09-28 20:36:20}|0.2857142857142857   |2022-09-28 20:36:11|
|{2022-09-28 20:36:00, 2022-09-28 20:36:30}|-0.043478260869565216|2022-09-28 20:36:11|
|{2022-09-28 20:36:10, 2022-09-28 20:36:40}|0.0                  |2022-09-28 20:36:11|
+------------------------------------------+---------------------+-------------------+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+------------------------------------------+--------------------+-------------------+
|window                                    |avg_sentiment_score |max_event_time     |
+------------------------------------------+--------------------+-------------------+
|{2022-09-28 20:35:20, 2022-09-28 20:35:50}|0.625               |2022-09-28 20:35:49|
|{2022-09-28 20:35:30, 2022-09-28 20:36:00}|0.6666666666666666  |2022-09-28 20:35:59|
|{2022-09-28 20:35:40, 2022-09-28 20:36:10}|0.3541666666666667  |2022-09-28 20:36:08|
|{2022-09-28 20:35:50, 2022-09-28 20:36:20}|0.2222222222222222  |2022-09-28 20:36:16|
|{2022-09-28 20:36:00, 2022-09-28 20:36:30}|-0.02857142857142857|2022-09-28 20:36:16|
|{2022-09-28 20:36:10, 2022-09-28 20:36:40}|0.0                 |2022-09-28 20:36:16|
+------------------------------------------+--------------------+-------------------+



KeyboardInterrupt: 