In [None]:
from pyspark.streaming import StreamingContext

# Create a StreamingContext(sc stands for SparkContext) with batch interval of 5 seconds
ssc = StreamingContext(sc, 5)#5 is btach size

# Create a DStream that will connect to localhost at port 9999
# Start Netcat server: nc -lk 9999 
lines = ssc.socketTextStream('localhost', 9999)

# Split each line into words
words = lines.flatMap(lambda line: line.split(" "))#sequqnce of rdds

# Count each word in each batch
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Print the first ten elements of each RDD generated in this DStream to the console
#mark 2 streams as outputstreams
lines.pprint()#parallel print(action in rdd is blocking:drive program; output operation is not blocking)
wordCounts.pprint()

#non-blocking(create a new thread)
ssc.start()  # Start the computation
print("Start")
ssc.awaitTermination(20)  # Wait for the computation to terminate(blocking: pause the main program for 20 sec)
ssc.stop(stopSparkContext=False)  # Stop the StreamingContext without stopping the SparkContext(don't want sc to be deleted: cannot rerun)

print("Finished")

In [1]:
from pyspark.streaming import StreamingContext

# Create a queue of RDDs
rdd = sc.textFile('adj_noun_pairs.txt', 8)

# split the rdd into 5 equal-size parts
rddQueue = rdd.randomSplit([1,1,1,1,1], 123)
        
# Create a StreamingContext with batch interval of 5 seconds
ssc = StreamingContext(sc, 5)

# Feed the rdd queue to a DStream
lines = ssc.queueStream(rddQueue)

# Do word-counting as before
words = lines.flatMap(lambda line: line.split(" "))
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Use transform() to access any rdd transformations not directly available in SparkStreaming
topWords = wordCounts.transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))
topWords.pprint()

ssc.start()  # Start the computation
ssc.awaitTermination(25)  # Wait for the computation to terminate
ssc.stop(False)
print("Finished")

                                                                                

-------------------------------------------
Time: 2023-05-17 16:59:45
-------------------------------------------
('other', 15486)
('first', 10815)
('many', 9773)
('new', 6272)
('system', 5063)
('american', 4744)
('several', 4545)
('century', 4492)
('same', 4394)
('=', 4313)
...



                                                                                

-------------------------------------------
Time: 2023-05-17 16:59:50
-------------------------------------------
('other', 15319)
('first', 10709)
('many', 9575)
('new', 6242)
('system', 5111)
('american', 4777)
('century', 4538)
('several', 4515)
('same', 4453)
('=', 4361)
...



                                                                                

-------------------------------------------
Time: 2023-05-17 16:59:55
-------------------------------------------
('other', 15346)
('first', 10517)
('many', 9706)
('new', 6218)
('system', 5266)
('american', 4940)
('several', 4615)
('=', 4451)
('century', 4438)
('same', 4270)
...



                                                                                

-------------------------------------------
Time: 2023-05-17 17:00:00
-------------------------------------------
('other', 15196)
('first', 10617)
('many', 9848)
('new', 6289)
('system', 5093)
('american', 4824)
('several', 4519)
('century', 4493)
('=', 4332)
('same', 4260)
...



                                                                                

-------------------------------------------
Time: 2023-05-17 17:00:05
-------------------------------------------
('other', 15091)
('first', 10463)
('many', 9703)
('new', 6175)
('system', 5102)
('american', 4829)
('several', 4523)
('century', 4520)
('=', 4369)
('same', 4325)
...

Finished


In [2]:
rdd = sc.textFile('adj_noun_pairs.txt', 8)
rdd.take(10)

['early radical',
 'french revolution',
 'pejorative way',
 'violent means',
 'positive label',
 'self-defined anarchist',
 'political philosophy',
 'differ interpretation',
 'relate movement',
 'social movement']

In [1]:
# Find the most positive words in windows of 5 seconds from streaming data

from pyspark.streaming import StreamingContext

def parse_line(l):
    x = l.split("\t")
    return (x[0], float(x[1]))

word_sentiments = sc.textFile("AFINN-111.txt") \
                    .map(parse_line).cache()
    
ssc = StreamingContext(sc, 5)
rdd = sc.textFile('adj_noun_pairs.txt', 8)
rddQueue = rdd.randomSplit([1,1,1,1,1], 123)
lines = ssc.queueStream(rddQueue)

#do word count on Dstream
word_counts = lines.flatMap(lambda line: line.split(" ")) \
                   .map(lambda word: (word, 1)) \
                   .reduceByKey(lambda a, b: a + b)

# Determine the words with the highest sentiment values by joining the streaming RDD
# with the static RDD inside the transform() method and then multiplying
# the frequency of the words by its sentiment value
happiest_words = word_counts.transform(lambda rdd: word_sentiments.join(rdd)) \
                            .map(lambda t:
                                 (t[1][0] * t[1][1], t[0])) \
                            .transform(lambda rdd: rdd.sortByKey(False))

happiest_words.pprint()

ssc.start()
ssc.awaitTermination(25)
ssc.stop(False)
print("Finished")

                                                                                

-------------------------------------------
Time: 2023-04-25 23:43:45
-------------------------------------------
(7890.0, 'great')
(6180.0, 'popular')
(5544.0, 'best')
(4662.0, 'good')
(4242.0, 'important')
(2340.0, 'strong')
(2322.0, 'greater')
(2058.0, 'successful')
(1850.0, 'novel')
(1790.0, 'natural')
...



                                                                                

-------------------------------------------
Time: 2023-04-25 23:43:50
-------------------------------------------
(7578.0, 'great')
(6126.0, 'popular')
(5604.0, 'best')
(4749.0, 'good')
(4172.0, 'important')
(2253.0, 'greater')
(2252.0, 'strong')
(2001.0, 'successful')
(1948.0, 'novel')
(1804.0, 'natural')
...



                                                                                

-------------------------------------------
Time: 2023-04-25 23:43:55
-------------------------------------------
(7893.0, 'great')
(6009.0, 'popular')
(5574.0, 'best')
(4677.0, 'good')
(4298.0, 'important')
(2326.0, 'strong')
(2172.0, 'greater')
(1944.0, 'successful')
(1868.0, 'novel')
(1761.0, 'natural')
...



                                                                                

-------------------------------------------
Time: 2023-04-25 23:44:00
-------------------------------------------
(7776.0, 'great')
(5925.0, 'popular')
(5538.0, 'best')
(4671.0, 'good')
(4156.0, 'important')
(2362.0, 'strong')
(2205.0, 'greater')
(1932.0, 'successful')
(1892.0, 'novel')
(1803.0, 'natural')
...



                                                                                

-------------------------------------------
Time: 2023-04-25 23:44:05
-------------------------------------------
(7566.0, 'great')
(5916.0, 'popular')
(5334.0, 'best')
(4548.0, 'good')
(4268.0, 'important')
(2415.0, 'greater')
(2314.0, 'strong')
(2109.0, 'successful')
(2018.0, 'novel')
(1821.0, 'natural')
...

Finished


In [1]:
from pyspark.streaming import StreamingContext

ssc = StreamingContext(sc, 5)#5 is batch size
# Provide a checkpointing directory. Required for stateful transformations
ssc.checkpoint("checkpoint")

numPartitions = 8
rdd = sc.textFile('adj_noun_pairs.txt', numPartitions)
rddQueue = rdd.randomSplit([1]*10, 123)
lines = ssc.queueStream(rddQueue)

def updateFunction(newValues,runningValue):
    #newavlues:list of strings, runningValue: current longest string
    if runningValue is None:
        runningValue = ""
    for value in newValues:#running values is empty/non empty
        if len(value)>len(runningValue):
            runningValue = value
    return runningValue

#clean the data
cleaned_line = lines.map(lambda line: line.split()).filter(lambda line:len(line)==2)
stateDstream = cleaned_line.updateStateByKey(updateFunction)#key is adj: same keys are updated together

# Print the results
stateDstream.foreachRDD(lambda rdd: print(rdd.take(5)))
stateDstream.foreachRDD(lambda rdd: print(rdd.lookup("social")[0]))

ssc.start()
ssc.awaitTermination(50)
ssc.stop(False)
print("Finished")

23/05/17 19:42:49 WARN QueueInputDStream: queueStream doesn't support checkpointing
23/05/17 19:42:50 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

[('social', 'constructionism'), ('desirable', 'characteristic'), ('other', 'socialist-revolutionary'), ('nonviolent', 'resistance'), ('earthly', 'consideration')]
constructionism
23/05/17 19:42:53 WARN QueueInputDStream: queueStream doesn't support checkpointing
23/05/17 19:42:55 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

[('social', 'constructionism'), ('desirable', 'characteristic'), ('other', 'socialist-revolutionary'), ('nonviolent', 'communication'), ('earthly', 'consideration')]
constructionism
23/05/17 19:42:57 WARN QueueInputDStream: queueStream doesn't support checkpointing
23/05/17 19:43:00 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

[('social', 'constructionism'), ('desirable', 'characteristic'), ('other', 'socialist-revolutionary'), ('nonviolent', 'communication'), ('earthly', 'consideration')]
constructionism
23/05/17 19:43:02 WARN QueueInputDStream: queueStream doesn't support checkpointing
23/05/17 19:43:05 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

[('social', 'constructionism'), ('desirable', 'characteristic'), ('other', 'socialist-revolutionary'), ('nonviolent', 'communication'), ('earthly', 'consideration')]
constructionism
23/05/17 19:43:07 WARN QueueInputDStream: queueStream doesn't support checkpointing
23/05/17 19:43:10 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

[('social', 'realism/constructivism'), ('desirable', 'characteristic'), ('other', 'socialist-revolutionary'), ('nonviolent', 'communication'), ('earthly', 'consideration')]




realism/constructivism
23/05/17 19:43:12 WARN QueueInputDStream: queueStream doesn't support checkpointing
23/05/17 19:43:15 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

[('social', 'realism/constructivism'), ('desirable', 'characteristic'), ('other', 'socialist-revolutionary'), ('nonviolent', 'communication'), ('earthly', 'consideration')]
realism/constructivism
23/05/17 19:43:17 WARN QueueInputDStream: queueStream doesn't support checkpointing
23/05/17 19:43:20 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

[('social', 'realism/constructivism'), ('desirable', 'characteristic'), ('other', 'patriarchates/patriarchs'), ('nonviolent', 'communication'), ('earthly', 'consideration')]




realism/constructivism
23/05/17 19:43:22 WARN QueueInputDStream: queueStream doesn't support checkpointing
23/05/17 19:43:25 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

[('social', 'realism/constructivism'), ('desirable', 'characteristic'), ('other', 'entertainers-turned-governor'), ('nonviolent', 'segregationist'), ('earthly', 'consideration')]
realism/constructivism
23/05/17 19:43:27 WARN QueueInputDStream: queueStream doesn't support checkpointing
23/05/17 19:43:30 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

[('social', 'realism/constructivism'), ('desirable', 'characteristic'), ('other', 'entertainers-turned-governor'), ('nonviolent', 'segregationist'), ('earthly', 'consideration')]




realism/constructivism
23/05/17 19:43:32 WARN QueueInputDStream: queueStream doesn't support checkpointing
23/05/17 19:43:35 WARN QueueInputDStream: queueStream doesn't support checkpointing


                                                                                

[('social', 'realism/constructivism'), ('desirable', 'characteristic'), ('other', 'entertainers-turned-governor'), ('nonviolent', 'segregationist'), ('earthly', 'consideration')]
realism/constructivism
23/05/17 19:43:37 WARN QueueInputDStream: queueStream doesn't support checkpointing
23/05/17 19:43:39 WARN BatchedWriteAheadLog: BatchedWriteAheadLog Writer queue interrupted.
Finished


In [None]:
from pyspark.streaming import StreamingContext

# Stateful word count

ssc = StreamingContext(sc, 5)
# Provide a checkpointing directory.  Required for stateful transformations(restore state from previous time)
ssc.checkpoint("checkpoint")

rdd = sc.textFile('adj_noun_pairs.txt', 8)
rddQueue = rdd.randomSplit([1]*10, 123)#10 batches, each batch is smaller
lines = ssc.queueStream(rddQueue)

#newValues a list of size 1 for word count; runningCount is current count
#if key doesn't exist in the state
def updateFunc(newValues, runningCount):
    if runningCount is None:
        runningCount = 0
    return sum(newValues, runningCount)#add new count to the running count; 
    #sum(list, addition value added to the final sum)
    # add the new values with the previous running count to get the new count

#other rdd(after every batch, therer is a new state)(how the state is updated after each batch, initial is empty)
running_counts = lines.flatMap(lambda line: line.split(" "))\
                      .map(lambda word: (word, 1))\
                      .updateStateByKey(updateFunc)#same keys are grouped together

counts_sorted = running_counts.transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))

def printResults(rdd):
    print("Total distinct words: ", rdd.count())
    print(rdd.take(5))
    print('refinery:', rdd.lookup('refinery')[0])

counts_sorted.foreachRDD(printResults)

ssc.start()
ssc.awaitTermination(50)
ssc.stop(False)
print("Finished")

In [None]:
# MG algorithm for approximate word count

from pyspark.streaming import StreamingContext

k = 10000
threshold = 0
total_decrement = 0

ssc = StreamingContext(sc, 5)
# Provide a checkpointing directory.  Required for stateful transformations
ssc.checkpoint("checkpoint")

rdd = sc.textFile('../data/adj_noun_pairs.txt', 8)
rddQueue = rdd.randomSplit([1]*10, 123)
lines = ssc.queueStream(rddQueue)

def updateFunc(newValues, runningCount):
    if runningCount is None:
        runningCount = 0
    newValue = sum(newValues, runningCount) - threshold#reduce the size of the state
    return newValue if newValue > 0 else None
    # add the new values with the previous running count to get the new count

running_counts = lines.flatMap(lambda line: line.split(" "))\
                      .map(lambda word: (word, 1))\
                      .reduceByKey(lambda a, b: a + b) \
                      .updateStateByKey(updateFunc)#stateful wordcount as before
            
counts_sorted = running_counts.transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))# sort it based on the counts

def printResults(rdd):
    global threshold, total_decrement 
    rdd.cache()
    print("Total distinct words: ", rdd.count())
    print(rdd.map(lambda x: (x[0], x[1], x[1]+total_decrement)).take(5))#lower bound, upper bound[x[1], x[1]+total_decrement]
    lower_bound = rdd.lookup('refinery')
    if len(lower_bound) > 0:
        lower_bound = lower_bound[0]
    else:
        lower_bound = 0
    print('refinery:', lower_bound, ',', lower_bound + total_decrement)
    if rdd.count() > k:#
        threshold = rdd.zipWithIndex().map(lambda x: (x[1], x[0])).lookup(k)[0][1]#index will be the value, swap key value(index, count)max_count become new threshold
    else:
        threhold = 0
    print("Next threshold = ", threshold)
    total_decrement += threshold
    rdd.unpersist()

counts_sorted.foreachRDD(printResults)

ssc.start()
ssc.awaitTermination(50)
ssc.stop(False)
print("Finished")

In [None]:
from pyspark.streaming import StreamingContext

# Create a queue of RDDs
rddQueue = []
for i in range(5):
    rdd = sc.parallelize([i, i, i, i, i])
    rddQueue += [rdd]
        
# Create a StreamingContext with batch interval of 3 seconds
ssc = StreamingContext(sc, 3)

ssc.checkpoint("checkpoint")

# Feed the rdd queue to a DStream
nums = ssc.queueStream(rddQueue)

# Compute the sum over a sliding window of 9 seconds for every 3 seconds
# slidingSum = nums.reduceByWindow(lambda x, y: x + y, None, 9, 3)
slidingSum = nums.reduceByWindow(lambda x, y: x + y, lambda x, y: x - y, 9, 3) #function itself and inverse function;
# sliding window size 3;wondow length is 9

slidingSum.pprint()

ssc.start()  # Start the computation
ssc.awaitTermination(24)  # Wait for the computation to terminate
ssc.stop(False)
print("Finished")

In [21]:
# Word count using structured streaming: Complete mode vs update mode
#spark streaming is incorporated with spark sql
from pyspark.sql.functions import *

lines = spark\
        .readStream\
        .format('socket')\
        .option('host', 'localhost')\
        .option('port', '9999')\
        .load()
        
# Split the lines into words, retaining timestamps
# split() splits each line into an array, and explode() turns the array into multiple rows(#covert multiple columns into rows) 
words = lines.select(explode(split(lines.value, ' ')).alias('word'))

word_counts = words.groupBy('word').count()#very simple compared with previous task

# Start running the query
#.option('truncate', 'false')#default truncate to 20 rows\
#.trigger(processingTime='5 seconds') \#trigger every 5 seconds; it's complete mode
query = word_counts\
        .writeStream\
        .outputMode('update')\
        .format('console')\
        .option('truncate', 'false')\
        .start()
#.trigger(processingTime='5 seconds') \
# try update mode; append mode not supported

query.awaitTermination(25)
query.stop()
print("Finished")

23/04/26 10:57:17 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.
23/04/26 10:57:17 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/h0/cr6ykp052gngl6yqd4mtpvtm0000gn/T/temporary-a70b4ed2-ecd1-4ca4-87f8-abc0bf9def01. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/04/26 10:57:17 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----+-----+
|word|count|
+----+-----+
+----+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-----+-----+
|word |count|
+-----+-----+
|happy|1    |
+-----+-----+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+-----+-----+
|word |count|
+-----+-----+
|happy|2    |
|best |1    |
+-----+-----+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+----+-----+
|word|count|
+----+-----+
|best|2    |
+----+-----+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+-----+-----+
|word |count|
+-----+-----+
|happy|3    |
+-----+-----+

Finished


In [None]:
# Append mode with selection condition
# Note: complete mode not supported if no aggregation

from pyspark.sql.functions import *

lines = spark\
        .readStream\
        .format('socket')\
        .option('host', 'localhost')\
        .option('port', '9999')\
        .load()
        
# Split the lines into words, retaining timestamps
# split() splits each line into an array, and explode() turns the array into multiple rows
words = lines.select(explode(split(lines.value, ' ')).alias('word'))

long_words = words.filter(length(words['word'])>=3)

# Start running the query 
query = long_words\
        .writeStream\
        .outputMode('update')\#cannot use append mode here
        .format('console')\
        .option('truncate', 'false')\
        .trigger(processingTime='5 seconds') \
        .start()

query.awaitTermination(25)
query.stop()
print("Finished")

In [None]:
rdd = sc.textFile('../data/adj_noun_pairs.txt', numPartitions)

In [None]:
from pyspark.sql.functions import *

lines = spark\
        .readStream\
        .format('socket')\
        .option('host', 'localhost')\
        .option('port', '9999')\
        .option('includeTimestamp', 'true')\#keep timestamp
        .load()
        
# Split the lines into words, retaining timestamps
# split() splits each line into an array, and explode() turns the array into multiple rows
words = lines.select(explode(split(lines.value, ' ')).alias('word'),
                     lines.timestamp)

windowedCounts = words.groupBy(
    window(words.timestamp, "10 seconds", "5 seconds"),
    words.word)\
    .count()

# Start running the query 
query = windowedCounts\
        .writeStream\
        .outputMode('update')\
        .format('console')\
        .option('truncate', 'false')\
        .trigger(processingTime='5 seconds') \
        .start()

query.awaitTermination(25)
query.stop()
print("Finished")

23/04/25 23:51:25 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.
DataFrame[word: string, timestamp: timestamp, word: string, sentiment: double]


AnalysisException: Reference 'word' is ambiguous, could be: word, word.

In [None]:
# Rate source (for testing) - Generates data at the specified number of rows per second, 
# each output row contains a timestamp and value. Where timestamp is a Timestamp type 
# containing the time of message dispatch, and value is of Long type containing the message count, 
# starting from 0 as the first row. This source is intended for testing and benchmarking.

from pyspark.sql.functions import *

streamingDf = spark\
        .readStream\
        .format('rate')\
        .load()
        
# Start running the query 
query = streamingDf\
        .writeStream\
        .outputMode('append')\
        .format('console')\
        .option('truncate', 'false')\
        .trigger(processingTime='5 seconds') \
        .start()

query.awaitTermination(25)
query.stop()
print("Finished")

In [None]:
# Stream–Static Joins

from pyspark.sql.functions import *

staticDf = spark.createDataFrame([(1, 'apple'), (2, 'orange'), (10, 'banana')], ['id', 'name'])

streamingDf = spark\
        .readStream\
        .format('rate')\
        .load()

# Start running the query 
query = streamingDf.join(staticDf, streamingDf['value'] == staticDf['id'])\
        .writeStream\
        .outputMode('append')\
        .format('console')\
        .option('truncate', 'false')\
        .trigger(processingTime='5 seconds') \
        .start()
# also support leftOuter, but not rightOuter

query.awaitTermination(25)
query.stop()
print("Finished")

In [None]:
# Stream-stream Joins
spark.conf.set('spark.sql.shuffle.partitions', 4) 

from pyspark.sql.functions import *

streamingDf = spark\
        .readStream\
        .format('rate')\
        .load()

streamingDf2 = spark\
        .readStream\
        .format('rate')\
        .option('rowsPerSecond', 2) \
        .load()

# Start running the query 
query = streamingDf.join(streamingDf2, 'value')\
        .writeStream\
        .outputMode('append')\
        .format('console')\
        .option('truncate', 'false')\
        .trigger(processingTime='5 seconds') \
        .start()

query.awaitTermination(25)
query.stop()
print("Finished")

In [None]:
# Stream–stream Joins with watemarking
spark.conf.set('spark.sql.shuffle.partitions', 4) 

from pyspark.sql.functions import *

streamingDf = spark\
        .readStream\
        .format('rate')\
        .load()

streamingDf2 = spark\
        .readStream\
        .format('rate')\
        .option('rowsPerSecond', 2) \
        .load()

# Start running the query 
query = streamingDf.withWatermark('timestamp', '3 seconds').join(streamingDf2.withWatermark('timestamp', '3 seconds'), 'value')\
        .writeStream\
        .outputMode('append')\
        .format('console')\
        .option('truncate', 'false')\
        .trigger(processingTime='5 seconds') \
        .start()

query.awaitTermination(20)
query.stop()
print("Finished")

#The current watermark is computed by looking at the MAX(eventTime) seen across all of the partitions in 
#the query minus a user specified delayThreshold. Due to the cost of coordinating this value across partitions,
#the actual watermark used is only guaranteed to be at least delayThreshold behind the actual event time. 
#In some cases we may still process records that arrive more than delayThreshold late.

In [1]:
from pyspark.sql.functions import *

def parse_line(l):
    x = l.split("\t")
    return (x[0], float(x[1]))

word_sentiments = spark.createDataFrame(sc.textFile("AFINN-111.txt") 
                    .map(parse_line), ['word', 'sentiment']).cache()

lines = spark\
        .readStream\
        .format('socket')\
        .option('host', 'localhost')\
        .option('port', '9999')\
        .option('includeTimestamp', 'true')\
        .load()

# Split the lines into words and explode the array
words = lines.select(explode(split(lines.value, ' ')).alias('word'), lines.timestamp)

# Join the words with their sentiment scores
sentiment_scores = words.join(word_sentiments, words.word == word_sentiments.word)\
                .select(words.word, words.timestamp,word_sentiments.sentiment)

print(sentiment_scores)

# Compute the sentiment score for each window
windowed_sentiment_scores = sentiment_scores.groupBy(window(sentiment_scores.timestamp, '10 seconds', '5 seconds'),\
                                                     sentiment_scores.word)\
                                            .agg(sum(sentiment_scores.sentiment).alias('score'))\
                                            .orderBy('score', ascending=False)

# Output the results in complete mode
query = windowed_sentiment_scores.writeStream\
                                 .outputMode('complete')\
                                 .format('console')\
                                 .start()

query.awaitTermination(25)
query.stop()
print("Finished")

                                                                                

23/04/26 10:30:24 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.
DataFrame[word: string, timestamp: timestamp, sentiment: double]


AnalysisException:  Column window#39 are ambiguous. It's probably because you joined several Datasets together, and some of these Datasets are the same. This column points to one of the Datasets but Spark is unable to figure out which one. Please alias the Datasets with different names via `Dataset.as` before joining them, and specify the column using qualified name, e.g. `df.as("a").join(df.as("b"), $"a.id" > $"b.id")`. You can also set spark.sql.analyzer.failAmbiguousSelfJoin to false to disable this check.        

In [28]:
from pyspark.sql.functions import *

def parse_line(l):
    x = l.split("\t")
    return (x[0], float(x[1]))

word_sentiments = spark.createDataFrame(sc.textFile("AFINN-111.txt") 
                    .map(parse_line), ['word', 'sentiment']).cache()

lines = spark\
        .readStream\
        .format('socket')\
        .option('host', 'localhost')\
        .option('port', '9999')\
        .option('includeTimestamp', 'true')\
        .load()

# Split the lines into words and explode the array
words = lines.select(explode(split(lines.value, ' ')).alias('word'), lines.timestamp)

happy_words = words.groupBy(window(words.timestamp,"10 seconds", "5 seconds"),
                           words.word)\
                           .count()\
                           .join(word_sentiments, words.word == word_sentiments.word)\
                           .select(col("window"), words.word, col("count"),col("sentiment"))\
                           .withColumn('sentiment', col("count")*col("sentiment"))\
                           .orderBy(col("sentiment").desc())

# Output the results in complete mode
query = happy_words.writeStream\
                    .outputMode('complete')\
                    .format('console')\
                    .option('truncate','false')\
                    .trigger(processingTime='5 seconds')\
                    .start()

query.awaitTermination(25)
query.stop()
print("Finished")

23/04/26 11:14:56 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.
23/04/26 11:14:56 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/h0/cr6ykp052gngl6yqd4mtpvtm0000gn/T/temporary-f4c45395-daa8-4440-b58a-d7e0f4ba463e. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/04/26 11:14:56 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------+----+-----+---------+
|window|word|count|sentiment|
+------+----+-----+---------+
+------+----+-----+---------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+-------+-----+---------+
|window                                    |word   |count|sentiment|
+------------------------------------------+-------+-----+---------+
|{2023-04-26 11:15:05, 2023-04-26 11:15:15}|happy  |2    |6.0      |
|{2023-04-26 11:15:00, 2023-04-26 11:15:10}|happy  |2    |6.0      |
|{2023-04-26 11:15:05, 2023-04-26 11:15:15}|popular|1    |3.0      |
|{2023-04-26 11:15:00, 2023-04-26 11:15:10}|popular|1    |3.0      |
+------------------------------------------+-------+-----+---------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------+-------+-----+---------+
|window                                    |word   |count|sentiment|
+------------------------------------------+-------+-----+---------+
|{2023-04-26 11:15:05, 2023-04-26 11:15:15}|happy  |3    |9.0      |
|{2023-04-26 11:15:00, 2023-04-26 11:15:10}|happy  |2    |6.0      |
|{2023-04-26 11:15:05, 2023-04-26 11:15:15}|popular|1    |3.0      |
|{2023-04-26 11:15:00, 2023-04-26 11:15:10}|popular|1    |3.0      |
|{2023-04-26 11:15:10, 2023-04-26 11:15:20}|happy  |1    |3.0      |
+------------------------------------------+-------+-----+---------+

Finished
23/04/26 12:09:58 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1410444 ms exceeds timeout 120000 ms
23/04/26 12:09:58 WARN SparkContext: Killing executors is not supported by current scheduler.


In [15]:
# Find the most positive words in windows of 5 seconds from streaming data

from pyspark.streaming import StreamingContext

def parse_line(l):
    x = l.split("\t")
    return (x[0], float(x[1]))

word_sentiments = sc.textFile("AFINN-111.txt") \
                    .map(parse_line).cache()
    
ssc = StreamingContext(sc, 5)
rdd = sc.textFile('adj_noun_pairs.txt', 8)
rddQueue = rdd.randomSplit([1,1,1,1,1], 123)
lines = ssc.queueStream(rddQueue)

#do word count on Dstream
word_counts = lines.flatMap(lambda line: line.split(" ")) \
                   .map(lambda word: (word, 1)) \
                   .reduceByKey(lambda a, b: a + b)

# Determine the words with the highest sentiment values by joining the streaming RDD
# with the static RDD inside the transform() method and then multiplying
# the frequency of the words by its sentiment value
happiest_words = word_counts.transform(lambda rdd: word_sentiments.join(rdd)) \
                            .map(lambda t:
                                 (t[1][0] * t[1][1], t[0])) \
                            .transform(lambda rdd: rdd.sortByKey(False))

happiest_words.pprint()

ssc.start()
ssc.awaitTermination(25)
ssc.stop(False)
print("Finished")

                                                                                

-------------------------------------------
Time: 2023-04-26 10:51:25
-------------------------------------------
(7890.0, 'great')
(6180.0, 'popular')
(5544.0, 'best')
(4662.0, 'good')
(4242.0, 'important')
(2340.0, 'strong')
(2322.0, 'greater')
(2058.0, 'successful')
(1850.0, 'novel')
(1790.0, 'natural')
...



                                                                                

-------------------------------------------
Time: 2023-04-26 10:51:30
-------------------------------------------
(7578.0, 'great')
(6126.0, 'popular')
(5604.0, 'best')
(4749.0, 'good')
(4172.0, 'important')
(2253.0, 'greater')
(2252.0, 'strong')
(2001.0, 'successful')
(1948.0, 'novel')
(1804.0, 'natural')
...



                                                                                

-------------------------------------------
Time: 2023-04-26 10:51:35
-------------------------------------------
(7893.0, 'great')
(6009.0, 'popular')
(5574.0, 'best')
(4677.0, 'good')
(4298.0, 'important')
(2326.0, 'strong')
(2172.0, 'greater')
(1944.0, 'successful')
(1868.0, 'novel')
(1761.0, 'natural')
...



                                                                                

-------------------------------------------
Time: 2023-04-26 10:51:40
-------------------------------------------
(7776.0, 'great')
(5925.0, 'popular')
(5538.0, 'best')
(4671.0, 'good')
(4156.0, 'important')
(2362.0, 'strong')
(2205.0, 'greater')
(1932.0, 'successful')
(1892.0, 'novel')
(1803.0, 'natural')
...



                                                                                

-------------------------------------------
Time: 2023-04-26 10:51:45
-------------------------------------------
(7566.0, 'great')
(5916.0, 'popular')
(5334.0, 'best')
(4548.0, 'good')
(4268.0, 'important')
(2415.0, 'greater')
(2314.0, 'strong')
(2109.0, 'successful')
(2018.0, 'novel')
(1821.0, 'natural')
...

Finished


In [None]:
word_sentiments = spark.createDataFrame(sc.textFile("../data/AFINN-111.txt") 
                    .map(parse_line), ['word', 'sentiment']).cache()
