# Select the Top N Words by Count

In [1]:
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
import time
import math

# Build (or reuse) a session against the standalone Spark master, with
# 512 MB of memory requested per executor.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

# Input text file loaded as an RDD; it is the shared source for both the
# batch and the streaming examples in this notebook.
sc = spark.sparkContext
topn_rdds = sc.textFile('/data/word_count_test_v1.txt')

# Batch data processing

In [2]:
# Peek at the first two records of the input RDD as a sanity check
topn_rdds.take(2)

['Hello', 'Hello']

In [3]:
def batch_pipeline(source, top_n=3):
    """Return the ``top_n`` most frequent words in ``source``.

    Parameters
    ----------
    source
        RDD-like object whose records are strings (lines of text). Each
        line is split on a single space, so consecutive spaces or blank
        lines produce empty-string "words" that are counted as well.
    top_n : int, optional
        Number of (word, count) pairs to return. Defaults to 3.

    Returns
    -------
    list of (str, int)
        At most ``top_n`` pairs ordered by descending count. Words with
        equal counts come back in whatever order the sort yields.
    """
    target = (
        source.flatMap(lambda line: line.split(" "))
        .map(lambda word: (word, 1))
        .reduceByKey(lambda x, y: x + y)
        .sortBy(lambda pair: pair[1], ascending=False)
        # take() materializes the first ``top_n`` records as a plain list
        .take(top_n)
    )
    return target

In [4]:
# Run the batch pipeline over the whole input and display the result,
# for comparison with the streaming output later in the notebook.
batch_res = batch_pipeline(topn_rdds)
batch_res

[('2', 9), ('9', 7), ('4', 6)]

# Stream Processing

In [5]:
# Create the queue of RDDs that is later handed to ssc.queueStream()
# (a QueueInputDStream): the input is split into N_SPLIT random parts.
N_SPLIT = 5

# Fixed seed so the random split -- and therefore the per-batch streaming
# output -- is the same on every run.
SPLIT_SEED = 42

# Equal weights; randomSplit normalizes them, so only their ratio matters.
queue_rdds = topn_rdds.randomSplit([1.0] * N_SPLIT, seed=SPLIT_SEED)

In [6]:
def stream_pipeline(source, top_n=3):
    """Register a running top-``top_n`` word count on a stream.

    Each batch is split into words on a single space, counted, merged
    into the per-word running total, sorted by descending total, and the
    first ``top_n`` records are printed with ``pprint``.

    Parameters
    ----------
    source
        DStream-like object whose records are strings (lines of text).
    top_n : int, optional
        Number of records printed per batch. Defaults to 3.

    Returns
    -------
    Whatever ``pprint`` returns. ``pprint`` is an output operation, so in
    PySpark this is ``None``; the function is called for its side effect
    of wiring the pipeline onto ``source``.
    """
    def updateFunc(new_values, running_value):
        """Add this batch's counts for a key to its running total."""
        # running_value is falsy when there is no prior state for the key
        # (presumably None on first sight), so fall back to 0.
        return sum(new_values) + (running_value or 0)

    target = (
        source.flatMap(lambda line: line.split(" "))
        .map(lambda word: (word, 1))
        # Pre-aggregate within the batch before touching the state
        .reduceByKey(lambda x, y: x + y)
        .updateStateByKey(updateFunc)
        .transform(lambda rdd: rdd.sortBy(lambda pair: pair[1], ascending=False))
        .pprint(top_n)
    )
    return target

### Compute and aggregate word counts over streaming data

In [7]:
# Streaming parameters
BATCH_INTERVAL_SEC = 1  # micro-batch interval
RUN_DURATION_SEC = 5    # how long to let the stream run before stopping

# Initialize the Spark streaming context with the batch interval above
ssc = StreamingContext(sc, BATCH_INTERVAL_SEC)

# Checkpointing is required for stateful operations (updateStateByKey)
ssc.checkpoint("./checkpoints")

# Create the input stream from the queue of RDDs
inputStream = ssc.queueStream(queue_rdds)

# Wire the streaming pipeline onto the input stream
out = stream_pipeline(inputStream)

# Run the stream for RUN_DURATION_SEC seconds. awaitTerminationOrTimeout
# (unlike a plain sleep) re-raises errors from the streaming job, and the
# finally block makes sure the context is stopped even if the cell is
# interrupted or fails.
try:
    ssc.start()
    ssc.awaitTerminationOrTimeout(RUN_DURATION_SEC)
finally:
    # stopSparkContext=True also stops `sc`, so earlier cells cannot be
    # re-run afterwards without restarting the kernel.
    ssc.stop(stopSparkContext=True, stopGraceFully=True)

-------------------------------------------
Time: 2021-04-27 04:10:27
-------------------------------------------
('Hello', 1)
('1', 1)
('4', 1)
...

-------------------------------------------
Time: 2021-04-27 04:10:28
-------------------------------------------
('2', 2)
('Team', 2)
('5', 2)
...

-------------------------------------------
Time: 2021-04-27 04:10:29
-------------------------------------------
('2', 5)
('9', 3)
('Team', 3)
...

-------------------------------------------
Time: 2021-04-27 04:10:30
-------------------------------------------
('2', 9)
('1', 5)
('4', 5)
...

-------------------------------------------
Time: 2021-04-27 04:10:31
-------------------------------------------
('2', 9)
('9', 7)
('4', 6)
...

-------------------------------------------
Time: 2021-04-27 04:10:32
-------------------------------------------
('2', 9)
('9', 7)
('4', 6)
...

-------------------------------------------
Time: 2021-04-27 04:10:33
-------------------------------------------
