# Select the Top N Words by Count

In [None]:
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
import time
import math

# Build (or reuse) a Spark session attached to the standalone cluster master,
# with 512 MB of memory per executor.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

# Low-level context used for the RDD and streaming APIs below.
sc = spark.sparkContext

# Source data for both the batch and the streaming examples.
topn_rdds = sc.textFile('/data/test_text.txt')

In [2]:
# Show the data: pull every record of the RDD back to the driver and display
# it. Acceptable here only because the test file is tiny.
topn_rdds.collect()

['Hello',
 'Hello',
 'Hello',
 'Hello',
 'Team',
 'Team',
 'Team',
 '1',
 '1',
 '1',
 '1',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2']

# Batch data processing

In [3]:
# Peek at the first two records of the RDD.
topn_rdds.take(2)

['Hello', 'Hello']

In [4]:
def batch_pipeline(source, n):
    """Return the ``n`` most frequent words found in ``source``.

    Args:
        source: RDD-like collection of text lines (must provide ``flatMap``,
            ``map``, ``reduceByKey``, ``sortBy`` and ``take``).
        n: Maximum number of ``(word, count)`` pairs to return.

    Returns:
        The result of ``take(n)`` on the counted words sorted by count in
        descending order, i.e. at most ``n`` ``(word, count)`` pairs.
    """
    target = (
        # split() without an argument splits on any run of whitespace and
        # yields nothing for blank lines, so the empty string is never
        # counted as a word (split(" ") would emit '' tokens).
        source.flatMap(lambda line: line.split())
        .map(lambda word: (word, 1))
        .reduceByKey(lambda x, y: x + y)
        .sortBy(lambda pair: pair[1], ascending=False)
        .take(n)
    )
    return target

In [5]:
# Compute the top 3 words over the whole dataset in a single batch job.
batch_res = batch_pipeline(topn_rdds,3)
# Bare expression so the notebook displays the result.
batch_res

[('2', 7), ('1', 4), ('Hello', 4)]

# Stream Processing

In [6]:
# Split the source RDD at random into N_SPLIT parts. The resulting list of
# RDDs is the queue that is later pushed into a QueueInputDStream.
# All parts get the same weight, so each has the same expected share of the
# data.
N_SPLIT = 5

queue_rdds = topn_rdds.randomSplit([0.1] * N_SPLIT)

In [7]:
def stream_pipeline(source, n):
    """Build a stream of running word counts, sorted by count descending.

    Args:
        source: DStream-like stream of text lines (must provide ``flatMap``,
            ``map``, ``reduceByKey``, ``updateStateByKey`` and ``transform``).
        n: Currently unused by the pipeline; accepted so the signature matches
            ``batch_pipeline``. The caller limits how many records are shown.

    Returns:
        The transformed stream. Each batch holds ``(word, running_total)``
        pairs for every word seen so far, sorted by total in descending order.
    """
    def updateFunc(new_values, running_value):
        # running_value is falsy (no previous state) the first time a key is
        # seen; treat that as a running total of 0.
        return sum(new_values) + (running_value or 0)

    target = (
        # split() without an argument splits on any run of whitespace and
        # yields nothing for blank lines, so the empty string is never
        # counted as a word (split(" ") would emit '' tokens).
        source.flatMap(lambda line: line.split())
        .map(lambda word: (word, 1))
        # Count words within the current batch ...
        .reduceByKey(lambda x, y: x + y)
        # ... then fold the batch counts into the per-word running total.
        .updateStateByKey(updateFunc)
        .transform(lambda rdd: rdd.sortBy(lambda pair: pair[1], ascending=False))
    )
    return target

### Compute and aggregate word counts over streaming data

In [8]:
# Number of top words to request and to print for every batch.
n = 3

# Streaming context on top of the existing SparkContext, with a batch
# interval of 1 second.
ssc = StreamingContext(sc, 1)

# Stateful operations (updateStateByKey) require a checkpoint directory.
ssc.checkpoint("./checkpoints")

# Input stream backed by the queue of pre-split RDDs.
inputStream = ssc.queueStream(queue_rdds)

# Attach the running word-count pipeline and log the first n records of
# every batch.
out = stream_pipeline(inputStream, n)
out.pprint(n)

# Run the streaming job for 5 seconds, then stop it gracefully, shutting
# down the underlying SparkContext as well.
ssc.start()
time.sleep(5)
ssc.stop(stopSparkContext=True, stopGraceFully=True)

-------------------------------------------
Time: 2021-04-27 01:14:47
-------------------------------------------
('Hello', 2)
('Team', 1)

-------------------------------------------
Time: 2021-04-27 01:14:48
-------------------------------------------
('Hello', 3)
('2', 3)
('Team', 2)

-------------------------------------------
Time: 2021-04-27 01:14:49
-------------------------------------------
('2', 4)
('Hello', 3)
('Team', 2)

-------------------------------------------
Time: 2021-04-27 01:14:50
-------------------------------------------
('2', 5)
('Hello', 4)
('Team', 2)
...

-------------------------------------------
Time: 2021-04-27 01:14:51
-------------------------------------------
('2', 7)
('Hello', 4)
('1', 4)
...

-------------------------------------------
Time: 2021-04-27 01:14:52
-------------------------------------------
('2', 7)
('Hello', 4)
('1', 4)
...

-------------------------------------------
Time: 2021-04-27 01:14:53
---------------------------------------