# Compute Word Counts over Pseudo Streaming Data

In [1]:
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
import time
import math

# Build (or reuse) a Spark session attached to the standalone cluster master,
# with the executor memory setting applied through the builder config.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

# Read the input text file into an RDD through the session's SparkContext.
sc = spark.sparkContext
wc_rdds = sc.textFile('/data/word_count_test_v1.txt')

In [2]:
# Bring every element of the input RDD back to the driver and display it.
wc_rdds.collect()

['Hello',
 'Hello',
 'Hello',
 'Hello',
 'Team',
 'Team',
 'Team',
 '1',
 '1',
 '1',
 '1',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '3',
 '3',
 '3',
 '3',
 '3',
 '4',
 '4',
 '4',
 '4',
 '4',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '9',
 '9',
 '9',
 '9',
 '9',
 '9',
 '1',
 '2',
 '4',
 '3',
 '7',
 '8',
 '9',
 '2']

## Batch Data Processing Pipeline
* The result is a list of `(key, count)` pairs.

In [3]:
def batch_pipeline(source):
    """Count how often each space-separated token occurs in ``source``.

    Args:
        source: Object exposing ``flatMap`` whose elements are text lines
            (in this notebook, the RDD produced by ``sc.textFile``).

    Returns:
        The result of ``reduceByKey`` over ``(token, 1)`` pairs, i.e. one
        ``(token, count)`` pair per distinct token.
    """
    # Stage 1: break every line into its single-space-separated tokens.
    tokens = source.flatMap(lambda text: text.split(" "))
    # Stage 2: tag each token with an initial count of one.
    tagged = tokens.map(lambda token: (token, 1))
    # Stage 3: add up the counts that share the same token.
    return tagged.reduceByKey(lambda left, right: left + right)

In [4]:
# Run the batch pipeline over the whole input RDD and display the per-word counts.
batch_res = batch_pipeline(wc_rdds)
batch_res.collect()

[('Hello', 4),
 ('1', 5),
 ('4', 6),
 ('9', 7),
 ('8', 1),
 ('3', 6),
 ('5', 6),
 ('2', 9),
 ('7', 1),
 ('Team', 3)]

## Streaming Data Processing

### Generate Pseudo Streaming Dataset (Queue of RDDs)

* Randomly split the original RDD into a list of `N_SPLIT` RDDs to serve as the pseudo input for streaming.
* Each RDD can be treated as a single batch of data arriving at one timestamp in the stream.

In [5]:
# Build the list of RDDs that will back the queue-based input stream.
N_SPLIT = 5
# Fixed seed so the random split (and therefore the per-batch output) is
# reproducible across Restart & Run All.
SPLIT_SEED = 42

# Equal weights give each split the same expected share of the data;
# randomSplit normalizes the weights, so only their ratio matters.
queue_rdds = wc_rdds.randomSplit([1.0] * N_SPLIT, seed=SPLIT_SEED)

In [6]:
def stream_pipeline(source):
    """Maintain a running word count over a stream of text lines.

    Every line is split on single spaces, each token becomes ``(token, 1)``,
    and the per-token totals are carried from batch to batch through
    ``updateStateByKey``.

    Args:
        source: Object exposing ``flatMap`` whose elements are text lines
            (in this notebook, the stream returned by ``ssc.queueStream``).

    Returns:
        The result of ``updateStateByKey``: the stream of
        ``(token, running_count)`` pairs.
    """
    def updateFunc(new_values, running_value):
        """Add this batch's counts for a key to its previous running total.

        Args:
            new_values: Iterable of the counts seen for the key in this batch.
            running_value: Previous total for the key; any falsy value
                (for example ``None`` or ``0``) is treated as zero.

        Returns:
            The updated running total.
        """
        return sum(new_values) + (running_value or 0)

    return (
        source.flatMap(lambda line: line.split(" "))
        .map(lambda word: (word, 1))
        .updateStateByKey(updateFunc)
    )

### Compute and aggregate word counts over streaming data

In [7]:
# Create the Spark Streaming context with a 1-second batch interval.
BATCH_INTERVAL_SEC = 1
ssc = StreamingContext(sc, BATCH_INTERVAL_SEC)

# Checkpointing is required for stateful operations such as updateStateByKey.
ssc.checkpoint("./checkpoints")

# Create the input stream backed by the list of pre-split RDDs.
inputStream = ssc.queueStream(queue_rdds)

# Attach the running word-count pipeline to the input stream.
out = stream_pipeline(inputStream)

# Print the leading elements of every batch's state to the cell output.
out.pprint()

# Run for one batch interval per queued RDD (5 seconds for the 5 splits).
# The try/finally guarantees the streaming context is stopped even if the
# wait is interrupted (e.g. the cell is cancelled); otherwise it would keep
# running and block a later restart.
ssc.start()
try:
    time.sleep(len(queue_rdds) * BATCH_INTERVAL_SEC)
finally:
    ssc.stop(stopSparkContext=True, stopGraceFully=True)

-------------------------------------------
Time: 2021-04-26 20:39:28
-------------------------------------------
('4', 3)
('9', 2)
('Hello', 1)
('1', 1)
('Team', 1)
('2', 3)
('3', 2)
('5', 2)
('7', 1)

-------------------------------------------
Time: 2021-04-26 20:39:29
-------------------------------------------
('4', 4)
('9', 5)
('Hello', 2)
('1', 1)
('Team', 1)
('2', 6)
('3', 3)
('5', 3)
('7', 1)

-------------------------------------------
Time: 2021-04-26 20:39:30
-------------------------------------------
('4', 5)
('9', 6)
('Hello', 3)
('1', 2)
('Team', 1)
('2', 8)
('3', 4)
('5', 3)
('7', 1)

-------------------------------------------
Time: 2021-04-26 20:39:31
-------------------------------------------
('4', 6)
('9', 6)
('Hello', 3)
('1', 3)
('8', 1)
('Team', 2)
('2', 9)
('3', 4)
('5', 6)
('7', 1)

-------------------------------------------
Time: 2021-04-26 20:39:32
-------------------------------------------
('4', 6)
('9', 7)
('Hello', 4)
('1', 5)
('8', 1)
('Team', 3)
('2'

```
[GT]
[('Hello', 4),
 ('1', 5),
 ('4', 6),
 ('9', 7),
 ('8', 1),
 ('3', 6),
 ('5', 6),
 ('2', 9),
 ('7', 1),
 ('Team', 3)]
```