# Compute the Average of Pseudo Streaming Data

In [1]:
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
import time
import math

# Build (or reuse) the Spark session against the standalone master, with a
# 512 MB memory setting for each executor.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

# The low-level RDD API is reached through the session's SparkContext.
sc = spark.sparkContext

# Raw CSV lines, one string per record.
ticket_flights = sc.textFile('/data/ticket_flights.csv')

## Batch Data Processing from HW3
* Each output record has the form `(key, (avg, count))`.

In [2]:
def batch_pipeline(source):
    """Compute the per-key average and record count over CSV lines.

    Each line is split on commas; the field at index 2 is used as the key
    and the field at index 3 is parsed as a float value.

    Args:
        source: an RDD-like object of comma-separated strings that supports
            ``map``, ``mapValues`` and ``reduceByKey``.

    Returns:
        A dataset of ``(key, (average, count))`` records.
    """
    def to_key_value(line):
        # Split once and pick out the key column and the numeric column.
        fields = line.split(",")
        return fields[2], float(fields[3])

    def add_pairs(left, right):
        # Element-wise addition of two (sum, count) pairs.
        return left[0] + right[0], left[1] + right[1]

    keyed = source.map(to_key_value)
    # Seed every record with a count of 1 so sums and counts reduce together.
    totals = keyed.mapValues(lambda value: (value, 1)).reduceByKey(add_pairs)
    return totals.mapValues(lambda total: (total[0] / total[1], total[1]))

In [3]:
# Run the batch pipeline over the full dataset; these per-key (avg, count)
# values are the reference the streaming result should converge to.
batch_res = batch_pipeline(ticket_flights)
# collect() brings the results back to the driver so the cell displays them.
batch_res.collect()

[('Comfort', (32740.552888786075, 17291)),
 ('Economy', (15959.813334810322, 920793)),
 ('Business', (51143.416138681925, 107642))]

## Streaming Data Processing

### Generate Pseudo Streaming Dataset (Queue of RDDs)

* Randomly split the original RDD into a list of `N_SPLIT` RDDs that serve as the pseudo input for the streaming data.
* Each RDD can be treated as a single batch of data arriving at one timestamp in the stream.

In [4]:
# Build the list of RDDs that will back the queue input stream
# (one RDD per micro-batch).
N_SPLIT = 5
# Fixed seed so the random partitioning, and therefore every intermediate
# streaming output, is the same on each Restart & Run All.
SPLIT_SEED = 42

# Equal weights give N_SPLIT splits of roughly equal size; randomSplit
# normalizes the weights, so only their ratio matters.
queue_rdds = ticket_flights.randomSplit([1.0] * N_SPLIT, seed=SPLIT_SEED)

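* `new_values`: list of `(sum, count)` tuples that arrived for a key in the current batch (may be empty)
* `running_value`: the `(sum, count)` pair accumulated so far for that key, or `None` when the key has no state yet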

In [5]:
def stream_pipeline(source):
    """Maintain a running per-key average and count over a stream of CSV lines.

    Each line is split on commas; the field at index 2 is used as the key
    and the field at index 3 is parsed as a float value. Per-batch
    (sum, count) pairs are folded into per-key state with
    ``updateStateByKey``, and the state is then turned into an average.

    Args:
        source: a DStream-like object of comma-separated strings that
            supports ``map``, ``mapValues``, ``reduceByKey`` and
            ``updateStateByKey``.

    Returns:
        A stream of ``(key, (running_average, running_count))`` records.
    """
    def updateFunc(new_values, running_value):
        # No state yet for this key: start from a zero sum and zero count.
        previous = running_value if running_value else [0, 0]
        batch_sum = sum(pair[0] for pair in new_values)
        batch_count = sum(pair[1] for pair in new_values)
        return [batch_sum + previous[0], batch_count + previous[1]]

    def to_key_value(line):
        # Split once and pick out the key column and the numeric column.
        fields = line.split(",")
        return fields[2], float(fields[3])

    def add_pairs(left, right):
        # Element-wise addition of two (sum, count) pairs.
        return left[0] + right[0], left[1] + right[1]

    # Pre-aggregate inside each batch so the state update sees one pair per key.
    per_batch = (
        source.map(to_key_value)
        .mapValues(lambda value: (value, 1))
        .reduceByKey(add_pairs)
    )
    running = per_batch.updateStateByKey(updateFunc)
    return running.mapValues(lambda state: (state[0] / state[1], state[1]))

### Compute and aggregate the average over streaming data
* Each output record has the form `(key, (avg, count))`.

In [6]:
# Batch interval in seconds; the queue stream hands out one queued RDD per interval.
BATCH_INTERVAL_SEC = 1
# Run long enough for all N_SPLIT queued RDDs to be scheduled.
RUN_SECONDS = N_SPLIT * BATCH_INTERVAL_SEC

# initialize the Spark streaming context with the batch interval
ssc = StreamingContext(sc, BATCH_INTERVAL_SEC)

# checkpointing is required for stateful operations (updateStateByKey)
ssc.checkpoint("./checkpoints")

# init input stream backed by the list of pre-split RDDs
inputStream = ssc.queueStream(queue_rdds)

# call stream pipeline
out = stream_pipeline(inputStream)

# log the running (avg, count) per key after every batch
out.pprint()

# Run the stream for RUN_SECONDS. awaitTerminationOrTimeout is used instead of
# a plain sleep so a failure in the streaming job is raised here, and the
# finally block ensures the contexts are shut down even when that happens.
ssc.start()
try:
    ssc.awaitTerminationOrTimeout(RUN_SECONDS)
finally:
    # NOTE: stopSparkContext=True also stops `sc`; re-run the setup cell
    # before executing any further Spark code.
    ssc.stop(stopSparkContext=True, stopGraceFully=True)

-------------------------------------------
Time: 2021-04-26 20:37:33
-------------------------------------------
('Economy', (15966.371902668094, 183989))
('Comfort', (32836.21082621082, 3510))
('Business', (51092.557056357706, 21470))

-------------------------------------------
Time: 2021-04-26 20:37:34
-------------------------------------------
('Economy', (15953.35282899609, 368629))
('Comfort', (32715.675831294084, 6947))
('Business', (51088.21659007566, 43086))

-------------------------------------------
Time: 2021-04-26 20:37:35
-------------------------------------------
('Economy', (15976.10286393242, 552108))
('Comfort', (32681.605671584595, 10438))
('Business', (51333.27563557685, 64705))

-------------------------------------------
Time: 2021-04-26 20:37:36
-------------------------------------------
('Economy', (15968.0647673737, 736976))
('Comfort', (32722.388275214787, 13851))
('Business', (51302.10567905825, 86053))

-------------------------------------------
Time: 

```
[('Economy', (15959.813334810322, 920793)),
 ('Comfort', (32740.552888786075, 17291)),
 ('Business', (51143.416138681925, 107642))]
```