# Compute the Average of Pseudo Streaming Data

In [1]:
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
import time
import math

# Connect to the standalone Spark master, capping each executor at 512 MB.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

# Load the ticket_flights CSV as an RDD of raw text lines.
sc = spark.sparkContext
ticket_flights = sc.textFile('/data/ticket_flights.csv')

## Batch Data Processing from HW3
* Each result has the form `(key, (avg, count))`.

In [2]:
def batch_pipeline(source):
    """Compute the per-key average and record count of comma-separated lines.

    Each line is split on ","; field 2 is used as the grouping key and
    field 3 is parsed as a float amount.

    Args:
        source: RDD-like object of text lines exposing ``map``,
            ``mapValues`` and ``reduceByKey``.

    Returns:
        The transformed dataset of ``(key, (average, count))`` pairs.
    """
    def to_key_amount(line):
        # Split once and pick out the key and amount columns.
        fields = line.split(",")
        return fields[2], float(fields[3])

    def add_pairs(left, right):
        # Element-wise addition of two (sum, count) pairs.
        return left[0] + right[0], left[1] + right[1]

    keyed = source.map(to_key_amount)
    seeded = keyed.mapValues(lambda amount: (amount, 1))
    totals = seeded.reduceByKey(add_pairs)
    return totals.mapValues(lambda total: (total[0] / total[1], total[1]))

In [3]:
# Run the batch pipeline over the full file; the trailing collect() is the
# cell's displayed value: a list of (key, (avg, count)) pairs.
batch_res = batch_pipeline(ticket_flights)
batch_res.collect()

[('Economy', (15959.813334810322, 920793)),
 ('Comfort', (32740.552888786075, 17291)),
 ('Business', (51143.416138681925, 107642))]

## Streaming Data Processing

### Generate Pseudo Streaming Dataset (Queue of RDDs)

* Randomly split the original RDD into a list of `N_SPLIT` RDDs that serve as the pseudo input for the streaming data.
* Each RDD can be considered a single batch of data arriving at a given timestamp in the stream.

In [4]:
# Build the list of RDDs that will be pushed through a QueueInputDStream.
N_SPLIT = 5

# Fixed seed so the random split, and therefore every per-batch output below,
# is repeatable across runs.
SPLIT_SEED = 42

# Equal weights: randomSplit normalises weights that do not sum to 1, so each
# split receives roughly 1/N_SPLIT of the rows.
queue_rdds = ticket_flights.randomSplit([1.0] * N_SPLIT, seed=SPLIT_SEED)

In [5]:
def stream_pipeline(source):
    """Maintain a running per-key average and record count over a stream.

    Each incoming line is split on ","; field 2 is the grouping key and
    field 3 is parsed as a float amount. Every batch is first reduced to one
    ``(avg, count)`` pair per key, which is then merged into the state kept
    by ``updateStateByKey``.

    Args:
        source: DStream-like object of text lines exposing ``map``,
            ``mapValues``, ``reduceByKey`` and ``updateStateByKey``.

    Returns:
        The stateful stream of ``(key, (running_avg, running_count))`` pairs.
    """
    def updateFunc(new_values, running_tuple):
        """Merge this batch's (avg, count) pairs into the running state.

        Args:
            new_values: list of ``(avg, count)`` pairs seen for the key in
                the current batch; empty when the key did not occur.
            running_tuple: previous ``(avg, count)`` state, or ``None`` when
                the key has no state yet.

        Returns:
            The updated ``(avg, count)`` tuple, or ``None`` when there is
            nothing to average (which tells Spark to keep no state).
        """
        if running_tuple:
            running_avg, running_count = running_tuple
        else:
            running_avg, running_count = 0, 0

        if not new_values:
            # Key absent from this batch: hand the state back untouched.
            # Recomputing (avg * count) / count here would let the average
            # drift by rounding error on every idle batch.
            return (running_avg, running_count) if running_count else None

        # Turn each per-batch average back into a sum so batches of
        # different sizes are weighted correctly.
        new_sum = [avg * count for avg, count in new_values]
        new_count = [count for _, count in new_values]

        running_sum = running_avg * running_count
        update_sum = sum(new_sum, running_sum)
        update_count = sum(new_count, running_count)
        if update_count == 0:
            # No records at all: avoid dividing by zero.
            return None
        return update_sum / update_count, update_count

    target = (
        source.map(lambda x: (x.split(",")[2], float(x.split(",")[3])))
        .mapValues(lambda x: (x, 1))
        .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
        .mapValues(lambda x: (x[0] / x[1], x[1]))
        .updateStateByKey(updateFunc)
    )
    return target

### Compute and aggregate the average over streaming data
* Each result has the form `(key, (avg, count))`.

In [6]:
# Batch interval of the streaming context, in seconds.
BATCH_INTERVAL_SEC = 1

# Initialise the Spark streaming context with that batch interval.
ssc = StreamingContext(sc, BATCH_INTERVAL_SEC)

# Checkpointing is required for stateful operations such as updateStateByKey.
ssc.checkpoint("./checkpoints")

# Create the input stream from the pre-split RDDs.
inputStream = ssc.queueStream(queue_rdds)

# Attach the streaming pipeline.
out = stream_pipeline(inputStream)

# Print the running (key, (avg, count)) state for each batch.
out.pprint()

# Run for one batch interval per queued RDD (5 s for 5 splits).
run_seconds = BATCH_INTERVAL_SEC * len(queue_rdds)

ssc.start()
try:
    time.sleep(run_seconds)
finally:
    # Always stop the streaming job, even if the wait is interrupted, so it
    # does not keep running on the cluster.
    ssc.stop(stopSparkContext=True, stopGraceFully=True)

-------------------------------------------
Time: 2021-04-23 18:42:49
-------------------------------------------
('Economy', (16026.36755262944, 184545))
('Comfort', (32702.691980731084, 3529))
('Business', (51113.696660482376, 21560))

-------------------------------------------
Time: 2021-04-23 18:42:50
-------------------------------------------
('Economy', (15985.060594134651, 368435))
('Comfort', (32660.580376382703, 6961))
('Business', (50977.0381416156, 42919))

-------------------------------------------
Time: 2021-04-23 18:42:51
-------------------------------------------
('Economy', (15964.777683903543, 552749))
('Comfort', (32637.568687939845, 10373))
('Business', (51024.595218543254, 64541))

-------------------------------------------
Time: 2021-04-23 18:42:52
-------------------------------------------
('Economy', (15956.222049400712, 736791))
('Comfort', (32696.83722949989, 13817))
('Business', (51076.47892230868, 86110))

-------------------------------------------
Tim

```
[('Economy', (15959.813334810322, 920793)),
 ('Comfort', (32740.552888786075, 17291)),
 ('Business', (51143.416138681925, 107642))]
```