# Compute the Average for Pseudo Streaming Data

In [1]:
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark import SparkContext
import time
import math

spark = SparkSession.\
        builder.\
        appName("pyspark-notebook").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "512m").\
        getOrCreate() 

sc =  SparkContext(appName="pyspark-notebook")
ticket_flights = sc.textFile('/data/ticket_flights.csv')

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-notebook, master=spark://spark-master:7077) created by getOrCreate at <ipython-input-1-50d5fe3ce415>:11 

In [2]:
sc = SparkContext.getOrCreate()

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-notebook, master=spark://spark-master:7077) created by getOrCreate at <ipython-input-1-50d5fe3ce415>:11 

## Batch Data Processing from HW3
* In the order of Key: (avg, sum, count)

In [2]:
def map_avg(x):
    split = x.split(",")
    return split[2], float(split[3])

ticket_flights.map(map_avg) \
              .mapValues(lambda x: (x, 1)) \
              .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])) \
              .mapValues(lambda x: (x[0]/x[1], x[0], x[1])).collect()

[('Economy', (15959.813334810322, 14695684400.0, 920793)),
 ('Comfort', (32740.552888786075, 566116900.0, 17291)),
 ('Business', (51143.416138681925, 5505179600.0, 107642))]

## Streaming Data Processing

### Generate Pseudo Streaming Dataset

* Randomly split the original rdd into a list of N_SPLIT rdds as the pseudo input for streaming data.
* Each of the RDD can be considered as a single batch data at certain timestamp in the stream.

In [3]:
# Create the queue through which RDDs can be pushed to a QueueInputDStream
N_SPLIT = 10

queue_rdds = ticket_flights.randomSplit([0.1 for _ in range(N_SPLIT)])

In [9]:
#Kafka consumer
from pyspark.streaming.kafka import KafkaUtils


### Compute and aggregate the average over streaming data
* In the order of Key: (avg, sum, count)

In [3]:
# initial spark streaming context with batch interval for 1 sec
ssc = StreamingContext(sc, 1)
zkQuorum = "192.168.50.232:9092"
topic = "tickets"
kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
# define the function to update the inner state
def updateFunc(new_values, running_tuple):
    '''
        new_values: values in current data
        running_tuple: values in states
    '''
    new_sum = [field[0] for field in new_values]
    new_count = [field[1] for field in new_values]
    running_sum, running_count = running_tuple
    
    return sum(new_sum, running_sum), sum(new_count, running_count)

# checkpoints is required for state operation
ssc.checkpoint("./checkpoints")

# state initialization
initialStateRDD = sc.parallelize([(u'Economy', (0, 0)),
                                  (u'Comfort', (0, 0)),
                                  (u'Business', (0, 0))])

# map and reduce operation for streaming data
inputStream = ssc.queueStream(queue_rdds)
out = inputStream.map(map_avg).mapValues(lambda x: (x, 1)) \
                        .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])) \
                        .updateStateByKey(updateFunc, initialRDD=initialStateRDD) \
                        .mapValues(lambda x: (x[0] / x[1], x[0], x[1]))
# log
out.pprint()

# run streaming for 15 sec.
ssc.start()
time.sleep(15)
ssc.stop(stopSparkContext=True, stopGraceFully=True) 

NameError: name 'sc' is not defined