## Spark Streaming exercises.
### These exercises are adapted from spark-streaming/Spark-Streaming.ipynb to suit a Windows 10 environment without JDBC. All data is loaded directly from CSV files or streams.
 

In [4]:
from pyspark import SparkConf, SparkContext, SQLContext
# Build the Spark configuration for this notebook.
# "local[20]" runs Spark on this machine with 20 worker threads.
conf = (
    SparkConf()
    .setMaster("local[20]")
    .setAppName("sample app for reading streaming sources")
    .set("spark.executor.memory", "2g")
)

# Only one SparkContext can be active at a time, so constructing a second one
# when this cell is re-run raises an error. getOrCreate() returns the context
# that is already running, or creates one from `conf` if none exists.
# NOTE: when an existing context is reused, `conf` is not re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)

In [5]:
# Parse a line of weather station data, returning the average wind direction measurement (the Dm field) as a one-element list, or an empty list if the field is absent.
#
import re
def parse(line):
    """Extract the ``Dm`` reading from one line of weather station data.

    Args:
        line: A text record that may contain a ``Dm=<digits>`` field.

    Returns:
        A one-element list holding the integer that follows the first
        ``Dm=`` in ``line``, or an empty list when no such field is found.
        Returning a list (rather than a bare value or ``None``) lets callers
        pass this function to ``flatMap`` so that lines without a reading
        simply contribute nothing.
    """
    # Raw string: in a plain literal "\d" is an invalid escape sequence, which
    # newer Python versions warn about. The regex itself is unchanged.
    match = re.search(r"Dm=(\d+)", line)
    if match is None:
        return []
    return [int(match.group(1))]

Step 3. Import and create streaming context. 

In [6]:
from pyspark.streaming import StreamingContext
# Streaming context on top of the existing SparkContext; incoming data is
# grouped into batches of one second each.
ssc = StreamingContext(sparkContext=sc, batchDuration=1)


Step 4. Create DStream of weather data.

In [7]:
# DStream of text lines read from the weather station's TCP feed.
lines = ssc.socketTextStream(hostname="rtd.hpwren.ucsd.edu", port=12020)

Step 5. Read measurement.

In [8]:
# Turn each raw line into zero or one integer readings; lines for which
# parse() returns an empty list contribute nothing to the resulting stream.
vals = lines.flatMap(f=parse)

Step 6. Create sliding window of data. 

In [9]:
# Sliding window: each windowed RDD combines the last ten seconds of readings,
# and a new window is produced every five seconds.
window = vals.window(windowDuration=10, slideDuration=5)

Step 7. Define and call analysis function

In [10]:
def stats(rdd):
    """Print the contents of ``rdd`` and, if it is non-empty, its max and min.

    Args:
        rdd: An object exposing ``collect()`` that returns the elements as a
            list (here, the RDD for one window of readings).

    Output:
        Prints the collected list, then ``max = ..., min = ...`` when the
        list has at least one element. Nothing is returned.
    """
    # Collect once and compute locally. The values are already brought to the
    # driver for printing, so separate count()/max()/min() actions on the RDD
    # would only repeat the same work as extra jobs.
    values = rdd.collect()
    print(values)
    if values:
        print("max = {}, min = {}".format(max(values), min(values)))

In [11]:
# Run stats() on the RDD produced for every window. stats already takes a
# single rdd argument, so it can be registered directly.
window.foreachRDD(stats)

Step 8. Start the stream processing, then stop it.

In [12]:
# Start the streaming computation set up on ssc. The stop call lives in a
# separate cell so it can be run later, once enough output has been seen.
ssc.start()

In [13]:
# Stop streaming immediately (no graceful drain). This also stops the
# underlying SparkContext, so a new one is needed before streaming again.
ssc.stop(stopSparkContext=True, stopGraceFully=False)