# Streaming in PySpark

In [13]:
# load libs
import findspark

# store location of spark files
findspark.init('/home/matt/spark-3.0.2-bin-hadoop3.2')

# load libs
import pyspark

# SparkContext is an old school Spark session that allows streaming
# because streaming is new and the session functionality doesn't yet support it
# hence using contexts which are the old RDD code versions
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# create a context session
# contexts need 2 cores (i.e. [2]) to allow 2 threads
# the second part is just the name of the context (just like a session)
sc = SparkContext('local[2]', 'NetworkWordCount')

# create a streaming session
# it inherits the above 2 threaded session/context
# and allows a batch interval of 1 second (i.e. rate of new streaming data)
ssc = StreamingContext(sc, 1)

# create DStream (data stream)
# connect to localhost port: 9999
lines = ssc.socketTextStream('localhost', 9999)

In [14]:
# create a list of words based off whatever text is entered into the above interface/terminal
# lambda/anonymous expressions are very common in streaming, as you need quick, simple functions
words = lines.flatMap(lambda line: line.split(' '))

# for each word, create a tuple of the word and a single count
# when working with RDDs and streams, it's very common to end up with tuples
# the below format is a common format to end up with when streaming data
pairs = words.map(lambda word: (word, 1))

# summarize word counts
# reduceByKey knows to automatically group by keys in the input tuples (i.e. word in the above code)
# it then applies the specified function (here, a simply sum) to all matching keys
# so num1+num2 isn't just occurring once for one match, it's summing all second elements of tuples
# for every matching word in the pairs object above
word_counts = pairs.reduceByKey(lambda num1, num2: num1 + num2)

# print output
word_counts.pprint()

In [11]:
# start session (run this when you've created your terminal session on 9999)
ssc.start()

-------------------------------------------
Time: 2021-03-08 11:30:31
-------------------------------------------

-------------------------------------------
Time: 2021-03-08 11:30:32
-------------------------------------------
('', 1)

-------------------------------------------
Time: 2021-03-08 11:30:33
-------------------------------------------

-------------------------------------------
Time: 2021-03-08 11:30:34
-------------------------------------------

-------------------------------------------
Time: 2021-03-08 11:30:35
-------------------------------------------

-------------------------------------------
Time: 2021-03-08 11:30:36
-------------------------------------------

-------------------------------------------
Time: 2021-03-08 11:30:37
-------------------------------------------

-------------------------------------------
Time: 2021-03-08 11:30:38
-------------------------------------------

-------------------------------------------
Time: 2021-03-08 11:30:39
--

In [12]:
# close context/session once done
sc.stop()

-------------------------------------------
Time: 2021-03-08 11:31:12
-------------------------------------------

