# Introduction to Spark Streaming

## Spark Streaming Initialization Script

In [1]:
# Optional editor tooling: enable the jupyter_tabnine notebook and server
# extensions. None of the Spark code in this notebook refers to them.
# First-time setup (uncomment and run once if the extension is not installed):
# !pip install jupyter-tabnine
# !jupyter nbextension install --py jupyter_tabnine
!jupyter nbextension enable --py jupyter_tabnine
!jupyter serverextension enable --py jupyter_tabnine

Enabling notebook extension jupyter_tabnine/main...
      - Validating: OK
Enabling: jupyter_tabnine
- Writing config: /home/ateeb/.jupyter
    - Validating...
      jupyter_tabnine  OK


In [2]:
#!/usr/bin/env python3
import sys
import os

# Use the Spark installation named by SPARK_HOME, defaulting to /opt/spark
# when the variable is not set.
os.environ.setdefault('SPARK_HOME', '/opt/spark')

# python variable to store the root path for later reference
SPARK_HOME = os.environ['SPARK_HOME']

# Put the pyspark and py4j sources shipped with Spark at the front of the
# import path so that the `pyspark` imports below resolve against SPARK_HOME.
# NOTE(review): the py4j archive name is version specific (0.10.9 here) and
# must match the file actually present under $SPARK_HOME/python/lib.
sys.path.insert(0, os.path.join(SPARK_HOME, "python"))
sys.path.insert(0, os.path.join(SPARK_HOME, "python", "lib"))
sys.path.insert(0, os.path.join(SPARK_HOME, "python", "lib", 'py4j-0.10.9-src.zip'))
sys.path.insert(0, os.path.join(SPARK_HOME, 'python', 'lib', 'pyspark.zip'))

from pyspark import SparkContext
from pyspark import SparkConf

conf = SparkConf()
# The property is spelled `spark.executor.memory`; SparkConf.set accepts any
# key, so a misspelled name is stored without complaint and never takes effect.
conf.set('spark.executor.memory', '1g')
conf.set('spark.cores.max', '2')

# give name to your spark application
conf.setAppName("SparkStreamingApp")

# Run locally with 2 threads for streaming. The master is recorded on `conf`
# itself so that every context built from `conf` (including the ones obtained
# later through SparkContext.getOrCreate(conf=conf)) uses the same setting.
conf.setMaster('local[2]')

# create a spark context object
# getOrCreate returns the already running context when there is one, so this
# cell can be executed more than once without raising a context error.
sc = SparkContext.getOrCreate(conf)

#### Once the script above has been executed, you can view the Spark instance info at http://localhost:4040

### Creating SparkStreaming context from Spark context

In [3]:
from pyspark.streaming import StreamingContext

# Wrap the existing SparkContext in a streaming context that groups the
# incoming data into micro-batches of 2 seconds each.
ssc = StreamingContext(sparkContext=sc, batchDuration=2)

### Streaming with Simple Data

In [4]:
# Source data for the demo stream: six consecutive pairs, [1, 2] through [11, 12].
vc = [[first, first + 1] for first in range(1, 12, 2)]

# Turn every pair into its own single-partition RDD.
dvc = list(map(lambda pair: sc.parallelize(pair, 1), vc))

# Feed the list of RDDs to the streaming context as a queue-backed stream.
input_stream = ssc.queueStream(dvc)

In [5]:
# Callback holding the transformations/actions to perform on each micro-batch.
def get_output(rdd):
    """Print the full contents of one micro-batch RDD as a Python list."""
    batch_items = rdd.collect()
    print(batch_items)

# Register get_output so that it is called with the RDD of every micro-batch
# created from the incoming stream of data.
input_stream.foreachRDD(get_output)

#### Start the streaming, view the console and http://localhost:4040 for info

In [6]:
# Start the streaming computation; the registered callback prints each batch below.
ssc.start()

[1, 2]
[3, 4]
[5, 6]
[7, 8]
[9, 10]
[11, 12]
[]
[]


In [7]:
# Stop the streaming computation started above.
# NOTE(review): StreamingContext.stop() presumably also stops the underlying
# SparkContext by default, which would explain the SparkContext.getOrCreate
# call in the next section — confirm against the PySpark documentation.
ssc.stop()

### Streaming with TCP/IP data

In [8]:
# Obtain a SparkContext (an existing one if available, otherwise one built
# from `conf`) and wrap it in a streaming context whose micro-batch size is
# 3 seconds.
sc = SparkContext.getOrCreate(conf)
ssc = StreamingContext(sparkContext=sc, batchDuration=3)

In [9]:
# Text stream of lines read from the TCP socket at localhost:9000.
lines = ssc.socketTextStream(hostname="localhost", port=9000)

# Word count per micro-batch:
#   1. split every line on single spaces into words,
#   2. turn each word into a (word, 1) key/value pair,
#   3. sum the values per key to get the number of occurrences of each word.
words = lines.flatMap(lambda text: text.split(" "))
pairs = words.map(lambda token: (token, 1))
wordCounts = pairs.reduceByKey(lambda left, right: left + right)

# Show at most the first 5 (word, count) pairs of every batch.
wordCounts.pprint(num=5)

# Counters shared with computeMetrics through `global`.
linesCount = 0  # number of lines in the most recently processed micro-batch
totalLines = 0  # number of lines processed since these counters were reset


def computeMetrics(rdd):
    """Print one micro-batch of lines and update the line counters.

    Args:
        rdd: RDD holding the lines of a single micro-batch.

    Side effects:
        Sets the global ``linesCount`` to the size of this batch, adds that
        size to the global ``totalLines``, and prints the batch contents
        followed by both counters.
    """
    global totalLines
    global linesCount
    # Collect the batch once and derive the count from the collected list, so
    # a single Spark job runs per batch instead of one per count()/collect().
    batch_lines = rdd.collect()
    linesCount = len(batch_lines)
    totalLines += linesCount
    print(batch_lines)
    print("Lines in RDD: ",linesCount," Total Lines: ", totalLines)
    
# Run computeMetrics on the RDD of every micro-batch received on `lines`.
lines.foreachRDD(computeMetrics)

# function to act upon the windowed RDD
def windowMetrics(rdd):
    """Print the number of elements in a windowed RDD, then its contents.

    Args:
        rdd: RDD holding every element that falls inside the current window.
    """
    # Collect once and take the size from the collected list, so only one
    # Spark job runs per window instead of a count() followed by a collect().
    window_lines = rdd.collect()
    print("Window RDD Size: ",len(window_lines))
    print(window_lines)

# Build a sliding window over `lines`: every windowed RDD spans 6 seconds
# (i.e. two 3-second batches) and a new one is produced every 3 seconds.
# Signature: lines.window(window size, sliding interval)
windowedRDD = lines.window(windowDuration=6, slideDuration=3)
# Invoke windowMetrics each time the window slides.
windowedRDD.foreachRDD(windowMetrics)

#### Starting the stream

In [10]:
# Start the socket stream; word counts, per-batch metrics and window metrics
# are printed below as batches arrive from localhost:9000.
ssc.start()

-------------------------------------------
Time: 2021-04-05 16:25:27
-------------------------------------------
('i', 1)
('thought', 1)
('was', 1)
('do', 1)
('like', 1)
...

['i thought da vinci code was great', 'I do like Angels and Demons more then The Da Vinci Code.']
Lines in RDD:  2  Total Lines:  2
Window RDD Size:  2
['i thought da vinci code was great', 'I do like Angels and Demons more then The Da Vinci Code.']
-------------------------------------------
Time: 2021-04-05 16:25:30
-------------------------------------------
('liked', 1)
('Da', 1)
('but', 1)
('the', 1)
('seem', 1)
...

["I liked the Da Vinci Code but it ultimatly didn't seem to hold it's own."]
Lines in RDD:  1  Total Lines:  3
Window RDD Size:  3
['i thought da vinci code was great', 'I do like Angels and Demons more then The Da Vinci Code.', "I liked the Da Vinci Code but it ultimatly didn't seem to hold it's own."]
-------------------------------------------
Time: 2021-04-05 16:25:33
-----------------------

In [11]:
# Stop the streaming computation before inspecting the accumulated totals.
ssc.stop()

In [12]:
# Report the running total that computeMetrics accumulated in `totalLines`.
print("Overall lines :", totalLines)

Overall lines : 15


### Testing with different windowing intervals

In [13]:
# Obtain a SparkContext (an existing one if available, otherwise one built
# from `conf`) and wrap it in a fresh streaming context whose micro-batch size
# is 3 seconds.
sc = SparkContext.getOrCreate(conf)
ssc = StreamingContext(sparkContext=sc, batchDuration=3)

In [14]:
# Text stream of lines read from the TCP socket at localhost:9000.
lines = ssc.socketTextStream("localhost",9000)


# Counters shared with computeMetrics through `global`; reset for this run.
linesCount = 0  # number of lines in the most recently processed micro-batch
totalLines = 0  # number of lines processed since these counters were reset


def computeMetrics(rdd):
    """Print one micro-batch of lines under a separator and update the counters.

    Args:
        rdd: RDD holding the lines of a single micro-batch.

    Side effects:
        Sets the global ``linesCount`` to the size of this batch, adds that
        size to the global ``totalLines``, and prints a separator line, the
        batch contents and both counters.
    """
    global totalLines
    global linesCount
    # Collect the batch once and derive the count from the collected list, so
    # a single Spark job runs per batch instead of one per count()/collect().
    batch_lines = rdd.collect()
    linesCount = len(batch_lines)
    totalLines += linesCount
    # Visual separator between consecutive batches in the cell output.
    print("\n",":"*100)
    print(batch_lines)
    print("Lines in RDD: ",linesCount," Total Lines: ", totalLines)
    
# Run computeMetrics on the RDD of every micro-batch received on `lines`.
lines.foreachRDD(computeMetrics)

# function to act upon the windowed RDD
def windowMetrics(rdd):
    """Print the number of elements in a windowed RDD, then its contents.

    Args:
        rdd: RDD holding every element that falls inside the current window.
    """
    # Collect once and take the size from the collected list, so only one
    # Spark job runs per window instead of a count() followed by a collect().
    window_lines = rdd.collect()
    print("Window RDD Size: ",len(window_lines))
    print(window_lines)

# Build a window over `lines` whose size equals its sliding interval: every
# windowed RDD spans 6 seconds (i.e. two 3-second batches) and a new one is
# produced every 6 seconds, so consecutive windows do not overlap.
# Signature: lines.window(window size, sliding interval)
windowedRDD = lines.window(windowDuration=6, slideDuration=6)
# Invoke windowMetrics each time a new window is produced.
windowedRDD.foreachRDD(windowMetrics)

In [15]:
# Start the stream; per-batch metrics and window metrics are printed below.
ssc.start()


 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
['i loved the da vinci code.']
Lines in RDD:  1  Total Lines:  1

 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
['about how much i hate the da vinci code.']
Lines in RDD:  1  Total Lines:  2
Window RDD Size:  2
['i loved the da vinci code.', 'about how much i hate the da vinci code.']

 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
["RACHEL you could of told me your nans a libarian before i said i hated'The da vinci code '!!", 'I loved The Da Vinci Code.']
Lines in RDD:  2  Total Lines:  4

 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
["The Da Vinci Code's backtory on various religious historical figures and such were interesting at times", 'I HATED Da Vinci Code!', 'Had an interesting conversation with one of the profe

In [16]:
# Stop the streaming computation before inspecting the accumulated totals.
ssc.stop()

In [17]:
# Report the running total that computeMetrics accumulated in `totalLines`.
print("Overall lines :", totalLines)

Overall lines : 14


In [18]:
# Shut down the SparkContext now that the streaming examples are finished.
sc.stop()