<div style="font-size:18pt; padding-top:20px; text-align:center"> Обработка потоковых данных с <b>Spark Streaming</b></div><hr>
<div style="text-align:right;">Папулин С.Ю. <span style="font-style: italic;font-weight: bold;">(papulin.study@mail.ru)</span></div>

<p>Подключение стилей для оформления тетради</p>

In [None]:
# Load the notebook stylesheet. HTML is imported from the public
# IPython.display module rather than the internal IPython.core.display path.
from IPython.display import HTML

# The bare expression must stay last so the notebook renders the <link> tag.
HTML("<link rel='stylesheet' href='css/style.css'>")

Stateless transformation

In [None]:
# -*- coding: utf-8 -*-

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Job settings collected in one place
BATCH_INTERVAL_SEC = 10
SOURCE_HOST = "localhost"
SOURCE_PORT = 9999
OUTPUT_PREFIX = "file:///home/cloudera/workspace/spark-streaming/output/wordCount"

# Spark entry point, registered under the application name "WordCount"
sc = SparkContext(appName="WordCount")

# Uncomment for more detailed driver logs
#sc.setLogLevel("INFO")

# Streaming context: the input is cut into batches of BATCH_INTERVAL_SEC seconds
ssc = StreamingContext(sc, BATCH_INTERVAL_SEC)

# Stream of text lines read from a TCP socket
lines = ssc.socketTextStream(SOURCE_HOST, SOURCE_PORT)

# ----- Transformations applied to every batch -----
# Split each line on whitespace:
#   ["a a b", "b c"] => ["a", "a", "b", "b", "c"]
words = lines.flatMap(lambda text: text.split())

# Pair every word with an initial count of 1:
#   ["a", "a", "b", "b", "c"] => [("a",1), ("a",1), ("b",1), ("b",1), ("c",1)]
word_tuples = words.map(lambda token: (token, 1))

# Add up the ones per word (counts are per batch, nothing is carried over):
#   [("a",1), ("a",1), ("b",1), ("b",1), ("c",1)] => [("a",2), ("b",2), ("c",1)]
counts = word_tuples.reduceByKey(lambda left, right: left + right)
# --------------------------------------------------

# Collapse every batch into a single partition, then save each batch as text
# under OUTPUT_PREFIX. For a quick console preview, counts.pprint() can be
# used instead of saving.
single_partition_counts = counts.transform(lambda batch_rdd: batch_rdd.coalesce(1))
single_partition_counts.saveAsTextFiles(OUTPUT_PREFIX)

# Start receiving and processing data
ssc.start()

# Block the driver until the streaming job is stopped
ssc.awaitTermination()

In [None]:
# Shell command: start netcat listening (-l) on TCP port 9999 and keep
# listening after a client disconnects (-k). Lines typed here are the input
# for the scripts that read from socketTextStream("localhost", 9999).
nc -lk 9999

In [None]:
# Shell command: submit the script to Spark using a local master with 2 threads.
# NOTE(review): wordCountWindow.py is presumably one of the scripts in this
# notebook saved to a file - confirm which one.
spark2-submit --master local[2] /home/cloudera/workspace/spark-streaming/wordCountWindow.py 

Stateful transformation

In [None]:
# -*- coding: utf-8 -*-

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create Spark Context (application name: "WordCount")
sc = SparkContext(appName="WordCount")

# Uncomment for more detailed driver logs
#sc.setLogLevel("INFO")

# Create Streaming Context with a 10-second batch interval
ssc = StreamingContext(sc, 10)

# Checkpoint directory, needed by the stateful updateStateByKey step used later.
# The path is relative.
# NOTE(review): presumably resolves to /user/cloudera/tmp_spark_streaming1 on HDFS - confirm.
ssc.checkpoint("tmp_spark_streaming1")

# Create a stream of text lines read from a TCP socket
lines = ssc.socketTextStream("localhost", 9999)

# TRANSFORMATION FOR EACH BATCH =========
# Transform: ["a a b", "b c"] => ["a", "a", "b", "b", "c"]
words = lines.flatMap(lambda line: line.split())

# Transform: ["a", "a", "b", "b", "c"] => [("a",1), ("a",1), ("b",1), ("b",1), ("c",1)]
word_tuples = words.map(lambda word: (word, 1))

# Transform: [("a",1), ("a",1), ("b",1), ("b",1), ("c",1)] => [("a",2), ("b",2), ("c",1)]
counts = word_tuples.reduceByKey(lambda x1, x2: x1 + x2)

# =======================================

def updateTotalCount(currentCount, countState):
    """Fold the new counts for a key into its running total.

    Args:
        currentCount: iterable of counts that arrived for the key in the
            current batch (may be empty).
        countState: total accumulated so far, or None when there is no
            previous state for the key.

    Returns:
        The previous total (0 if there was none) plus all new counts.
    """
    previous_total = 0 if countState is None else countState
    return sum(currentCount, previous_total)

# Running total per word across batches, maintained by updateTotalCount
totalCounts = counts.updateStateByKey(updateTotalCount)

# Collapse every batch into a single partition, then save each batch as text
# under the output prefix. For a quick console preview, pprint() on the
# stream can be used instead of saving.
output_prefix = "file:///home/cloudera/workspace/spark-streaming/output/wordCount"
single_partition_totals = totalCounts.transform(lambda batch_rdd: batch_rdd.coalesce(1))
single_partition_totals.saveAsTextFiles(output_prefix)

# Start receiving and processing data
ssc.start()

# Block the driver until the streaming job is stopped
ssc.awaitTermination()

In [None]:
# Window transformation

In [None]:
# -*- coding: utf-8 -*-

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Job settings collected in one place
BATCH_INTERVAL_SEC = 10
WINDOW_LENGTH_SEC = 20
SLIDE_INTERVAL_SEC = 10
SOURCE_HOST = "localhost"
SOURCE_PORT = 9999
CHECKPOINT_DIR = "tmp_spark_streaming1"
OUTPUT_PREFIX = "file:///home/cloudera/workspace/spark-streaming/output/wordCount"

# Spark entry point, registered under the application name "WordCount"
sc = SparkContext(appName="WordCount")

# Uncomment for more detailed driver logs
#sc.setLogLevel("INFO")

# Streaming context: the input is cut into batches of BATCH_INTERVAL_SEC seconds
ssc = StreamingContext(sc, BATCH_INTERVAL_SEC)

# Checkpoint directory (relative path).
# NOTE(review): presumably resolves to /user/cloudera/tmp_spark_streaming1 on HDFS - confirm.
ssc.checkpoint(CHECKPOINT_DIR)

# Stream of text lines read from a TCP socket
lines = ssc.socketTextStream(SOURCE_HOST, SOURCE_PORT)

# ----- Transformations applied to every batch -----
# Split each line on whitespace:
#   ["a a b", "b c"] => ["a", "a", "b", "b", "c"]
words = lines.flatMap(lambda text: text.split())

# Pair every word with an initial count of 1:
#   ["a", "a", "b", "b", "c"] => [("a",1), ("a",1), ("b",1), ("b",1), ("c",1)]
word_tuples = words.map(lambda token: (token, 1))

# Add up the ones per word within the batch:
#   [("a",1), ("a",1), ("b",1), ("b",1), ("c",1)] => [("a",2), ("b",2), ("c",1)]
counts = word_tuples.reduceByKey(lambda left, right: left + right)
# --------------------------------------------------

# Sum the per-batch counts over a sliding window of WINDOW_LENGTH_SEC seconds,
# recomputed every SLIDE_INTERVAL_SEC seconds. No inverse function is given
# (None), so each window is reduced from scratch.
# Variant with an inverse function that subtracts the batches leaving the window:
#   counts.reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, 20, 10)
windowed_word_counts = counts.reduceByKeyAndWindow(
    lambda acc, value: acc + value,
    None,
    WINDOW_LENGTH_SEC,
    SLIDE_INTERVAL_SEC,
)

# Collapse every batch into a single partition, then save each batch as text
# under OUTPUT_PREFIX. For a quick console preview, pprint() on the stream
# can be used instead of saving.
single_partition_counts = windowed_word_counts.transform(lambda batch_rdd: batch_rdd.coalesce(1))
single_partition_counts.saveAsTextFiles(OUTPUT_PREFIX)

# Start receiving and processing data
ssc.start()

# Block the driver until the streaming job is stopped
ssc.awaitTermination()