In [2]:
from pyspark.sql import SparkSession

# Create SparkSession
spark = SparkSession.builder.getOrCreate()


# Structured Streaming
With structured streaming, you can take the same operations that you perform in batch mode using spark's structured APIs and run them in a streaming fashion. This can reduce latency and allow for incremental processing. The best thing is that it allows you to rapidly and quickly extract value out of streaming systems with virtually no code changes
In our example, we will use a retail dataset, one that has specific dates and times for us to use. We put it in this format to simulate data being produced ina consistent and regular manner by a different process. This is retail data so imagine that these are being produced by retail stores and sent to a location where they will be read by7 our structured streami9ng job

In [3]:
staticDataFrame = spark\
                    .read\
                    .format('csv')\
                    .option('header', 'true')\
                    .option('inferSchema', 'true')\
                    .load('/home/kevin/Desktop/Big-Data-with-Pyspark/data/retail-data/by-day/*.csv')

staticDataFrame.createOrReplaceTempView('retail_data')
staticSchema = staticDataFrame.schema

                                                                                

We will take a look at the sale hours during which a given customer makes a large purchase. The window function will include a data over the time series

In [5]:
staticDataFrame.show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|
|   580538|    21914|BLUE HARMONICA IN...|      24|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22467|   GUMBALL COAT RACK|       6|2011-12-05 08:38:00|     2.55|   14075.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



In [8]:
from pyspark.sql.functions import window, column, desc, col

staticDataFrame\
    .selectExpr(
        'CustomerID',
        '(UnitPrice * Quantity) as total_cost',
        'InvoiceDate'
    )\
    .groupBy(
        col('CustomerId'), window(col('InvoiceDate'), '1 day')
    )\
    .sum('total_cost')\
    .show(5)



+----------+--------------------+-----------------+
|CustomerId|              window|  sum(total_cost)|
+----------+--------------------+-----------------+
|   16057.0|{2011-12-05 03:00...|            -37.6|
|   14126.0|{2011-11-29 03:00...|643.6300000000001|
|   13500.0|{2011-11-16 03:00...|497.9700000000001|
|   17160.0|{2011-11-08 03:00...|516.8499999999999|
|   15608.0|{2011-11-11 03:00...|            122.4|
+----------+--------------------+-----------------+
only showing top 5 rows



                                                                                

In [9]:
spark.conf.set('spark.sql.shuffle.partitions', '5')

Let us look at the streaming code. We will use readstream instead of stream and add FilesPerTrigger option which simply specifies the numbert of files we should read at once

In [10]:
streamingDataFrame = spark.readStream\
                            .schema(staticSchema)\
                            .option('maxFilesPerTrigger', 1)\
                            .format('csv')\
                            .option('header', 'true')\
                            .load('/home/kevin/Desktop/Big-Data-with-Pyspark/data/retail-data/by-day/*.csv')

streamingDataFrame.isStreaming

                                                                                

True

In [13]:
streamingDataFrame\
    .selectExpr(
        'CustomerId',
        '(UnitPrice * Quantity) as total_cost',
        'InvoiceDate'
    )\
    .groupBy(
        col('CustomerId'), window(col('InvoiceDate'), '1 day')
    )\
    .sum('total_cost')

DataFrame[CustomerId: double, window: struct<start:timestamp,end:timestamp>, sum(total_cost): double]

In [14]:
purchaseByCustomer = streamingDataFrame\
                                .selectExpr(
                                    'CustomerId',
                                    '(UnitPrice * quantity) as total_cost',
                                    'InvoiceDate'
                                )\
                                .groupBy(
                                    col('CustomerId'), window(col('InvoiceDate'), '1 day')
                                )\
                                .sum('total_cost')

Streaming actions are a bit different from our conventional static action because we're going to be populating data somewhere instead of just calling something like count. The action we will use will output to an in-memory table that we will update after each trigger. In this case, each trigger is based on an individual file. Spark will muatte the data in the in-memory table such that we will always have the highest value as specified 

In [15]:
purchaseByCustomer.writeStream\
                    .format('memory')\
                    .queryName('customer_purchases')\
                    .outputMode('complete')\
                    .start()

22/10/31 23:21:11 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-e3364ec8-c3f1-44ff-8b76-f5afed63f6bb. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/10/31 23:21:11 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x7f08f4f04f40>

                                                                                

In [None]:
spark.sql("""
SELECT *
FROM customer_purchases
ORDER BY `sum(total_cost)` DESC
"""
).show(5)

+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|   17450.0|{2011-09-20 03:00...|          71601.44|
|      null|{2011-03-29 03:00...| 33521.39999999998|
|   18102.0|{2011-09-15 03:00...|31661.540000000005|
|      null|{2010-12-21 03:00...|31347.479999999938|
|   18102.0|{2010-12-07 03:00...|          25920.37|
+----------+--------------------+------------------+
only showing top 5 rows



                                                                                

22/10/31 23:23:03 WARN FileStreamSource: Listed 305 file(s) in 2513 ms


                                                                                

22/10/31 23:23:08 WARN FileStreamSource: Listed 305 file(s) in 5187 ms


                                                                                

22/10/31 23:23:12 WARN FileStreamSource: Listed 305 file(s) in 4404 ms


                                                                                

22/10/31 23:23:17 WARN FileStreamSource: Listed 305 file(s) in 4901 ms


                                                                                

22/10/31 23:23:20 WARN FileStreamSource: Listed 305 file(s) in 2515 ms


                                                                                

# Machine Learning and Advanced Analytics

In [18]:
from pyspark.sql.functions import date_format, col

preppedDataFrame = staticDataFrame\
                        .na.fill(0)\
                        .withColumn('day_of_week', date_format(col('InvoiceDate'), 'EEEE'))\
                        .coalesce(5)

                                                                                

Let us split the data into train and test splits

In [19]:
trainDataFrame = preppedDataFrame\
                            .where("InvoiceDate < '2011-07-01' ")
testDataFrame = preppedDataFrame\
                            .where("InvoiceDate >= '2011-07-01'")
trainDataFrame.count()
testDataFrame.count()

                                                                                

22/10/31 23:29:54 WARN FileStreamSource: Listed 305 file(s) in 2355 ms




22/10/31 23:29:57 WARN FileStreamSource: Listed 305 file(s) in 2022 ms


                                                                                

296006

                                                                                

In [20]:
trainDataFrame.show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|day_of_week|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|   537226|    22811|SET OF 6 T-LIGHTS...|       6|2010-12-06 08:34:00|     2.95|   15987.0|United Kingdom|     Monday|
|   537226|    21713|CITRONELLA CANDLE...|       8|2010-12-06 08:34:00|      2.1|   15987.0|United Kingdom|     Monday|
|   537226|    22927|GREEN GIANT GARDE...|       2|2010-12-06 08:34:00|     5.95|   15987.0|United Kingdom|     Monday|
|   537226|    20802|SMALL GLASS SUNDA...|       6|2010-12-06 08:34:00|     1.65|   15987.0|United Kingdom|     Monday|
|   537226|    22052|VINTAGE CARAVAN G...|      25|2010-12-06 08:34:00|     0.42|   15987.0|United Kingdom|     Monday|
+---------+---------+-------------------

                                                                                

In [22]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
indexer = StringIndexer()\
                .setInputCol('day_of_week')\
                .setOutputCol('day_of_week_index')


encoder = OneHotEncoder()\
                .setInputCol('day_of_week_index')\
                .setOutputCol('day_of_week_encoded')

# spark takes as input a vector type, which must be set in numerical values
vectorAssembler = VectorAssembler()\
                .setInputCols(['UnitPrice', 'Quantity', 'day_of_week_encoded'])\
                .setOutputCol('features')



                                                                                

Next, we will set this up ina  pipeline so that any future data we need to transform can go through the exact same process

In [23]:
from pyspark.ml import Pipeline
transformationPipeline = Pipeline()\
                            .setStages([indexer, encoder, vectorAssembler])

                                                                                

In [24]:
fittedPipeline = transformationPipeline.fit(trainDataFrame)

transformedPipeline = fittedPipeline.transform(trainDataFrame)

                                                                                

22/10/31 23:49:26 WARN FileStreamSource: Listed 305 file(s) in 3144 ms


                                                                                

                                                                                

In [25]:
spark.stop()

22/10/31 23:52:16 ERROR MicroBatchExecution: Query customer_purchases [id = 75a5878b-e28c-44ad-851c-153dc6eeca77, runId = 50ae28f7-47b5-4ac6-85c0-18b9f78e209b] terminated with error
org.apache.spark.SparkException: Job 4053 cancelled because SparkContext was shut down
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$cleanUpAfterSchedulerStop$1(DAGScheduler.scala:1188)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$cleanUpAfterSchedulerStop$1$adapted(DAGScheduler.scala:1186)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:1186)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:2887)
	at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
	at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:2784)
	at org.apache.spark.SparkContext.$anonfun$stop$11(SparkContext.scala:2095)
	at org.apache.spark.util.Utils$.tryLogNonFatalErr