# Imports

In [18]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pathlib import Path

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [2]:
# Root directory of the example datasets, relative to the notebook's working directory.
DATA_PATH = Path('data/')
# IPython shell escape: list what is available under DATA_PATH ({...} interpolates the Python value).
!ls {DATA_PATH}

README.md                    multiclass-classification
activity-data                regression
bike-data                    retail-data
binary-classification        sample_libsvm_data.txt
clustering                   sample_movielens_ratings.txt
deep-learning-images         simple-ml
flight-data                  simple-ml-integers
flight-data-hive             simple-ml-scaling


In [3]:
# Reuse the active SparkSession if there is one; otherwise create it under this app name.
spark = SparkSession.builder.appName('Structured Streaming').getOrCreate()
# Bare expression so the session summary is rendered as the cell output.
spark

# Structured Streaming

In [4]:
# Batch (non-streaming) read of every per-day retail CSV file.
# The first row of each file is treated as the header and column types are inferred.
static_df = (
    spark.read
    .format('csv')
    .options(inferschema='true', header='true')
    .load(str(DATA_PATH / 'retail-data/by-day/*.csv'))
)

In [5]:
# Keep the schema inferred by the batch read; the streaming reader further down
# is given this schema explicitly via .schema(static_schema).
static_schema = static_df.schema
static_schema

StructType(List(StructField(InvoiceNo,StringType,true),StructField(StockCode,StringType,true),StructField(Description,StringType,true),StructField(Quantity,IntegerType,true),StructField(InvoiceDate,StringType,true),StructField(UnitPrice,DoubleType,true),StructField(CustomerID,DoubleType,true),StructField(Country,StringType,true)))

In [6]:
# Expose the batch DataFrame to Spark SQL under the view name 'retail_data'
# (replaces any existing temp view with that name).
static_df.createOrReplaceTempView('retail_data')

In [7]:
# Print the inferred column names, types and nullability as a tree.
static_df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [8]:
# Preview the per-line cost (Quantity * UnitPrice) next to the customer and
# invoice timestamp; show() prints the first rows only.
(static_df
 .selectExpr('CustomerID', '(Quantity * UnitPrice) as total_cost', 'InvoiceDate')
 .show())

+----------+------------------+-------------------+
|CustomerID|        total_cost|        InvoiceDate|
+----------+------------------+-------------------+
|   14075.0|             85.92|2011-12-05 08:38:00|
|   14075.0|              25.0|2011-12-05 08:38:00|
|   14075.0|39.599999999999994|2011-12-05 08:38:00|
|   14075.0|              30.0|2011-12-05 08:38:00|
|   14075.0|15.299999999999999|2011-12-05 08:38:00|
|   14075.0|              40.8|2011-12-05 08:38:00|
|   14075.0|              39.6|2011-12-05 08:38:00|
|   14075.0|             40.56|2011-12-05 08:38:00|
|   18180.0|              17.0|2011-12-05 08:39:00|
|   18180.0|              17.0|2011-12-05 08:39:00|
|   18180.0|              19.8|2011-12-05 08:39:00|
|   18180.0|14.850000000000001|2011-12-05 08:39:00|
|   18180.0|              15.6|2011-12-05 08:39:00|
|   18180.0|              15.6|2011-12-05 08:39:00|
|   18180.0|              15.0|2011-12-05 08:39:00|
|   18180.0| 9.899999999999999|2011-12-05 08:39:00|
|   18180.0|

In [9]:
# Batch version of the aggregation that is streamed later on:
# total spend per customer within each 1-day window over InvoiceDate.
(
    static_df
    .selectExpr('CustomerID', '(Quantity * UnitPrice) as total_cost', 'InvoiceDate')
    .groupBy('CustomerID', F.window(F.col('InvoiceDate'), '1 day'))
    .agg(F.sum('total_cost'))  # result column is still named sum(total_cost)
    .show(5)
)

+----------+--------------------+-----------------+
|CustomerID|              window|  sum(total_cost)|
+----------+--------------------+-----------------+
|   16057.0|[2011-12-04 18:00...|            -37.6|
|   14126.0|[2011-11-28 18:00...|643.6300000000001|
|   13500.0|[2011-11-15 18:00...|497.9700000000001|
|   17160.0|[2011-11-07 18:00...|516.8499999999999|
|   15608.0|[2011-11-10 18:00...|            122.4|
+----------+--------------------+-----------------+
only showing top 5 rows



In [10]:
# Streaming read over the same CSV files as the batch read.
# The schema is supplied explicitly (reusing the one inferred by the batch read),
# and maxFilesPerTrigger=1 makes each trigger pick up a single file.
streaming_df = (
    spark.readStream
    .schema(static_schema)
    .format('csv')
    .options(maxFilesPerTrigger=1, header='true')
    .load(str(DATA_PATH / 'retail-data/by-day/*.csv'))
)
# Confirms that this DataFrame is a streaming one.
streaming_df.isStreaming

True

In [11]:
# Streaming counterpart of the batch aggregation: total spend per customer per window.
# NOTE: despite the `_by_hour` suffix, the window duration used here is '1 day'.
# Nothing runs yet; this only defines the streaming plan.
purchase_by_customer_by_hour = (
    streaming_df
    .selectExpr('CustomerID', '(Quantity * UnitPrice) as total_cost', 'InvoiceDate')
    .groupBy('CustomerID', F.window(F.col('InvoiceDate'), '1 day'))
    .agg(F.sum('total_cost'))  # result column is still named sum(total_cost)
)

In [12]:
# Make the cell re-runnable: Spark refuses to start a second active query with
# the same name, so stop any still-running 'customer_purchases' query first.
for active_query in spark.streams.active:
    if active_query.name == 'customer_purchases':
        active_query.stop()

# Start the streaming aggregation and write it to an in-memory table named
# 'customer_purchases'. 'complete' output mode rewrites the full aggregated
# result on every trigger.
# The StreamingQuery handle is kept (instead of being discarded) so the query
# can later be inspected or stopped, e.g. customer_purchases_query.stop().
customer_purchases_query = (purchase_by_customer_by_hour
                            .writeStream
                            .format('memory')
                            .queryName('customer_purchases')
                            .outputMode('complete')
                            .start())
# Bare expression so the query handle is still shown as the cell output.
customer_purchases_query

<pyspark.sql.streaming.StreamingQuery at 0x7f8fe24e53d0>

In [13]:
# Query the in-memory sink table written by the streaming query.
# ORDER BY 3 sorts by the third column, sum(total_cost), largest first.
# NOTE(review): the stream runs asynchronously, so this presumably reflects only
# the files processed so far and may differ between runs; confirm if exact output matters.
spark.sql("SELECT * FROM customer_purchases ORDER BY 3 DESC").show(5)

+----------+--------------------+------------------+
|CustomerID|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|   12415.0|[2011-03-02 18:00...|          16558.14|
|   15769.0|[2011-03-16 19:00...|           10065.0|
|      null|[2011-03-16 19:00...| 7876.000000000018|
|   12435.0|[2011-03-16 19:00...|3978.9899999999993|
|      null|[2011-03-02 18:00...| 3538.750000000001|
+----------+--------------------+------------------+
only showing top 5 rows



# Machine Learning

In [14]:
# Re-display the batch schema as a reminder of the columns available for feature engineering.
static_df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [15]:
# Prepare the batch data for modelling:
#   * replace nulls in the numeric columns with 0 (string columns are left untouched),
#   * derive the weekday name ('EEEE' pattern, e.g. 'Monday') from InvoiceDate,
#   * reduce the DataFrame to 5 partitions without a shuffle.
prep_df = (
    static_df
    .fillna(0)
    .withColumn('day_of_week', F.date_format(F.col('InvoiceDate'), 'EEEE'))
    .coalesce(5)
)
# Print the physical plan to verify the steps above.
prep_df.explain()

== Physical Plan ==
Coalesce 5
+- *(1) Project [InvoiceNo#16, StockCode#17, Description#18, coalesce(Quantity#19, 0) AS Quantity#397011, InvoiceDate#20, coalesce(nanvl(UnitPrice#21, null), 0.0) AS UnitPrice#397012, coalesce(nanvl(CustomerID#22, null), 0.0) AS CustomerID#397013, Country#23, date_format(cast(InvoiceDate#20 as timestamp), EEEE, Some(America/Chicago)) AS day_of_week#397022]
   +- FileScan csv [InvoiceNo#16,StockCode#17,Description#18,Quantity#19,InvoiceDate#20,UnitPrice#21,CustomerID#22,Country#23] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/Users/imad/Documents/courses/data-engineering/big-data/notebooks/Spark-De..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<InvoiceNo:string,StockCode:string,Description:string,Quantity:int,InvoiceDate:string,UnitP...




In [16]:
# Preview the prepared rows, including the new day_of_week column.
prep_df.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|day_of_week|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|     Monday|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|     Monday|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|     Monday|
|   580538|    21914|BLUE HARMONICA IN...|      24|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|     Monday|
|   580538|    22467|   GUMBALL COAT RACK|       6|2011-12-05 08:38:00|     2.55|   14075.0|United Kingdom|     Monday|
|   580538|    21544|SKULLS  WATER TRA..

In [17]:
# Chronological train/test boundary, defined once so the two filters below
# cannot drift apart (rows strictly before it train, rows on/after it test).
SPLIT_DATE = '2011-07-01'

# InvoiceDate is a 'yyyy-MM-dd HH:mm:ss' string, so this is a string comparison;
# for that zero-padded format string order matches chronological order.
train_df = prep_df.where(F.col('InvoiceDate') < SPLIT_DATE)
test_df = prep_df.where(F.col('InvoiceDate') >= SPLIT_DATE)
# Row counts of both splits (each count triggers a Spark job).
train_df.count(), test_df.count()

(245903, 296006)

In [19]:
# Stage 1: map each weekday name to a numeric index.
indexer = StringIndexer(inputCol='day_of_week', outputCol='day_of_week_index')

# Stage 2: one-hot encode the weekday index into a sparse vector.
encoder = OneHotEncoder(inputCol='day_of_week_index', outputCol='day_of_week_encoded')

# Stage 3: concatenate the numeric columns and the encoded weekday into one 'features' vector.
vector_assembler = VectorAssembler(
    inputCols=['UnitPrice', 'Quantity', 'day_of_week_encoded'],
    outputCol='features',
)

# Chain the three feature transformations so they are fitted and applied together.
tfms_pipeline = Pipeline(stages=[indexer, encoder, vector_assembler])

In [20]:
# Fit the feature pipeline on the training split only, so the test split does not
# influence the fitted stages.
fitted_pipeline = tfms_pipeline.fit(train_df)

In [21]:
# Apply the same fitted pipeline to both splits so they share one feature encoding.
transformed_train_df = fitted_pipeline.transform(train_df)
transformed_test_df = fitted_pipeline.transform(test_df)

In [22]:
# Mark the transformed training data for caching; it is reused by the model fit
# and the evaluation/inspection cells that follow.
transformed_train_df.cache()

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string, day_of_week: string, day_of_week_index: double, day_of_week_encoded: vector, features: vector]

In [23]:
# Preview the transformed training rows, including the index, one-hot and features columns.
transformed_train_df.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+-----------------+-------------------+--------------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|day_of_week|day_of_week_index|day_of_week_encoded|            features|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+-----------------+-------------------+--------------------+
|   537226|    22811|SET OF 6 T-LIGHTS...|       6|2010-12-06 08:34:00|     2.95|   15987.0|United Kingdom|     Monday|              2.0|      (5,[2],[1.0])|(7,[0,1,4],[2.95,...|
|   537226|    21713|CITRONELLA CANDLE...|       8|2010-12-06 08:34:00|      2.1|   15987.0|United Kingdom|     Monday|              2.0|      (5,[2],[1.0])|(7,[0,1,4],[2.1,8...|
|   537226|    22927|GREEN GIANT GARDE...|       2|2010-12-06 08:34:00|     5.95|   15987.0|United Kingdo

In [24]:
# K-means estimator with 20 clusters; the fixed seed makes initialisation repeatable.
kmeans = KMeans(k=20, seed=1)

In [25]:
# Fit the clustering model on the transformed training data.
# NOTE(review): no input column is set here, so this presumably relies on the
# estimator's default of reading the 'features' column — confirm.
kmeans_model = kmeans.fit(transformed_train_df)

In [26]:
# Pull a single Row to the driver to inspect the exact values of the engineered columns.
transformed_train_df.take(1)

[Row(InvoiceNo='537226', StockCode='22811', Description='SET OF 6 T-LIGHTS CACTI ', Quantity=6, InvoiceDate='2010-12-06 08:34:00', UnitPrice=2.95, CustomerID=15987.0, Country='United Kingdom', day_of_week='Monday', day_of_week_index=2.0, day_of_week_encoded=SparseVector(5, {2: 1.0}), features=SparseVector(7, {0: 2.95, 1: 6.0, 4: 1.0}))]

In [27]:
# Score the clustering on the training data with a default-configured evaluator.
# NOTE(review): the default metric is presumably silhouette — confirm for this Spark version.
(ClusteringEvaluator()
 .evaluate(kmeans_model.transform(transformed_train_df)))

0.6842576726028763

In [28]:
# Same evaluation on the held-out test data, for comparison with the training score.
(ClusteringEvaluator()
 .evaluate(kmeans_model.transform(transformed_test_df)))

0.5427938390491535

In [29]:
# List the fitted cluster centers. Each center has one entry per element of the
# 'features' vector, which was assembled from UnitPrice, Quantity and the
# one-hot encoded weekday, in that order.
kmeans_model.clusterCenters()

[array([4.09293606, 2.73959977, 0.18896861, 0.19629835, 0.18589279,
        0.16698473, 0.14731972]),
 array([1.0400e+00, 7.4215e+04, 0.0000e+00, 1.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00]),
 array([ 1.0400e+00, -7.4215e+04,  0.0000e+00,  1.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00]),
 array([ 3.897e+04, -1.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
         0.000e+00,  1.000e+00]),
 array([ 1.6670865e+04, -1.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  1.0000000e+00,  0.0000000e+00]),
 array([ 7.5000e-03, -9.4045e+03,  2.5000e-01,  7.5000e-01,  0.0000e+00,
         0.0000e+00,  0.0000e+00]),
 array([ 7.385808e+03, -6.000000e-01,  0.000000e+00,  8.000000e-01,
         2.000000e-01,  0.000000e+00,  0.000000e+00]),
 array([ 1.94092118e+03, -1.76470588e-01,  5.88235294e-02,  1.76470588e-01,
         4.11764706e-01,  0.00000000e+00,  3.52941176e-01]),
 array([8.407500e-01, 1.213475e+03, 2.125000e-01, 2.500000e-01,
        1.125000e-01, 

In [30]:
# Count training rows per weekday to see which day_of_week categories
# the indexer/encoder were fitted on.
(train_df
 .groupBy('day_of_week')
 .count()
 .show())

+-----------+-----+
|day_of_week|count|
+-----------+-----+
|  Wednesday|42638|
|    Tuesday|47974|
|     Friday|36718|
|   Thursday|48348|
|     Monday|43900|
|     Sunday|26325|
+-----------+-----+



In [31]:
# Attach each row's assigned cluster (the 'prediction' column) and preview three rows.
(kmeans_model
 .transform(transformed_train_df)
 .show(3))

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+-----------------+-------------------+--------------------+----------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|day_of_week|day_of_week_index|day_of_week_encoded|            features|prediction|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+-----------------+-------------------+--------------------+----------+
|   537226|    22811|SET OF 6 T-LIGHTS...|       6|2010-12-06 08:34:00|     2.95|   15987.0|United Kingdom|     Monday|              2.0|      (5,[2],[1.0])|(7,[0,1,4],[2.95,...|         0|
|   537226|    21713|CITRONELLA CANDLE...|       8|2010-12-06 08:34:00|      2.1|   15987.0|United Kingdom|     Monday|              2.0|      (5,[2],[1.0])|(7,[0,1,4],[2.1,8...|         0|
|   537226|    22927|GREEN GIANT GARDE...|       2