In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.appName("Datacamp Pyspark Tutorial").config("spark.memory.offHeap.enabled","true").config("spark.memory.offHeap.size","10g").getOrCreate()

In [2]:
df = spark.read.csv('./dataset/OnlineRetail.csv', header = True)
df.show(10,0)

+---------+---------+-----------------------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate   |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+--------------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |12/1/2010 8:26|2.55     |17850     |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN                |6       |12/1/2010 8:26|3.39     |17850     |United Kingdom|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |12/1/2010 8:26|2.75     |17850     |United Kingdom|
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |12/1/2010 8:26|3.39     |17850     |United Kingdom|
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |12/1/2010 8:26|3.39     |17850     |United Kingdom|
|536365   |22752    |SET 7 BABUSHKA NESTING BOXES       

In [3]:
# Number of rows in df
df.count()

541909

In [4]:
# No of unique customers
df.select('CustomerID').distinct().count() 

4373

In [5]:
# What country do most purchases come from?

df.groupBy('Country').agg(countDistinct('CustomerID').alias('country_count')).orderBy(desc('country_count')).show()

+---------------+-------------+
|        Country|country_count|
+---------------+-------------+
| United Kingdom|         3950|
|        Germany|           95|
|         France|           87|
|          Spain|           31|
|        Belgium|           25|
|    Switzerland|           21|
|       Portugal|           19|
|          Italy|           15|
|        Finland|           12|
|        Austria|           11|
|         Norway|           10|
|        Denmark|            9|
|Channel Islands|            9|
|      Australia|            9|
|    Netherlands|            9|
|         Sweden|            8|
|         Cyprus|            8|
|          Japan|            8|
|         Poland|            6|
|         Greece|            4|
+---------------+-------------+
only showing top 20 rows



In [6]:
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
df = df.withColumn('date',to_timestamp("InvoiceDate", 'MM/dd/yyyy HH:mm'))
df.select(max("date")).show()
df.select(min("date")).show()

+-------------------+
|          max(date)|
+-------------------+
|2011-12-09 12:50:00|
+-------------------+

+-------------------+
|          min(date)|
+-------------------+
|2010-12-01 08:26:00|
+-------------------+



### RFM
RFM is commonly used in marketing to evaluate a client’s value based on their:

- Recency: How recently has each customer made a purchase?
- Frequency: How often have they bought something?
- Monetary Value: How much money do they spend on average when making purchases?


In [7]:
df = df.withColumn("from_date", lit("12/09/2011 12:50"))
df = df.withColumn('from_date',to_timestamp("from_date", 'MM/dd/yyyy HH:mm'))

df2=df.withColumn('from_date',to_timestamp(col('from_date'))).withColumn('recency',col("date").cast("long") - col('from_date').cast("long"))

In [8]:
df.show(5)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+-------------------+-------------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|               date|          from_date|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+-------------------+-------------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|2010-12-01 08:26:00|2011-12-09 12:50:00|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|2010-12-01 08:26:00|2011-12-09 12:50:00|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|2010-12-01 08:26:00|2011-12-09 12:50:00|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|2010-12-01 08:26:00|2011-12-09 12:50:00|

In [9]:
df2.show(5)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+-------------------+-------------------+---------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|               date|          from_date|  recency|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+-------------------+-------------------+---------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|2010-12-01 08:26:00|2011-12-09 12:50:00|-32243040|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|2010-12-01 08:26:00|2011-12-09 12:50:00|-32243040|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|2010-12-01 08:26:00|2011-12-09 12:50:00|-32243040|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     1

In [10]:
df2 = df2.join(df2.groupBy('CustomerID').agg(max('recency').alias('recency')),on='recency',how='leftsemi')

In [11]:
df2.show(5)

+---------+---------+---------+--------------------+--------+--------------+---------+----------+--------------+-------------------+-------------------+
|  recency|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|               date|          from_date|
+---------+---------+---------+--------------------+--------+--------------+---------+----------+--------------+-------------------+-------------------+
|-32237820|   536384|    82484|WOOD BLACK BOARD ...|       3|12/1/2010 9:53|     6.45|     18074|United Kingdom|2010-12-01 09:53:00|2011-12-09 12:50:00|
|-32237820|   536384|    84755|COLOUR GLASS T-LI...|      48|12/1/2010 9:53|     0.65|     18074|United Kingdom|2010-12-01 09:53:00|2011-12-09 12:50:00|
|-32237820|   536384|    22464|HANGING METAL HEA...|      12|12/1/2010 9:53|     1.65|     18074|United Kingdom|2010-12-01 09:53:00|2011-12-09 12:50:00|
|-32237820|   536384|    21324|HANGING MEDINA LA...|       6|12/1/2010 9:53|     2

In [12]:
df2.printSchema()

root
 |-- recency: long (nullable = true)
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- from_date: timestamp (nullable = true)



In [13]:
df_freq = df2.groupBy('CustomerID').agg(count('InvoiceDate').alias('frequency'))

In [14]:
df_freq.show(5,0)

+----------+---------+
|CustomerID|frequency|
+----------+---------+
|17714     |10       |
|17551     |43       |
|13187     |37       |
|15052     |30       |
|17128     |14       |
+----------+---------+
only showing top 5 rows



In [15]:
df3 = df2.join(df_freq,on='CustomerID',how='inner')

In [16]:
df3.printSchema()

root
 |-- CustomerID: string (nullable = true)
 |-- recency: long (nullable = true)
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- from_date: timestamp (nullable = true)
 |-- frequency: long (nullable = false)



In [17]:
m_val = df3.withColumn('TotalAmount',col("Quantity") * col("UnitPrice"))

In [18]:
m_val = m_val.groupBy('CustomerID').agg(sum('TotalAmount').alias('monetary_value'))

In [19]:
finaldf = m_val.join(df3,on='CustomerID',how='inner')

In [20]:
finaldf = finaldf.select(['recency','frequency','monetary_value','CustomerID']).distinct()

In [21]:
finaldf.show(5)

+---------+---------+------------------+----------+
|  recency|frequency|    monetary_value|CustomerID|
+---------+---------+------------------+----------+
|-27656280|       10|             153.0|     17714|
|-31012680|       43|            306.84|     17551|
|-29094780|       37|236.01999999999995|     13187|
|-27550680|       30|            215.78|     15052|
|-28860480|       14|            157.09|     17128|
+---------+---------+------------------+----------+
only showing top 5 rows

