In [0]:
%scala
import org.apache.spark.SparkConf
import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.functions.{expr, sum}
import org.apache.spark.sql.expressions.Window

In [0]:
%scala
val sparkConf = new SparkConf().setAppName("ScalaRFMAnalysis")
val spark = SparkSession.builder.config(sparkConf).getOrCreate()

In [0]:
%scala
var retail = spark.read
  .option("inferSchema", "true")
  .option("header", "true")
  .option("sep", ",")
  .csv("/FileStore/tables/jarvis/retail.csv")


In [0]:
%scala
retail.printSchema()

In [0]:
%scala
display(retail)

invoice_no,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01T07:45:00.000+0000,6.95,13085.0,United Kingdom
489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01T07:45:00.000+0000,6.75,13085.0,United Kingdom
489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01T07:45:00.000+0000,6.75,13085.0,United Kingdom
489434,22041,"""RECORD FRAME 7"""" SINGLE SIZE """,48,2009-12-01T07:45:00.000+0000,2.1,13085.0,United Kingdom
489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01T07:45:00.000+0000,1.25,13085.0,United Kingdom
489434,22064,PINK DOUGHNUT TRINKET POT,24,2009-12-01T07:45:00.000+0000,1.65,13085.0,United Kingdom
489434,21871,SAVE THE PLANET MUG,24,2009-12-01T07:45:00.000+0000,1.25,13085.0,United Kingdom
489434,21523,FANCY FONT HOME SWEET HOME DOORMAT,10,2009-12-01T07:45:00.000+0000,5.95,13085.0,United Kingdom
489435,22350,CAT BOWL,12,2009-12-01T07:46:00.000+0000,2.55,13085.0,United Kingdom
489435,22349,"DOG BOWL , CHASING BALL DESIGN",12,2009-12-01T07:46:00.000+0000,3.75,13085.0,United Kingdom


In [0]:
%scala
var rowCount = retail.count()
println(s"Total row count in the 'retail' DataFrame: $rowCount")

In [0]:
%scala
retail = retail.filter(col("customer_id").isNotNull)
rowCount = retail.count()
println(s"Total row count in the 'retail' DataFrame: $rowCount")

# Total Invoice Amount Distribution
### Calculate the invoice amount. 
Note: an invoice consists of one or more items where each item is a row in the df.

In [0]:
%scala
// Calculate the total amount for each item
val retailWithTotalAmount = retail.withColumn("total_amount", expr("quantity * unit_price"))

// Group by invoice number and calculate the total invoice amount
val invoiceAmounts = retailWithTotalAmount
                        .groupBy("invoice_no")
                        .agg(sum("total_amount").alias("invoice_amount"))

// Show the resulting DataFrame
display(invoiceAmounts)


invoice_no,invoice_amount
489677,192.0
C491017,-4.95
491045,303.2
491658,155.05999999999997
C491705,-22.5
C492541,-99.0
C493168,-177.60000000000002
493542,118.75
493977,275.95
C493984,-10.43


In [0]:
%scala
// Create a new column with YYYYMMDD format
retail = retail.withColumn("invoice_month", date_format(col("invoice_date"), "yyyyMM"))
retail = retail.withColumn("invoice_month", to_date(concat(col("invoice_month"), lit("01")), "yyyyMMdd"))

// Identify canceled orders and calculate the number of canceled invoices per month
val canceledOrders = retail
  .filter(col("invoice_no").rlike("C"))
  .groupBy("invoice_month")
  .agg(count("*").alias("canceled_orders"))

// Calculate the number of placed orders per month
val totalOrders = retail
  .groupBy("invoice_month")
  .agg(count("*").alias("total_orders"))

// Calculate the difference between placed and canceled orders to get monthly placed orders
val monthlyPlacedOrders = totalOrders
  .join(canceledOrders, "invoice_month", "left_outer")
  .na.fill(0)
  .withColumn("placed_orders", expr("total_orders - (2 * canceled_orders)"))
  .orderBy("invoice_month")

// Show the resulting DataFrame
display(monthlyPlacedOrders)


invoice_month,total_orders,canceled_orders,placed_orders
2009-12-01,31760,999,29762
2010-01-01,22439,661,21117
2010-02-01,23906,537,22832
2010-03-01,33114,812,31490
2010-04-01,27833,595,26643
2010-05-01,29604,960,27684
2010-06-01,31950,759,30432
2010-07-01,27746,713,26320
2010-08-01,26942,549,25844
2010-09-01,35386,784,33818


Databricks visualization. Run in Databricks to view.

# Monthly Sales
### Calculate the monthly sales data
Plot a chart to show monthly sales (e.g. x-asix=year_month, y-axis=sales_amount)

In [0]:
%scala
// Filter for numeric invoice numbers
val salesDf = retail.filter(col("invoice_no").rlike("^[0-9]+$"))
display(salesDf)
// Calculate invoice amount
val salesDfWithAmount = salesDf.withColumn("invoice_amount", col("quantity") * col("unit_price"))

// Calculate the monthly sales data
val windowSpec = Window.partitionBy("invoice_month")
val monthlySales = salesDfWithAmount
  .groupBy("invoice_month")
  .agg(sum("invoice_amount").alias("sales_amount"))
  .orderBy("invoice_month")

// Show the resulting DataFrame
display(monthlySales)

invoice_month,sales_amount
2009-12-01,686654.1599999949
2010-01-01,557319.0620000134
2010-02-01,506371.06600001536
2010-03-01,699608.9910000064
2010-04-01,594609.1919999977
2010-05-01,599985.7900000075
2010-06-01,639066.5800000058
2010-07-01,591636.7400000112
2010-08-01,604242.6499999989
2010-09-01,831615.0009999905


Databricks visualization. Run in Databricks to view.

# Monthly Sales Growth

In [0]:
%scala
// Filter for numeric invoice numbers
val salesGrowthDf = retail.filter(col("invoice_no").rlike("^[0-9]+$"))

// Calculate invoice amount
val salesGrowthDfWithAmount = salesGrowthDf.withColumn("invoice_amount", col("quantity") * col("unit_price"))

// Calculate the monthly sales data
val windowSpec = Window.orderBy("invoice_month")
val monthlySalesGrowth = salesGrowthDfWithAmount
  .groupBy("invoice_month")
  .agg(sum("invoice_amount").alias("sales_amount"))
  .orderBy("invoice_month")

// Calculate the lagged sales data for the previous month
val monthlySalesGrowthWithLag = monthlySalesGrowth.withColumn("previous_sales", lag("sales_amount", 1).over(windowSpec))

// Calculate the percentage change in sales
val monthlySalesGrowthWithPercentage = monthlySalesGrowthWithLag
  .withColumn("sales_growth", ((col("sales_amount") - col("previous_sales")) / col("previous_sales")) * 100)

// Show the resulting DataFrame
display(monthlySalesGrowthWithPercentage)


invoice_month,sales_amount,previous_sales,sales_growth
2009-12-01,686654.1599999949,,
2010-01-01,557319.0620000134,686654.1599999949,-18.835551509654067
2010-02-01,506371.06600001536,557319.0620000134,-9.141620926649166
2010-03-01,699608.9910000064,506371.06600001536,38.16132831728289
2010-04-01,594609.1919999977,699608.9910000064,-15.008354716814637
2010-05-01,599985.7900000075,594609.1919999977,0.9042238284149832
2010-06-01,639066.5800000058,599985.7900000075,6.513619264215875
2010-07-01,591636.7400000112,639066.5800000058,-7.421736871296599
2010-08-01,604242.6499999989,591636.7400000112,2.130684108628456
2010-09-01,831615.0009999905,604242.6499999989,37.62931183358077


Databricks visualization. Run in Databricks to view.

In [0]:
%scala
// Group by invoice_month and count distinct CustomerID for each month
val monthlyActiveUsers = retail
  .groupBy("invoice_month")
  .agg(countDistinct("customer_id").alias("active_users"))
  .orderBy("invoice_month")

// Show the resulting DataFrame
display(monthlyActiveUsers)

invoice_month,active_users
2009-12-01,1045
2010-01-01,786
2010-02-01,807
2010-03-01,1111
2010-04-01,998
2010-05-01,1062
2010-06-01,1095
2010-07-01,988
2010-08-01,964
2010-09-01,1202


Databricks visualization. Run in Databricks to view.

In [0]:
%scala
// Calculate the first purchase year-month for each customer
val windowSpec = Window.partitionBy("customer_id").orderBy("invoice_month")
val firstPurchaseMonth = retail
  .select("customer_id", "invoice_month")
  .distinct()
  .withColumn("first_purchase_month", min("invoice_month").over(windowSpec))

// Join the first purchase data with the retail data to identify new/existing users
val retailWithFirstPurchase = retail.join(firstPurchaseMonth, Seq("customer_id", "invoice_month"), "left")

// Create a new column to categorize users as "New" or "Existing"
val retailWithUserType = retailWithFirstPurchase.withColumn("user_type", when(col("invoice_month") === col("first_purchase_month"), "New").otherwise("Existing"))

// Group by invoice_month and user_type to count new and existing users for each month
val userTypeCounts = retailWithUserType
  .groupBy("invoice_month", "user_type")
  .agg(countDistinct("customer_id").alias("user_count"))
  .orderBy("invoice_month")

// Show the resulting DataFrame
display(userTypeCounts)

invoice_month,user_type,user_count
2009-12-01,New,1045
2010-01-01,New,394
2010-01-01,Existing,392
2010-02-01,Existing,444
2010-02-01,New,363
2010-03-01,New,436
2010-03-01,Existing,675
2010-04-01,New,291
2010-04-01,Existing,707
2010-05-01,New,254


Databricks visualization. Run in Databricks to view.

In [0]:
%scala
// Calculate Recency, Frequency, and Monetary Value for each customer
val rfmData = retail
  .groupBy("customer_id")
  .agg(
    max("invoice_date").alias("Recency"),
    countDistinct("invoice_no").alias("Frequency"),
    sum("unit_price").alias("MonetaryValue")
  )

// Calculate Recency in days
val maxDate = retail.selectExpr("max(invoice_date)").collect()(0)(0).asInstanceOf[java.sql.Date] // Corrected the cast to Date
val rfmDataWithRecency = rfmData.withColumn("Recency", datediff(current_date(), col("Recency"))) // Used current_date() to get a timestamp

// Create bins or segments for Recency, Frequency, and Monetary Value
val quantiles = rfmDataWithRecency.stat.approxQuantile(Array("Recency", "Frequency", "MonetaryValue"), Array(0.25, 0.5, 0.75), 0.05)
var rfmSegments = rfmDataWithRecency
  .withColumn("RecencyScore",
    when(col("Recency") <= quantiles(0)(0), 4)
      .when(col("Recency") <= quantiles(0)(1), 3)
      .when(col("Recency") <= quantiles(0)(2), 2)
      .otherwise(1)
  )
  .withColumn("FrequencyScore",
    when(col("Frequency") <= quantiles(1)(0), 1)
      .when(col("Frequency") <= quantiles(1)(1), 2)
      .when(col("Frequency") <= quantiles(1)(2), 3)
      .otherwise(4)
  )
  .withColumn("MonetaryScore",
    when(col("MonetaryValue") <= quantiles(2)(0), 4)
      .when(col("MonetaryValue") <= quantiles(2)(1), 3)
      .when(col("MonetaryValue") <= quantiles(2)(2), 2)
      .otherwise(1)
  )
  .withColumn("RFM_Score", concat(col("RecencyScore"), col("FrequencyScore"), col("MonetaryScore")))

// Show the results
display(rfmSegments)


customer_id,Recency,Frequency,MonetaryValue,RecencyScore,FrequencyScore,MonetaryScore,RFM_Score
18051,4931,8,113.36,1,4,3,143
13623,4327,15,1082.4999999999998,3,4,1,341
14832,4927,3,920.01,1,2,1,121
17389,4297,77,2447.380000000001,4,4,1,441
15447,4627,6,121.92,2,3,3,233
15727,4313,15,2474.36,4,4,1,441
17753,4761,5,205.84,1,3,2,132
17679,4349,11,291.41,3,4,2,342
13285,4320,6,539.3000000000001,4,3,1,431
13289,5020,1,70.25,1,1,3,113


In [0]:
%scala
// Define the segmentation mapping
val segMap = Map(
  "^[1-2][1-2]$" -> "Hibernating",
  "^[1-2][3-4]$" -> "At Risk",
  "^[1-2]5$" -> "Can't Lose",
  "^3[1-2]$" -> "About to Sleep",
  "^33$" -> "Need Attention",
  "^[3-4][4-5]$" -> "Loyal Customers",
  "^41$" -> "Promising",
  "^51$" -> "New Customers",
  "^[4-5][2-3]$" -> "Potential Loyalists",
  "^5[4-5]$" -> "Champions"
)

// Create a new 'Segment' column by combining RecencyScore and FrequencyScore
rfmSegments = rfmSegments.withColumn("Segment", concat(col("RecencyScore"), col("FrequencyScore")))

// Replace the Segment values based on the segMap
val updatedRfmSegments = segMap.foldLeft(rfmSegments) { case (df, (pattern, segmentName)) =>
  df.withColumn("Segment", when(col("Segment").rlike(pattern), segmentName).otherwise(col("Segment")))
}

// Group by Segment and calculate mean and count
val segmentedDF = updatedRfmSegments
  .groupBy("Segment")
  .agg(
    mean("Recency").alias("RecencyMean"),
    mean("Frequency").alias("FrequencyMean"),
    mean("MonetaryValue").alias("MonetaryValueMean"),
    count("Segment").alias("CustomerCount")
  )

// Filter out rows with "nan" in the Segment column
val filteredSegmentedDF = segmentedDF.filter(col("Segment").isNotNull)

// Show the results
display(filteredSegmentedDF)

Segment,RecencyMean,FrequencyMean,MonetaryValueMean,CustomerCount
Promising,4310.180616740088,1.5726872246696035,107.21700440528632,227
At Risk,4573.1346765641565,7.837751855779428,610.1436097560971,943
About to Sleep,4350.981818181818,1.8709090909090909,124.72738363636358,550
Hibernating,4710.767249757046,1.6141885325558796,146.07566763848394,2058
Potential Loyalists,4309.021868787277,4.7673956262425445,324.47602385685883,503
Loyal Customers,4321.517241379311,22.22100313479624,1390.2217068965506,1276
Need Attention,4351.698701298701,5.259740259740259,324.56987272727264,385
