## Predicting the number of customers

In [4]:
# Execute the ETL script in this kernel so the names it defines are available below.
# NOTE(review): `spark` is used later without being created in this notebook, so it is
# presumably defined by ETL.py — confirm.
%run "../scripts/ETL.py"

                                                                                

22/09/27 14:05:18 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

In [5]:
# Load the pre-joined table from parquet; every later cell derives its features from `data`.
data = spark.read.parquet("../data/tables/full_join.parquet")

In [6]:
# Preview five rows to check which columns the joined table carries.
data.limit(5)

                                                                                

merchant_name,merchant_abn,categories,take_rate,revenue_levels,name,address,state,gender,trans_merchant_abn,dollar_value,order_id,order_datetime,user_id,consumer_id,postcodes,int_sa2,SA2_code,SA2_name,income_2018-2019,total_males,total_females,total_persons,state_code,state_name,population_2020,population_2021
Dui Nec Corporation,19933438190,tent and awning s...,2.22,c,Brian Wright,761 Bennett Mountain,TAS,Male,19933438190,3.434666831614868,28ccb7e6-ebe7-496...,2021-06-17,2582,743,7016,601021010,601021010,Risdon Vale,58036866,1963,1540,3501,6,Tasmania,3571,3563
Vel Est Tempor LLP,62694031334,"computers, comput...",5.76,a,Brian Wright,761 Bennett Mountain,TAS,Male,62694031334,30.583567099986173,d5902566-aa14-4e4...,2021-05-07,2582,743,7016,601021010,601021010,Risdon Vale,58036866,1963,1540,3501,6,Tasmania,3571,3563
Aliquam Nisl Corp...,55325186693,"gift, card, novel...",4.44,b,Brian Wright,761 Bennett Mountain,TAS,Male,55325186693,68.91163123686927,234aedaa-1657-466...,2021-07-20,2582,743,7016,601021010,601021010,Risdon Vale,58036866,1963,1540,3501,6,Tasmania,3571,3563
Urna Incorporated,78905182348,"watch, clock, and...",4.65,b,Brian Wright,761 Bennett Mountain,TAS,Male,78905182348,15.129648652621713,51d13e41-34a5-496...,2021-06-23,2582,743,7016,601021010,601021010,Risdon Vale,58036866,1963,1540,3501,6,Tasmania,3571,3563
Purus Gravida Sag...,21772962346,florists supplies...,6.63,a,Brian Wright,761 Bennett Mountain,TAS,Male,21772962346,0.7692714085929745,af89a162-38c7-48a...,2021-04-21,2582,743,7016,601021010,601021010,Risdon Vale,58036866,1963,1540,3501,6,Tasmania,3571,3563


In [16]:
# Keep only the merchant identifiers and the order date.
subset = data.select("merchant_name", "merchant_abn", "order_datetime")

In [17]:
# Check the column types of the selected columns.
subset.printSchema()

root
 |-- merchant_name: string (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- order_datetime: date (nullable = true)



### Creating Features

In [32]:
from pyspark.sql import functions as F

# Per merchant and order date, count rows for each value of `state` (one output column
# per state); combinations with no rows become 0 instead of null.
states = (
    data
    .groupBy("merchant_name", "merchant_abn", "order_datetime")
    .pivot("state")
    .count()
    .fillna(0)
)
states.limit(5)

                                                                                

merchant_name,merchant_abn,order_datetime,ACT,NSW,QLD,SA,TAS,VIC,WA
Euismod Urna Inst...,91923722701,2021-10-11,0,11,18,16,4,23,7
Ac Turpis Egestas PC,49758515423,2021-06-12,0,1,1,1,0,1,2
Sed Facilisis Corp.,83177825742,2022-05-07,2,2,3,3,2,2,2
Nunc PC,80701973278,2021-03-29,0,1,1,0,0,1,0
Leo In Consulting,86578477987,2022-10-02,6,92,72,64,18,98,74


In [33]:
# Total number of rows per merchant and order date; this `count` column is the label
# used by the model further down.
# NOTE(review): count() counts rows of `data`; if each row is one order, this is the
# number of orders rather than distinct customers — confirm that is the intended target.
daily_groups = data.groupBy("merchant_name", "merchant_abn", "order_datetime")
total_count = daily_groups.count()
total_count.limit(5)

                                                                                

merchant_name,merchant_abn,order_datetime,count
Nascetur Ridiculu...,82065156333,2021-08-02,75
Pede Suspendisse ...,12034469787,2021-07-30,10
Auctor Quis Corp.,17739089622,2021-09-20,15
Blandit At LLC,11439466003,2022-07-09,33
Fringilla Ornare ...,39473654822,2021-07-19,1


In [34]:
# Per merchant and order date, count rows for each value of `gender` (one output column
# per gender); combinations with no rows become 0 instead of null.
genders = (
    data
    .groupBy("merchant_name", "merchant_abn", "order_datetime")
    .pivot("gender")
    .count()
    .fillna(0)
)
genders.limit(5)

                                                                                

merchant_name,merchant_abn,order_datetime,Female,Male,Undisclosed
Ipsum Dolor Sit C...,80324045558,2021-11-27,272,254,48
Congue Corporation,15612785317,2021-11-08,1,0,0
Euismod Enim LLC,80779820715,2022-05-30,20,22,4
Cubilia Curae Pha...,25856363362,2022-02-06,1,2,0
Feugiat Placerat Ltd,98533840054,2021-08-05,1,0,0


In [35]:
# Suffix the key columns of the two pivot tables (`_st` for states, `_g` for genders) so
# they do not clash with the identically named keys of `total_count` in the join below.
for key_col in ("merchant_name", "merchant_abn", "order_datetime"):
    states = states.withColumnRenamed(key_col, key_col + "_st")
    genders = genders.withColumnRenamed(key_col, key_col + "_g")

In [39]:
# Joining data with features: match each (merchant, date) total with its per-state and
# per-gender breakdown on all three key columns.
state_keys_match = (
    (total_count["merchant_name"] == states["merchant_name_st"])
    & (total_count["merchant_abn"] == states["merchant_abn_st"])
    & (total_count["order_datetime"] == states["order_datetime_st"])
)
gender_keys_match = (
    (total_count["merchant_name"] == genders["merchant_name_g"])
    & (total_count["merchant_abn"] == genders["merchant_abn_g"])
    & (total_count["order_datetime"] == genders["order_datetime_g"])
)

joined = total_count.join(states, state_keys_match).join(genders, gender_keys_match)

In [41]:
# The suffixed key copies were only needed for the join; remove them so each key
# appears once in the daily feature table.
duplicate_key_cols = [
    "merchant_name_st", "merchant_abn_st", "order_datetime_st",
    "merchant_name_g", "merchant_abn_g", "order_datetime_g",
]
agg = joined.drop(*duplicate_key_cols)

agg.limit(5)

                                                                                

merchant_name,merchant_abn,order_datetime,count,ACT,NSW,QLD,SA,TAS,VIC,WA,Female,Male,Undisclosed
A Aliquet Ltd,22578135006,2021-03-30,1,0,0,0,1,0,0,0,1,0,0
A Aliquet Ltd,22578135006,2021-03-31,1,0,0,1,0,0,0,0,1,0,0
A Aliquet Ltd,22578135006,2021-04-07,1,0,0,0,0,0,1,0,0,0,1
A Aliquet Ltd,22578135006,2021-04-27,3,0,0,1,1,0,1,0,1,2,0
A Aliquet Ltd,22578135006,2021-04-29,1,0,0,0,0,0,0,1,0,1,0


### Monthly Aggregation

In [66]:
from pyspark.sql.functions import year, month

# Monthly grain: one row per merchant per calendar month.
month_keys = ["merchant_name", "merchant_abn", "year", "month"]

# Derive the calendar year and month of every row from its order date.
data_month = data.withColumn('year', year(data.order_datetime))
data_month = data_month.withColumn('month', month(data_month.order_datetime))

# Row counts per merchant-month: in total, split by `state`, and split by `gender`.
# fillna(0) turns "no rows for this pivot value" into a zero count instead of null.
total_count_month = data_month.groupBy(*month_keys).count()
states_month = data_month.groupBy(*month_keys).pivot('state').count().fillna(0)
genders_month = data_month.groupBy(*month_keys).pivot('gender').count().fillna(0)

# The three frames share the same key column names, so joining on the list of names
# keeps a single copy of each key. That removes the need to rename the keys before the
# join and drop the copies afterwards. Resulting column order: keys, `count`, the state
# columns, then the gender columns.
agg_month = (
    total_count_month
    .join(states_month, on=month_keys)
    .join(genders_month, on=month_keys)
)

agg_month.limit(20)

                                                                                

merchant_name,merchant_abn,year,month,count,ACT,NSW,QLD,SA,TAS,VIC,WA,Female,Male,Undisclosed
Posuere At Velit PC,99241925348,2021,12,64,1,9,14,5,2,18,15,28,32,4
Non Vestibulum In...,49891706470,2022,8,10754,128,2191,1898,1535,534,2446,2022,4718,4922,1114
Accumsan Sed LLP,91630014920,2022,1,86,3,19,12,13,4,24,11,35,44,7
Ullamcorper Velit...,95508140753,2022,3,306,4,64,43,47,14,76,58,138,129,39
Sodales At LLC,34179569263,2022,7,944,8,216,177,134,34,193,182,404,438,102
Nulla Aliquet LLP,26008308191,2022,5,112,0,28,18,13,5,24,24,50,51,11
Curabitur Massa C...,83620670827,2022,9,563,9,130,82,71,27,135,109,261,239,63
Lorem Donec Found...,39469311070,2021,11,239,3,55,43,39,8,41,50,91,122,26
Leo Elementum Sem...,36196225600,2021,11,847,8,176,152,91,42,199,179,371,389,87
Nibh Donec Incorp...,40555823280,2022,1,551,10,115,111,69,25,123,98,232,247,72


In [67]:
# Number of rows in the monthly feature table.
agg_month.count()

                                                                                

75083

In [68]:
# Check the column types of the monthly feature table.
agg_month.printSchema()

root
 |-- merchant_name: string (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- count: long (nullable = false)
 |-- ACT: long (nullable = true)
 |-- NSW: long (nullable = true)
 |-- QLD: long (nullable = true)
 |-- SA: long (nullable = true)
 |-- TAS: long (nullable = true)
 |-- VIC: long (nullable = true)
 |-- WA: long (nullable = true)
 |-- Female: long (nullable = true)
 |-- Male: long (nullable = true)
 |-- Undisclosed: long (nullable = true)



In [69]:
# Narrow the monthly table to the merchant identifiers, the monthly count and the
# year/month columns.
projection_cols = ["merchant_name", "merchant_abn", "count", "year", "month"]
agg_projection = agg_month.select(*projection_cols)
agg_projection.limit(5)

                                                                                

merchant_name,merchant_abn,count,year,month
Posuere At Velit PC,99241925348,64,2021,12
Non Vestibulum In...,49891706470,10754,2022,8
Accumsan Sed LLP,91630014920,86,2022,1
Ullamcorper Velit...,95508140753,306,2022,3
Sodales At LLC,34179569263,944,2022,7


In [None]:
from pyspark.sql import Window
from pyspark.sql import functions as F

# Lag feature: each merchant's `count` in the immediately preceding calendar month.
# (DataFrame.withColumn needs both a column name and a Column expression; the original
# call passed only the name and would raise a TypeError.)
merchant_months = Window.partitionBy("merchant_name", "merchant_abn").orderBy("year", "month")

# year * 12 + month grows by exactly 1 between consecutive calendar months,
# including December -> January.
month_index = F.col("year") * 12 + F.col("month")
prev_row_is_prev_month = (month_index - F.lag(month_index).over(merchant_months)) == 1

# lag() returns the previous *row* for the merchant, which is an older month whenever
# the merchant has a month with no rows. In that case, and for a merchant's first
# month, `prev_month` is left null rather than filled with a stale value.
agg_lagged = agg_projection.withColumn(
    "prev_month",
    F.when(prev_row_is_prev_month, F.lag("count").over(merchant_months)),
)
agg_lagged.limit(5)

### Modelling

In [46]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import VectorAssembler
import numpy as np
import pandas as pd

# https://goodboychan.github.io/python/datacamp/pyspark/2020/08/11/01-Regression-in-PySpark.html#One-Hot-Encoding

In [49]:
# Encode the merchant as a categorical feature: first map each merchant_name to a
# numeric index, then expand that index into a sparse one-hot vector.
name_indexer = StringIndexer(inputCol='merchant_name', outputCol='merchant_name_idx')
agg1 = name_indexer.fit(agg).transform(agg)

onehot = OneHotEncoder(
    inputCols=['merchant_name_idx'],
    outputCols=['merchant_name_dummy'],
).fit(agg1)
agg2 = onehot.transform(agg1)
agg2.limit(5)

                                                                                

merchant_name,merchant_abn,order_datetime,count,ACT,NSW,QLD,SA,TAS,VIC,WA,Female,Male,Undisclosed,merchant_name_idx,merchant_name_dummy
A Aliquet Ltd,22578135006,2021-03-30,1,0,0,0,1,0,0,0,1,0,0,1367.0,"(4021,[1367],[1.0])"
A Aliquet Ltd,22578135006,2021-03-31,1,0,0,1,0,0,0,0,1,0,0,1367.0,"(4021,[1367],[1.0])"
A Aliquet Ltd,22578135006,2021-04-07,1,0,0,0,0,0,1,0,0,0,1,1367.0,"(4021,[1367],[1.0])"
A Aliquet Ltd,22578135006,2021-04-27,3,0,0,1,1,0,1,0,1,2,0,1367.0,"(4021,[1367],[1.0])"
A Aliquet Ltd,22578135006,2021-04-29,1,0,0,0,0,0,0,1,0,1,0,1367.0,"(4021,[1367],[1.0])"


In [52]:
# Assembling Features Vector
assembler = VectorAssembler(inputCols=['merchant_name_dummy', 'ACT', 'NSW', 'QLD', 'SA', 'TAS', 'VIC', 
                                       'WA', 'Female', 'Male', 'Undisclosed'], outputCol='features')
agg_with_features = assembler.transform(agg2)

agg_with_features.limit(5)

                                                                                

merchant_name,merchant_abn,order_datetime,count,ACT,NSW,QLD,SA,TAS,VIC,WA,Female,Male,Undisclosed,merchant_name_idx,merchant_name_dummy,features
A Aliquet Ltd,22578135006,2021-03-30,1,0,0,0,1,0,0,0,1,0,0,1367.0,"(4021,[1367],[1.0])","(4031,[1367,4024,..."
A Aliquet Ltd,22578135006,2021-03-31,1,0,0,1,0,0,0,0,1,0,0,1367.0,"(4021,[1367],[1.0])","(4031,[1367,4023,..."
A Aliquet Ltd,22578135006,2021-04-07,1,0,0,0,0,0,1,0,0,0,1,1367.0,"(4021,[1367],[1.0])","(4031,[1367,4026,..."
A Aliquet Ltd,22578135006,2021-04-27,3,0,0,1,1,0,1,0,1,2,0,1367.0,"(4021,[1367],[1.0])","(4031,[1367,4023,..."
A Aliquet Ltd,22578135006,2021-04-29,1,0,0,0,0,0,0,1,0,1,0,1367.0,"(4021,[1367],[1.0])","(4031,[1367,4027,..."


In [55]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# 80/20 train/test split. The seed makes the split — and with it the fitted model and
# the reported RMSE — repeatable across runs on the same input.
train, test = agg_with_features.randomSplit([0.8, 0.2], seed=42)

# Create a regression object and train on training data
regression = LinearRegression(featuresCol='features', labelCol='count').fit(train)

# Create predictions for the test data
predictions = regression.transform(test)
predictions.select('count', 'prediction').show(5, False)

# Calculate the RMSE on test data
# NOTE(review): an RMSE close to zero here does not show predictive skill. The feature
# vector contains same-day per-state and per-gender counts, which sum to the label, so
# the model can reconstruct `count` directly. That collinearity would also explain the
# "singular covariance matrix" warning Spark logs during fitting.
RegressionEvaluator(labelCol='count', metricName='rmse').evaluate(predictions)

[Stage 1002:>                                                       (0 + 8) / 9]

22/09/27 20:51:50 WARN Instrumentation: [9b168944] regParam is zero, which might cause numerical instability and overfitting.


                                                                                

22/09/27 20:52:24 WARN Instrumentation: [9b168944] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.


                                                                                

+-----+------------------+
|count|prediction        |
+-----+------------------+
|1    |1.0000028702662689|
|1    |1.00000144595388  |
|1    |0.9999999510487684|
|2    |2.0000019600816743|
|1    |0.9999978940487763|
+-----+------------------+
only showing top 5 rows



                                                                                

5.558028045437251e-06

In [56]:
# Print 20 label/prediction pairs without truncating the values.
predictions.select('count', 'prediction').show(n=20, truncate=False)

[Stage 1097:>                                                       (0 + 1) / 1]

+-----+------------------+
|count|prediction        |
+-----+------------------+
|1    |1.0000028702662689|
|1    |1.00000144595388  |
|1    |0.9999999510487684|
|2    |2.0000019600816743|
|1    |0.9999978940487763|
|1    |0.9999997105109651|
|2    |1.9999969716385613|
|4    |3.999997929301428 |
|1    |1.0000009264518672|
|3    |2.999997414260697 |
|2    |2.0000029343821804|
|13   |12.999990964074936|
|1    |0.9999996172660707|
|4    |3.999996575581761 |
|2    |2.000002794299328 |
|3    |2.999993933861252 |
|4    |4.000000865939076 |
|6    |5.999993027818121 |
|3    |2.999995990861244 |
|4    |3.999992918976616 |
+-----+------------------+
only showing top 20 rows



                                                                                

- create features for states DONE
- create features for total customers DONE
- create features for genders DONE
- create features for month and year DONE
- aggregate by month and year DONE
- offset features for the month before TODO
- create model DONE