In [19]:
import pandas as pd
from pyspark.sql import SparkSession, functions as F
import lbl2vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
from pyspark.sql.functions import date_format
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt


In [20]:
# Create a spark session
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "6g")
    .config("spark.executor.memory", "10g")
    .getOrCreate()
)

22/10/05 11:19:03 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [21]:
# Read in data from ETL.py file
%run '../scripts/ETL.py' '../scripts/paths.json'
final_join3.limit(5)

22/10/05 11:19:03 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


                                                                                

merchant_name,merchant_abn,categories,take_rate,revenue_levels,name,address,state,gender,trans_merchant_abn,dollar_value,order_id,order_datetime,user_id,consumer_id,postcodes,int_sa2,SA2_code,SA2_name,income_2018-2019,total_males,total_females,total_persons,state_code,state_name,population_2020,population_2021
Egestas Nunc Asso...,11121775571,digital goods: bo...,6.58,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,11121775571,11.28829564583802,2bd2a61d-72e5-42d...,2021-08-20,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Morbi Accumsan In...,19618998054,tent and aWning s...,1.52,c,Christopher Rodri...,30554 Evans Strea...,NSW,Male,19618998054,62.90176609196828,3582b1f8-4577-403...,2021-05-16,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Eu Dolor Egestas PC,94472466107,"cable, satellite,...",6.23,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,94472466107,172.15375126873164,cb05d49f-c2fa-453...,2021-07-22,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Urna Justo Indust...,31472801314,music shops - mus...,6.56,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,31472801314,0.4894787650356477,aeec15c1-67e8-4cb...,2021-05-18,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Eu Sem Pellentesq...,35424691626,"computers, comput...",3.9,b,Christopher Rodri...,30554 Evans Strea...,NSW,Male,35424691626,7.360217018778133,9df473ba-102d-461...,2021-07-04,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694


In [22]:
final_join3.count()

                                                                                

10540181

In [23]:
tagged_merchants = pd.read_csv("../data/curated/tagged_merchants.csv")
tagged_merchants = tagged_merchants.iloc[:,1:]
tagged_merchants.drop(['tags', 'name', 'cleaned_tags', 'store_type'], axis=1, inplace=True)
tagged_merchants.to_parquet("../data/curated/tagged_merchants.parquet")
tagged_merchants_sdf = spark.read.parquet("../data/curated/tagged_merchants.parquet")

In [24]:
tagged_merchants_sdf = tagged_merchants_sdf.withColumnRenamed('merchant_abn',

    'tagged_merchant_abn'
)

In [25]:
tagged_merchants_sdf.show(5)

+-------------------+--------------------+
|tagged_merchant_abn|            category|
+-------------------+--------------------+
|        10023283211|           Furniture|
|        10142254217|         Electronics|
|        10165489824|        Toys and DIY|
|        10187291046|        Toys and DIY|
|        10192359162|Books, Stationary...|
+-------------------+--------------------+
only showing top 5 rows



In [26]:
final_join3.createOrReplaceTempView("join")
tagged_merchants_sdf.createOrReplaceTempView("tagged")

joint = spark.sql(""" 

SELECT *
FROM join
INNER JOIN tagged
ON join.merchant_abn = tagged.tagged_merchant_abn
""")

joint = joint.drop('tagged_merchant_abn')

In [27]:
joint.count()

                                                                                

10109254

In [28]:
joint.createOrReplaceTempView("group")

a = spark.sql(""" 

SELECT *, ((take_rate/100)*dollar_value) AS BNPL_earnings
FROM group
""")

In [29]:
# Extracting the year, month, day from the timestamp
from pyspark.sql.functions import year, month

a = a.withColumn('Year', year(a.order_datetime))
a = a.withColumn('Month',month(a.order_datetime))


In [30]:
a.show(5)



+--------------------+------------+--------------------+---------+--------------+--------------------+--------------------+-----+------+------------------+------------------+--------------------+--------------+-------+-----------+---------+---------+---------+-------------------+----------------+-----------+-------------+-------------+----------+---------------+---------------+---------------+--------------------+-------------------+----+-----+
|       merchant_name|merchant_abn|          categories|take_rate|revenue_levels|                name|             address|state|gender|trans_merchant_abn|      dollar_value|            order_id|order_datetime|user_id|consumer_id|postcodes|  int_sa2| SA2_code|           SA2_name|income_2018-2019|total_males|total_females|total_persons|state_code|     state_name|population_2020|population_2021|            category|      BNPL_earnings|Year|Month|
+--------------------+------------+--------------------+---------+--------------+--------------------+

                                                                                

In [31]:
a = a.drop('merchant_abn', 'categories','name', 'address', 'trans_merchant_abn', 'order_id','order_datetime','user_id','consumer_id','int_sa2',
'SA2_name','state_code','state_name','population_2020', 'population_2021','BNPL_earnings')

In [32]:
 
# Find Count of Null, None, NaN of All DataFrame Columns
from pyspark.sql.functions import col,isnan, when, count
a.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in a.columns]
   ).show()



+-------------+---------+--------------+-----+------+------------+---------+--------+----------------+-----------+-------------+-------------+--------+----+-----+
|merchant_name|take_rate|revenue_levels|state|gender|dollar_value|postcodes|SA2_code|income_2018-2019|total_males|total_females|total_persons|category|Year|Month|
+-------------+---------+--------------+-----+------+------------+---------+--------+----------------+-----------+-------------+-------------+--------+----+-----+
|            0|        0|             0|    0|     0|           0|        0|       0|               0|          0|            0|            0|       0|   0|    0|
+-------------+---------+--------------+-----+------+------------+---------+--------+----------------+-----------+-------------+-------------+--------+----+-----+



                                                                                

In [33]:
a.printSchema()

root
 |-- merchant_name: string (nullable = true)
 |-- take_rate: double (nullable = true)
 |-- revenue_levels: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- postcodes: string (nullable = true)
 |-- SA2_code: long (nullable = true)
 |-- income_2018-2019: long (nullable = true)
 |-- total_males: long (nullable = true)
 |-- total_females: long (nullable = true)
 |-- total_persons: long (nullable = true)
 |-- category: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)



In [34]:
a.createOrReplaceTempView("agg")

male = spark.sql(""" 

SELECT CONCAT(merchant_name, SA2_code, Year, Month) AS m_name, COUNT(gender) as males
FROM agg
WHERE gender = 'Male'
GROUP BY merchant_name, SA2_code, Year, Month
""")

male.show(5)

female = spark.sql(""" 

SELECT CONCAT(merchant_name, SA2_code, Year, Month) AS f_name, COUNT(gender) as females
FROM agg
WHERE gender = 'Female'
GROUP BY merchant_name, SA2_code, Year, Month
""")
female.show(5)

                                                                                

+--------------------+-----+
|              m_name|males|
+--------------------+-----+
|Semper Tellus PC1...|    2|
|Est Nunc Consulti...|   11|
|Ipsum Primis In I...|    2|
|Euismod In LLC601...|    1|
|Leo In Consulting...|    2|
+--------------------+-----+
only showing top 5 rows





+--------------------+-------+
|              f_name|females|
+--------------------+-------+
|Quis Tristique Lt...|      1|
|Nunc In Industrie...|      2|
|Nunc Sit LLC10902...|      2|
|Leo In Consulting...|      5|
|Risus Donec Assoc...|      1|
+--------------------+-------+
only showing top 5 rows



                                                                                

In [35]:
a.show(2)



+--------------------+---------+--------------+-----+------+-----------------+---------+---------+----------------+-----------+-------------+-------------+--------------------+----+-----+
|       merchant_name|take_rate|revenue_levels|state|gender|     dollar_value|postcodes| SA2_code|income_2018-2019|total_males|total_females|total_persons|            category|Year|Month|
+--------------------+---------+--------------+-----+------+-----------------+---------+---------+----------------+-----------+-------------+-------------+--------------------+----+-----+
|Egestas Nunc Asso...|     6.58|             a|  NSW|  Male|11.28829564583802|     2299|111031231|       242936885|       6412|         6179|        12593|Books, Stationary...|2021|    8|
|Morbi Accumsan In...|     1.52|             c|  NSW|  Male|62.90176609196828|     2299|111031231|       242936885|       6412|         6179|        12593|Books, Stationary...|2021|    5|
+--------------------+---------+--------------+-----+------+

                                                                                

In [37]:
a.createOrReplaceTempView("agg")

temp = spark.sql(""" 

SELECT merchant_name, COUNT(merchant_name) AS no_of_transactions, SA2_code, Year, Month, SUM(((take_rate/100)*dollar_value)) AS BNPL_earnings,
    CONCAT(merchant_name, SA2_code, Year, Month) AS join_col
FROM agg
GROUP BY merchant_name, SA2_code, Year, Month
""")

temp.show()




+--------------------+------------------+---------+----+-----+------------------+--------------------+
|       merchant_name|no_of_transactions| SA2_code|Year|Month|     BNPL_earnings|            join_col|
+--------------------+------------------+---------+----+-----+------------------+--------------------+
|  Pede Nonummy Corp.|                 6|205021084|2021|    8| 4.799853297090182|Pede Nonummy Corp...|
|    Semper Tellus PC|                 6|407031164|2021|    8| 5.130949438054894|Semper Tellus PC4...|
| Dui Nec Corporation|                 2|801041036|2021|    8|0.3360427752684496|Dui Nec Corporati...|
|Ac Turpis Egestas PC|                 1|405041127|2021|    7|10.308467169526438|Ac Turpis Egestas...|
|Lobortis Tellus C...|                 3|205021082|2021|    7|1.5850668161083081|Lobortis Tellus C...|
|   Est Ac Mattis Ltd|                 1|209011204|2021|    7| 7.740099999999999|Est Ac Mattis Ltd...|
| Egestas Blandit Ltd|                 2|601051031|2021|    8|10.54287484

                                                                                

In [38]:
temp.createOrReplaceTempView("gender_join")
male.createOrReplaceTempView("m")
female.createOrReplaceTempView("f")

temp2 = spark.sql(""" 

SELECT *
FROM gender_join
INNER JOIN m
ON gender_join.join_col = m.m_name
""")

temp2.createOrReplaceTempView("temp2")

temp3 = spark.sql(""" 

SELECT *
FROM temp2
INNER JOIN f
ON temp2.join_col = f.f_name
""")

temp3.limit(5)

                                                                                

merchant_name,no_of_transactions,SA2_code,Year,Month,BNPL_earnings,join_col,m_name,males,f_name,females
A Aliquet Ltd,2,401021010,2021,4,26.99240338763568,A Aliquet Ltd4010...,A Aliquet Ltd4010...,1,A Aliquet Ltd4010...,1
A Aliquet Ltd,2,603011065,2021,12,13.952609070322437,A Aliquet Ltd6030...,A Aliquet Ltd6030...,1,A Aliquet Ltd6030...,1
A Arcu Industries,2,124011453,2021,8,6.295097143230683,A Arcu Industries...,A Arcu Industries...,1,A Arcu Industries...,1
A Arcu Industries,2,211051282,2022,3,20.261430692795518,A Arcu Industries...,A Arcu Industries...,1,A Arcu Industries...,1
A Arcu Industries,2,214021379,2021,7,13.948663024022798,A Arcu Industries...,A Arcu Industries...,1,A Arcu Industries...,1


In [39]:
a = a.withColumnRenamed('income_2018-2019',

    'income_2018_2019'    
)

a = a.withColumn('income_per_persons',
    (F.col('income_2018_2019')/F.col('total_persons'))
)


In [40]:
a.show(1)



+--------------------+---------+--------------+-----+------+-----------------+---------+---------+----------------+-----------+-------------+-------------+--------------------+----+-----+------------------+
|       merchant_name|take_rate|revenue_levels|state|gender|     dollar_value|postcodes| SA2_code|income_2018_2019|total_males|total_females|total_persons|            category|Year|Month|income_per_persons|
+--------------------+---------+--------------+-----+------+-----------------+---------+---------+----------------+-----------+-------------+-------------+--------------------+----+-----+------------------+
|Egestas Nunc Asso...|     6.58|             a|  NSW|  Male|11.28829564583802|     2299|111031231|       242936885|       6412|         6179|        12593|Books, Stationary...|2021|    8|19291.422615738902|
+--------------------+---------+--------------+-----+------+-----------------+---------+---------+----------------+-----------+-------------+-------------+-----------------

                                                                                

In [41]:
a.createOrReplaceTempView("features")

e = spark.sql(""" 

SELECT merchant_name AS drop_name, FIRST(take_rate) AS take_rate, FIRST(revenue_levels) AS revenue_levels, FIRST(category) AS category,
    FIRST(total_males) AS males_in_SA2, FIRST(total_females) AS females_in_SA2, FIRST(income_per_persons) AS income_per_person
FROM features
GROUP BY merchant_name
""")

e.show(2)



+---------------+---------+--------------+--------------------+------------+--------------+------------------+
|      drop_name|take_rate|revenue_levels|            category|males_in_SA2|females_in_SA2| income_per_person|
+---------------+---------+--------------+--------------------+------------+--------------+------------------+
|   A Associates|     4.95|             b|Books, Stationary...|        9762|         10846|22526.523772559674|
|A Felis Company|     4.32|             b|Books, Stationary...|        1080|          1051| 33927.61168708765|
+---------------+---------+--------------+--------------------+------------+--------------+------------------+
only showing top 2 rows



                                                                                

In [42]:
temp3.createOrReplaceTempView("edit")
e.createOrReplaceTempView("rates")

temp4 = spark.sql(""" 

SELECT *
FROM edit
INNER JOIN rates
ON edit.merchant_name = rates.drop_name
""")

train = temp4.drop('m_name', 'f_name', 'drop_name','join_col')

train.limit(5)

                                                                                

merchant_name,no_of_transactions,SA2_code,Year,Month,BNPL_earnings,males,females,take_rate,revenue_levels,category,males_in_SA2,females_in_SA2,income_per_person
A Aliquet Ltd,2,401021010,2021,4,26.99240338763568,1,1,3.87,b,Furniture,3292,3206,28693.71558221812
A Aliquet Ltd,2,603011065,2021,12,13.952609070322437,1,1,3.87,b,Furniture,3292,3206,28693.71558221812
A Arcu Industries,2,124011453,2021,8,6.295097143230683,1,1,3.0,c,Furniture,4821,4683,25816.03452631579
A Arcu Industries,2,211051282,2022,3,20.261430692795518,1,1,3.0,c,Furniture,4821,4683,25816.03452631579
A Arcu Industries,2,214021379,2021,7,13.948663024022798,1,1,3.0,c,Furniture,4821,4683,25816.03452631579


In [43]:
train.printSchema()

root
 |-- merchant_name: string (nullable = true)
 |-- no_of_transactions: long (nullable = false)
 |-- SA2_code: long (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- BNPL_earnings: double (nullable = true)
 |-- males: long (nullable = false)
 |-- females: long (nullable = false)
 |-- take_rate: double (nullable = true)
 |-- revenue_levels: string (nullable = true)
 |-- category: string (nullable = true)
 |-- males_in_SA2: long (nullable = true)
 |-- females_in_SA2: long (nullable = true)
 |-- income_per_person: double (nullable = true)



In [44]:
train_projection = train.select("merchant_name", "SA2_code", "Year", "Month", 'BNPL_earnings')
train_projection.limit(5)

                                                                                

merchant_name,SA2_code,Year,Month,BNPL_earnings
A Aliquet Ltd,401021010,2021,4,26.99240338763568
A Aliquet Ltd,603011065,2021,12,13.952609070322437
A Arcu Industries,124011453,2021,8,6.295097143230683
A Arcu Industries,211051282,2022,3,20.261430692795518
A Arcu Industries,214021379,2021,7,13.948663024022798


In [45]:
train_projection.count()

                                                                                

891622

In [46]:
train_projection = train_projection.withColumn("prev_year", \
              when(train_projection["Month"] == 1, train_projection['Year'] - 1).otherwise(train_projection['Year']))
train_projection = train_projection.withColumn("prev_month", \
              when(train_projection["Month"] == 1, 12).otherwise(train_projection['Month'] - 1))
train_projection = train_projection.drop("Year", "Month")
train_projection = train_projection.withColumnRenamed("BNPL_earnings", "future_earnings") \
                            .withColumnRenamed("merchant_name", "p_merchant_name") \
                            .withColumnRenamed("SA2_code", "p_SA2_code")
train_projection.limit(5)

                                                                                

p_merchant_name,p_SA2_code,future_earnings,prev_year,prev_month
A Aliquet Ltd,401021010,26.99240338763568,2021,3
A Aliquet Ltd,603011065,13.952609070322437,2021,11
A Arcu Industries,124011453,6.295097143230683,2021,7
A Arcu Industries,211051282,20.261430692795518,2022,2
A Arcu Industries,214021379,13.948663024022798,2021,6


In [47]:
train_projection.count()

                                                                                

891622

In [48]:
final_data = train.join(train_projection, (train.merchant_name == train_projection.p_merchant_name) & 
                           (train.SA2_code == train_projection.p_SA2_code) & 
                           (train.Year == train_projection.prev_year) & 
                           (train.Month == train_projection.prev_month), how = 'inner')

final_data = final_data.drop("p_merchant_name", "p_SA2_code","prev_year", "prev_month")
final_data.limit(5)

                                                                                

merchant_name,no_of_transactions,SA2_code,Year,Month,BNPL_earnings,males,females,take_rate,revenue_levels,category,males_in_SA2,females_in_SA2,income_per_person,future_earnings
A Auctor Non Corp...,3,202031033,2021,11,12.370771858227933,1,2,5.58,a,Furniture,2067,2014,22634.72370679088,10.904180800642823
A Auctor Non Corp...,3,205021082,2022,7,15.48985179311347,1,2,5.58,a,Furniture,2067,2014,22634.72370679088,14.23972688266196
A Auctor Non Corp...,2,205031087,2021,9,12.924007756706878,1,1,5.58,a,Furniture,2067,2014,22634.72370679088,15.711660225849878
A Auctor Non Corp...,3,210021235,2022,9,9.041642383010874,1,2,5.58,a,Furniture,2067,2014,22634.72370679088,10.781646624626871
A Auctor Non Corp...,4,211051282,2021,10,17.65711333410997,2,2,5.58,a,Furniture,2067,2014,22634.72370679088,24.92588449432683


In [49]:
final_data.count()

                                                                                

344019

In [50]:
final_data.printSchema()

root
 |-- merchant_name: string (nullable = true)
 |-- no_of_transactions: long (nullable = false)
 |-- SA2_code: long (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- BNPL_earnings: double (nullable = true)
 |-- males: long (nullable = false)
 |-- females: long (nullable = false)
 |-- take_rate: double (nullable = true)
 |-- revenue_levels: string (nullable = true)
 |-- category: string (nullable = true)
 |-- males_in_SA2: long (nullable = true)
 |-- females_in_SA2: long (nullable = true)
 |-- income_per_person: double (nullable = true)
 |-- future_earnings: double (nullable = true)



In [51]:
final_data = final_data.withColumn('Year',

    F.col('Year').cast('STRING')

)

final_data = final_data.withColumn('Month',

    F.col('Month').cast('STRING')

)

final_data = final_data.withColumn('SA2_code',

    F.col('SA2_code').cast('STRING')

)

field = ['no_of_transactions', 'males', 'females', 'males_in_SA2', 'females_in_SA2']

for col in field:
    final_data = final_data.withColumn(col,

    F.col(col).cast('INT')

)

In [52]:
final_data.count()

                                                                                

344019

In [53]:
final_data.printSchema()

root
 |-- merchant_name: string (nullable = true)
 |-- no_of_transactions: integer (nullable = false)
 |-- SA2_code: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- BNPL_earnings: double (nullable = true)
 |-- males: integer (nullable = false)
 |-- females: integer (nullable = false)
 |-- take_rate: double (nullable = true)
 |-- revenue_levels: string (nullable = true)
 |-- category: string (nullable = true)
 |-- males_in_SA2: integer (nullable = true)
 |-- females_in_SA2: integer (nullable = true)
 |-- income_per_person: double (nullable = true)
 |-- future_earnings: double (nullable = true)



In [54]:
# String indexing the categorical columns

indexer = StringIndexer(inputCols = ['merchant_name', 'SA2_code', 'Year', 'Month', 'revenue_levels','category'],
outputCols = ['merchant_name_num', 'SA2_code_num', 'Year_num', 'Month_num', 'revenue_levels_num','category_num'], handleInvalid="keep")

indexd_data = indexer.fit(final_data).transform(final_data)


# Applying onehot encoding to the categorical data that is string indexed above
encoder = OneHotEncoder(inputCols = ['merchant_name_num', 'SA2_code_num', 'Year_num', 'Month_num', 'revenue_levels_num','category_num'],
outputCols = ['merchant_name_vec', 'SA2_code_vec', 'Year_vec', 'Month_vec', 'revenue_levels_vec','category_vec'])

onehotdata = encoder.fit(indexd_data).transform(indexd_data)


# Assembling the training data as a vector of features 
assembler1 = VectorAssembler(
inputCols=['merchant_name_vec', 'SA2_code_vec', 'Year_vec', 'Month_vec', 'revenue_levels_vec','category_vec','males_in_SA2','females_in_SA2', 'income_per_person', 'no_of_transactions','take_rate', 'BNPL_earnings'],
outputCol= "features" )

outdata1 = assembler1.transform(onehotdata)

                                                                                

In [55]:
# Renaming the target column as label

outdata1 = outdata1.withColumnRenamed(
    "future_earnings",
    "label"
)

In [56]:
# Assembling the features as a feature vector 

featureIndexer =\
    VectorIndexer(inputCol="features", 
    outputCol="indexedFeatures").fit(outdata1)

outdata1 = featureIndexer.transform(outdata1)

                                                                                

In [57]:
# Split the data into training and validation sets (30% held out for testing)

trainingData, testData = outdata1.randomSplit([0.7, 0.3], seed = 20)

In [58]:
trainingData.count(), testData.count()



22/10/05 11:33:27 WARN DAGScheduler: Broadcasting large task binary with size 1334.8 KiB




22/10/05 11:34:25 WARN DAGScheduler: Broadcasting large task binary with size 1334.8 KiB


                                                                                

(240552, 103467)

In [59]:
# Train a RandomForest model.
rf = RandomForestRegressor(featuresCol="indexedFeatures")


# Train model.  
model = rf.fit(trainingData)

# Make predictions.
predictions_validation = model.transform(testData)



22/10/05 11:35:22 WARN DAGScheduler: Broadcasting large task binary with size 1336.9 KiB


                                                                                

22/10/05 11:35:23 WARN DAGScheduler: Broadcasting large task binary with size 1337.0 KiB


                                                                                

22/10/05 11:35:25 WARN DAGScheduler: Broadcasting large task binary with size 1340.9 KiB


                                                                                

22/10/05 11:35:26 WARN DAGScheduler: Broadcasting large task binary with size 1453.6 KiB


                                                                                

22/10/05 11:35:32 WARN DAGScheduler: Broadcasting large task binary with size 1515.2 KiB


                                                                                

22/10/05 11:35:35 WARN DAGScheduler: Broadcasting large task binary with size 1635.9 KiB


                                                                                

22/10/05 11:35:37 WARN DAGScheduler: Broadcasting large task binary with size 1877.8 KiB


                                                                                

22/10/05 11:35:40 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB


                                                                                

In [60]:
# Evaluate the validation set 

predictions_validation.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error

evaluator_train_rmse = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse_train = evaluator_train_rmse.evaluate(predictions_validation)
print("Root Mean Squared Error (RMSE) on train data = %g" % rmse_train)

evaluator_train_mae = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="mae")
mae_train = evaluator_train_mae.evaluate(predictions_validation)
print("Mean Absolutee Error (MAE) on train data = %g" % mae_train)



22/10/05 11:36:40 WARN DAGScheduler: Broadcasting large task binary with size 1349.1 KiB


                                                                                

+------------------+------------------+--------------------+
|        prediction|             label|            features|
+------------------+------------------+--------------------+
| 18.37728284398241|10.371985605001125|(2085,[209,1093,2...|
|18.676659405208316|10.904180800642823|(2085,[209,988,20...|
|18.676659405208316| 16.17062601935629|(2085,[209,976,20...|
| 21.85738278888545| 24.92588449432683|(2085,[209,989,20...|
|18.467828631507555| 9.031191799689866|(2085,[209,1039,2...|
+------------------+------------------+--------------------+
only showing top 5 rows





22/10/05 11:37:37 WARN DAGScheduler: Broadcasting large task binary with size 1342.2 KiB


[Stage 2393:>                                                       (0 + 8) / 8]

22/10/05 11:37:39 WARN DAGScheduler: Broadcasting large task binary with size 1343.3 KiB
Root Mean Squared Error (RMSE) on train data = 17.0479




22/10/05 11:38:34 WARN DAGScheduler: Broadcasting large task binary with size 1342.1 KiB


[Stage 2500:>                                                       (0 + 8) / 8]

22/10/05 11:38:36 WARN DAGScheduler: Broadcasting large task binary with size 1343.2 KiB
Mean Absolutee Error (MAE) on train data = 10.1542


                                                                                

In [61]:
latest_year = train.select(max('Year')).collect()[0][0]
agg_month_1 = train.filter(train.Year == latest_year)
latest_month = agg_month_1.select(max('Month')).collect()[0][0]
predicting_data = agg_month_1.filter(train.Month == latest_month)
predicting_data = predicting_data.withColumn("future_earnings", lit(0))
predicting_data.limit(5)

                                                                                

merchant_name,no_of_transactions,SA2_code,Year,Month,BNPL_earnings,males,females,take_rate,revenue_levels,category,males_in_SA2,females_in_SA2,income_per_person,future_earnings
A Auctor Non Corp...,3,125031480,2022,10,12.147884844463512,1,2,5.58,a,Furniture,2067,2014,22634.72370679088,0
A Auctor Non Corp...,3,126011496,2022,10,12.1459052958297,1,2,5.58,a,Furniture,2067,2014,22634.72370679088,0
A Auctor Non Corp...,3,211051282,2022,10,11.865180056741202,2,1,5.58,a,Furniture,2067,2014,22634.72370679088,0
A Auctor Non Corp...,2,509021239,2022,10,6.2879797684448295,1,1,5.58,a,Furniture,2067,2014,22634.72370679088,0
A Auctor Non Corp...,6,509021240,2022,10,21.4050390183122,3,2,5.58,a,Furniture,2067,2014,22634.72370679088,0


In [62]:
# String indexing the categorical columns

indexer = StringIndexer(inputCols = ['merchant_name', 'SA2_code', 'Year', 'Month', 'revenue_levels','category'],
outputCols = ['merchant_name_num', 'SA2_code_num', 'Year_num', 'Month_num', 'revenue_levels_num','category_num'], handleInvalid="keep")

indexd_data = indexer.fit(predicting_data).transform(predicting_data)


# Applying onehot encoding to the categorical data that is string indexed above
encoder = OneHotEncoder(inputCols = ['merchant_name_num', 'SA2_code_num', 'Year_num', 'Month_num', 'revenue_levels_num','category_num'],
outputCols = ['merchant_name_vec', 'SA2_code_vec', 'Year_vec', 'Month_vec', 'revenue_levels_vec','category_vec'])

onehotdata = encoder.fit(indexd_data).transform(indexd_data)


# Assembling the training data as a vector of features 
assembler1 = VectorAssembler(
inputCols=['merchant_name_vec', 'SA2_code_vec', 'Year_vec', 'Month_vec', 'revenue_levels_vec','category_vec','males_in_SA2','females_in_SA2', 'income_per_person', 'no_of_transactions','take_rate', 'BNPL_earnings'],
outputCol= "features" )

outdata1 = assembler1.transform(onehotdata)

# Renaming the target column as label

outdata1 = outdata1.withColumnRenamed(
    "future_earnings",
    "label"
)


# Assembling the features as a feature vector 

featureIndexer =\
    VectorIndexer(inputCol="features", 
    outputCol="indexedFeatures").fit(outdata1)

outdata1 = featureIndexer.transform(outdata1)

                                                                                

In [63]:
predictions_test = model.transform(outdata1)

In [64]:
predictions_test.show()



22/10/05 11:40:56 WARN DAGScheduler: Broadcasting large task binary with size 1190.6 KiB
+--------------------+------------------+---------+----+-----+-------------------+-----+-------+---------+--------------+--------------------+------------+--------------+------------------+-----+-----------------+------------+--------+---------+------------------+------------+------------------+------------------+-------------+-------------+------------------+-------------+--------------------+--------------------+------------------+
|       merchant_name|no_of_transactions| SA2_code|Year|Month|      BNPL_earnings|males|females|take_rate|revenue_levels|            category|males_in_SA2|females_in_SA2| income_per_person|label|merchant_name_num|SA2_code_num|Year_num|Month_num|revenue_levels_num|category_num| merchant_name_vec|      SA2_code_vec|     Year_vec|    Month_vec|revenue_levels_vec| category_vec|            features|     indexedFeatures|        prediction|
+--------------------+-------------

                                                                                

In [65]:
predictions_test.createOrReplaceTempView("preds")

pred = spark.sql(""" 

SELECT merchant_name, SUM(prediction) AS total_earnings_of_BNPL
FROM preds
GROUP BY merchant_name

""")

pred.limit(5)

                                                                                

merchant_name,total_earnings_of_BNPL
Dictum Mi Incorpo...,12.742588325562595
Dictum Mi Limited,344.0498847901901
Donec Luctus Indu...,89.19811827893817
Elit Sed Consequa...,293.0795314879397
Hendrerit Consect...,57.34164746503168


In [66]:
pred.count()

                                                                                

1381

In [67]:
pred_df = pred.toPandas()

                                                                                

In [68]:
pred_df.to_csv("../data/curated/BNPL_earnings.csv")