In [1]:
import pandas as pd
from pyspark.sql import SparkSession, functions as F
import lbl2vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
from pyspark.sql.functions import date_format
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt


In [2]:
# Create a spark session
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "6g")
    .config("spark.executor.memory", "10g")
    .getOrCreate()
)

22/10/07 16:49:40 WARN Utils: Your hostname, MacBook-Air-3.local resolves to a loopback address: 127.0.0.1; using 192.168.0.66 instead (on interface en0)
22/10/07 16:49:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/07 16:49:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/07 16:49:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Read in data from ETL.py file
%run '../scripts/outlier.py' '../scripts/paths.json'



22/10/07 16:49:42 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


                                                                                

22/10/07 16:50:37 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


                                                                                

22/10/07 16:51:16 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

In [4]:

tagged_merchants_sdf = spark.read.parquet("../data/curated/tagged_merchants.parquet")

In [5]:
tagged_merchants_sdf = tagged_merchants_sdf.withColumnRenamed('merchant_abn',

    'tagged_merchant_abn'
)

In [6]:
tagged_merchants_sdf.show(5)

+-------------------+--------------------+
|tagged_merchant_abn|            category|
+-------------------+--------------------+
|        10023283211|           Furniture|
|        10142254217|         Electronics|
|        10165489824|        Toys and DIY|
|        10187291046|        Toys and DIY|
|        10192359162|Books, Stationary...|
+-------------------+--------------------+
only showing top 5 rows



In [7]:
internal4.createOrReplaceTempView("join")
tagged_merchants_sdf.createOrReplaceTempView("tagged")

joint = spark.sql(""" 

SELECT *
FROM join
INNER JOIN tagged
ON join.merchant_abn = tagged.tagged_merchant_abn
""")

joint = joint.drop('tagged_merchant_abn')

In [8]:
joint.count()

                                                                                

10109371

In [9]:
joint.createOrReplaceTempView("group")

a = spark.sql(""" 

SELECT *, (take_rate/100)*dollar_value AS percent
FROM group
""")

In [10]:
# Extracting the year, month, day from the timestamp
from pyspark.sql.functions import year, month

a = a.withColumn('Year', year(a.order_datetime))
a = a.withColumn('Month',month(a.order_datetime))


In [11]:
a.show(5)



+--------------------+------------+--------------------+---------+--------------+--------------------+--------------------+-----+------+------------------+------------------+--------------------+--------------+-------+-----------+-------+---------+----------+----------+---------+---------+-------------------+----------------+-----------+-------------+-------------+----------+---------------+---------------+---------------+--------------------------+--------------------------+--------------------+-------------------+----+-----+
|       merchant_name|merchant_abn|          categories|take_rate|revenue_levels|                name|             address|state|gender|trans_merchant_abn|      dollar_value|            order_id|order_datetime|user_id|consumer_id| suburb|postcodes|      long|       lat|  int_sa2| SA2_code|           SA2_name|income_2018-2019|total_males|total_females|total_persons|state_code|     state_name|population_2020|population_2021|fraud_probability_consumer|fraud_probabil

                                                                                

In [12]:
a = a.drop('merchant_abn', 'categories','name', 'address', 'trans_merchant_abn', 'order_id','order_datetime', 'consumer_id','int_sa2',
'SA2_name','state_code','state_name','population_2020', 'population_2021')

In [13]:
 
# Find Count of Null, None, NaN of All DataFrame Columns
from pyspark.sql.functions import col,isnan, when, count
a.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in a.columns]
   ).show()



+-------------+---------+--------------+-----+------+------------+-------+------+---------+----+---+--------+----------------+-----------+-------------+-------------+--------------------------+--------------------------+--------+-------+----+-----+
|merchant_name|take_rate|revenue_levels|state|gender|dollar_value|user_id|suburb|postcodes|long|lat|SA2_code|income_2018-2019|total_males|total_females|total_persons|fraud_probability_consumer|fraud_probability_merchant|category|percent|Year|Month|
+-------------+---------+--------------+-----+------+------------+-------+------+---------+----+---+--------+----------------+-----------+-------------+-------------+--------------------------+--------------------------+--------+-------+----+-----+
|            0|        0|             0|    0|     0|           0|      0|     0|        0|   0|  0|       0|               0|          0|            0|            0|                         0|                         0|       0|      0|   0|    0|
+---

                                                                                

In [14]:
a.printSchema()

root
 |-- merchant_name: string (nullable = true)
 |-- take_rate: double (nullable = true)
 |-- revenue_levels: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- user_id: long (nullable = true)
 |-- suburb: string (nullable = true)
 |-- postcodes: string (nullable = true)
 |-- long: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- SA2_code: long (nullable = true)
 |-- income_2018-2019: long (nullable = true)
 |-- total_males: long (nullable = true)
 |-- total_females: long (nullable = true)
 |-- total_persons: long (nullable = true)
 |-- fraud_probability_consumer: double (nullable = false)
 |-- fraud_probability_merchant: double (nullable = false)
 |-- category: string (nullable = true)
 |-- percent: double (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)



In [15]:
a.createOrReplaceTempView("agg")

male = spark.sql(""" 

SELECT CONCAT(merchant_name, SA2_code, Year, Month) AS m_name, COUNT(gender) as males
FROM agg
WHERE gender = 'Male'
GROUP BY merchant_name, SA2_code, Year, Month
""")

male.show(5)

female = spark.sql(""" 

SELECT CONCAT(merchant_name, SA2_code, Year, Month) AS f_name, COUNT(gender) as females
FROM agg
WHERE gender = 'Female'
GROUP BY merchant_name, SA2_code, Year, Month
""")
female.show(5)

                                                                                

+--------------------+-----+
|              m_name|males|
+--------------------+-----+
|Semper Tellus PC1...|    2|
|Est Nunc Consulti...|   11|
|Ipsum Primis In I...|    2|
|Euismod In LLC601...|    1|
|Leo In Consulting...|    2|
+--------------------+-----+
only showing top 5 rows





+--------------------+-------+
|              f_name|females|
+--------------------+-------+
|Quis Tristique Lt...|      1|
|Nunc In Industrie...|      2|
|Nunc Sit LLC10902...|      2|
|Leo In Consulting...|      5|
|Risus Donec Assoc...|      1|
+--------------------+-------+
only showing top 5 rows



                                                                                

In [16]:
a.show(2)



+--------------------+---------+--------------+-----+------+-----------------+-------+-------+---------+----------+----------+---------+----------------+-----------+-------------+-------------+--------------------------+--------------------------+--------------------+------------------+----+-----+
|       merchant_name|take_rate|revenue_levels|state|gender|     dollar_value|user_id| suburb|postcodes|      long|       lat| SA2_code|income_2018-2019|total_males|total_females|total_persons|fraud_probability_consumer|fraud_probability_merchant|            category|           percent|Year|Month|
+--------------------+---------+--------------+-----+------+-----------------+-------+-------+---------+----------+----------+---------+----------------+-----------+-------------+-------------+--------------------------+--------------------------+--------------------+------------------+----+-----+
|Egestas Nunc Asso...|     6.58|             a|  NSW|  Male|11.28829564583802|   3698|JESMOND|     2299

                                                                                

In [17]:
a.createOrReplaceTempView("agg")

temp = spark.sql(""" 

SELECT merchant_name, COUNT(DISTINCT user_id) AS no_of_customers, SA2_code, Year, Month, SUM(dollar_value - percent) AS total_earnings,
    CONCAT(merchant_name, SA2_code, Year, Month) AS join_col
FROM agg 
GROUP BY merchant_name, SA2_code, Year, Month
""")

temp.show()


[Stage 296:>                                                        (0 + 1) / 1]

+--------------------+---------------+---------+----+-----+------------------+--------------------+
|       merchant_name|no_of_customers| SA2_code|Year|Month|    total_earnings|            join_col|
+--------------------+---------------+---------+----+-----+------------------+--------------------+
|Metus Sit Amet In...|              1|407021150|2021|    8| 66.38711669098895|Metus Sit Amet In...|
|     Ut Nisi Limited|              1|307031184|2021|    8| 89.47004686919264|Ut Nisi Limited30...|
|Dolor Dolor Indus...|              1|209041224|2021|    7| 32.32942469964551|Dolor Dolor Indus...|
|Ut Molestie Found...|              1|206041124|2021|    8| 327.0497650599944|Ut Molestie Found...|
|     Vivamus Sit LLC|              1|307031184|2021|    8| 244.8254892010557|Vivamus Sit LLC30...|
|Euismod Et Institute|              7|210021235|2021|    8| 194.5570510377872|Euismod Et Instit...|
|   Leo In Consulting|             20|210021235|2021|    8|1172.0943954300897|Leo In Consulting...|


                                                                                

In [18]:
temp.createOrReplaceTempView("gender_join")
male.createOrReplaceTempView("m")
female.createOrReplaceTempView("f")

temp2 = spark.sql(""" 

SELECT *
FROM gender_join
INNER JOIN m
ON gender_join.join_col = m.m_name
""")

temp2.createOrReplaceTempView("temp2")

temp3 = spark.sql(""" 

SELECT *
FROM temp2
INNER JOIN f
ON temp2.join_col = f.f_name
""")

temp3.limit(5)

                                                                                

merchant_name,no_of_customers,SA2_code,Year,Month,total_earnings,join_col,m_name,males,f_name,females
A Aliquet Ltd,2,401021010,2021,4,670.4857203238805,A Aliquet Ltd4010...,A Aliquet Ltd4010...,1,A Aliquet Ltd4010...,1
A Aliquet Ltd,2,603011065,2021,12,346.5799250465364,A Aliquet Ltd6030...,A Aliquet Ltd6030...,1,A Aliquet Ltd6030...,1
A Arcu Industries,2,124011453,2021,8,203.5414742977921,A Arcu Industries...,A Arcu Industries...,1,A Arcu Industries...,1
A Arcu Industries,2,211051282,2022,3,655.1195924003883,A Arcu Industries...,A Arcu Industries...,1,A Arcu Industries...,1
A Arcu Industries,2,214021379,2021,7,451.0067711100705,A Arcu Industries...,A Arcu Industries...,1,A Arcu Industries...,1


In [19]:
a = a.withColumnRenamed('income_2018-2019',

    'income_2018_2019'    
)

a = a.withColumn('income_per_persons',
    (F.col('income_2018_2019')/F.col('total_persons'))
)


In [20]:
a.show(1)



+--------------------+---------+--------------+-----+------+-----------------+-------+-------+---------+----------+----------+---------+----------------+-----------+-------------+-------------+--------------------------+--------------------------+--------------------+------------------+----+-----+------------------+
|       merchant_name|take_rate|revenue_levels|state|gender|     dollar_value|user_id| suburb|postcodes|      long|       lat| SA2_code|income_2018_2019|total_males|total_females|total_persons|fraud_probability_consumer|fraud_probability_merchant|            category|           percent|Year|Month|income_per_persons|
+--------------------+---------+--------------+-----+------+-----------------+-------+-------+---------+----------+----------+---------+----------------+-----------+-------------+-------------+--------------------------+--------------------------+--------------------+------------------+----+-----+------------------+
|Egestas Nunc Asso...|     6.58|             a

                                                                                

In [21]:
a.createOrReplaceTempView("features")

e = spark.sql(""" 

SELECT merchant_name AS drop_name, FIRST(take_rate) AS take_rate, FIRST(revenue_levels) AS revenue_levels, FIRST(category) AS category,
    FIRST(total_males) AS males_in_SA2, FIRST(total_females) AS females_in_SA2, FIRST(income_per_persons) AS income_per_person
FROM features
GROUP BY merchant_name
""")

e.show(2)



+---------------+---------+--------------+--------------------+------------+--------------+------------------+
|      drop_name|take_rate|revenue_levels|            category|males_in_SA2|females_in_SA2| income_per_person|
+---------------+---------+--------------+--------------------+------------+--------------+------------------+
|   A Associates|     4.95|             b|Books, Stationary...|        9762|         10846|22526.523772559674|
|A Felis Company|     4.32|             b|Books, Stationary...|        1080|          1051| 33927.61168708765|
+---------------+---------+--------------+--------------------+------------+--------------+------------------+
only showing top 2 rows



                                                                                

In [22]:
temp3.createOrReplaceTempView("edit")
e.createOrReplaceTempView("rates")

temp4 = spark.sql(""" 

SELECT *
FROM edit
INNER JOIN rates
ON edit.merchant_name = rates.drop_name
""")

train = temp4.drop('m_name', 'f_name', 'drop_name','join_col')

train.limit(5)

                                                                                

merchant_name,no_of_customers,SA2_code,Year,Month,total_earnings,males,females,take_rate,revenue_levels,category,males_in_SA2,females_in_SA2,income_per_person
A Aliquet Ltd,2,312021351,2021,6,298.8537411028336,1,1,3.87,b,Furniture,3292,3206,28693.71558221812
A Aliquet Ltd,2,401021010,2021,4,670.4857203238805,1,1,3.87,b,Furniture,3292,3206,28693.71558221812
A Aliquet Ltd,2,603011065,2021,12,346.5799250465364,1,1,3.87,b,Furniture,3292,3206,28693.71558221812
A Arcu Industries,2,124011453,2021,8,203.5414742977921,1,1,3.0,c,Furniture,4821,4683,25816.03452631579
A Arcu Industries,2,211051282,2022,3,655.1195924003883,1,1,3.0,c,Furniture,4821,4683,25816.03452631579


In [23]:
train.printSchema()

root
 |-- merchant_name: string (nullable = true)
 |-- no_of_customers: long (nullable = false)
 |-- SA2_code: long (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- total_earnings: double (nullable = true)
 |-- males: long (nullable = false)
 |-- females: long (nullable = false)
 |-- take_rate: double (nullable = true)
 |-- revenue_levels: string (nullable = true)
 |-- category: string (nullable = true)
 |-- males_in_SA2: long (nullable = true)
 |-- females_in_SA2: long (nullable = true)
 |-- income_per_person: double (nullable = true)



In [24]:
train_projection = train.select("merchant_name", "SA2_code", "Year", "Month", 'no_of_customers')
train_projection.limit(5)

                                                                                

merchant_name,SA2_code,Year,Month,no_of_customers
A Aliquet Ltd,401021010,2021,4,2
A Aliquet Ltd,603011065,2021,12,2
A Arcu Industries,124011453,2021,8,2
A Arcu Industries,211051282,2022,3,2
A Arcu Industries,214021379,2021,7,2


In [25]:
train_projection.count()

                                                                                

891622

In [26]:
train_projection = train_projection.withColumn("prev_year", \
              when(train_projection["Month"] == 1, train_projection['Year'] - 1).otherwise(train_projection['Year']))
train_projection = train_projection.withColumn("prev_month", \
              when(train_projection["Month"] == 1, 12).otherwise(train_projection['Month'] - 1))
train_projection = train_projection.drop("Year", "Month")
train_projection = train_projection.withColumnRenamed("no_of_customers", "future_customers") \
                            .withColumnRenamed("merchant_name", "p_merchant_name") \
                            .withColumnRenamed("SA2_code", "p_SA2_code")
train_projection.limit(5)

                                                                                

p_merchant_name,p_SA2_code,future_customers,prev_year,prev_month
A Aliquet Ltd,401021010,2,2021,3
A Aliquet Ltd,603011065,2,2021,11
A Arcu Industries,124011453,2,2021,7
A Arcu Industries,211051282,2,2022,2
A Arcu Industries,214021379,2,2021,6


In [27]:
train_projection.count()

                                                                                

891622

In [28]:
final_data = train.join(train_projection, (train.merchant_name == train_projection.p_merchant_name) & 
                           (train.SA2_code == train_projection.p_SA2_code) & 
                           (train.Year == train_projection.prev_year) & 
                           (train.Month == train_projection.prev_month), how = 'inner')

final_data = final_data.drop("p_merchant_name", "p_SA2_code","prev_year", "prev_month")
final_data.limit(5)

                                                                                

merchant_name,no_of_customers,SA2_code,Year,Month,total_earnings,males,females,take_rate,revenue_levels,category,males_in_SA2,females_in_SA2,income_per_person,future_customers
A Auctor Non Corp...,3,202031033,2021,11,209.32764854012208,1,2,5.58,a,Furniture,2067,2014,22634.72370679088,6
A Auctor Non Corp...,3,205021082,2022,7,262.10605847773724,1,2,5.58,a,Furniture,2067,2014,22634.72370679088,3
A Auctor Non Corp...,2,205031087,2021,9,218.6890344781834,1,1,5.58,a,Furniture,2067,2014,22634.72370679088,3
A Auctor Non Corp...,3,210021235,2022,9,152.99495946306212,1,2,5.58,a,Furniture,2067,2014,22634.72370679088,3
A Auctor Non Corp...,4,211051282,2021,10,298.778609499402,2,2,5.58,a,Furniture,2067,2014,22634.72370679088,5


In [29]:
final_data = final_data.withColumn('Year',

    F.col('Year').cast('STRING')

)

final_data = final_data.withColumn('Month',

    F.col('Month').cast('STRING')

)

final_data = final_data.withColumn('SA2_code',

    F.col('SA2_code').cast('STRING')

)

field = ['future_customers','no_of_customers' ,'males', 'females', 'males_in_SA2', 'females_in_SA2']

for col in field:
    final_data = final_data.withColumn(col,

    F.col(col).cast('INT')

)

In [30]:
# String indexing the categorical columns

indexer = StringIndexer(inputCols = ['merchant_name', 'SA2_code', 'Year', 'Month', 'revenue_levels','category'],
outputCols = ['merchant_name_num', 'SA2_code_num', 'Year_num', 'Month_num', 'revenue_levels_num','category_num'], handleInvalid="keep")

indexd_data = indexer.fit(final_data).transform(final_data)


# Applying onehot encoding to the categorical data that is string indexed above
encoder = OneHotEncoder(inputCols = ['merchant_name_num', 'SA2_code_num', 'Year_num', 'Month_num', 'revenue_levels_num','category_num'],
outputCols = ['merchant_name_vec', 'SA2_code_vec', 'Year_vec', 'Month_vec', 'revenue_levels_vec','category_vec'])

onehotdata = encoder.fit(indexd_data).transform(indexd_data)


# Assembling the training data as a vector of features 
assembler1 = VectorAssembler(
inputCols=['merchant_name_vec', 'SA2_code_vec', 'Year_vec', 'Month_vec', 'revenue_levels_vec','category_vec','males_in_SA2','females_in_SA2','no_of_customers' ,'income_per_person','take_rate', 'total_earnings'],
outputCol= "features" )

outdata1 = assembler1.transform(onehotdata)

                                                                                

In [31]:
# Renaming the target column as label

outdata1 = outdata1.withColumnRenamed(
    "future_customers",
    "label"
)

In [32]:
# Assembling the features as a feature vector 

featureIndexer =\
    VectorIndexer(inputCol="features", 
    outputCol="indexedFeatures").fit(outdata1)

outdata1 = featureIndexer.transform(outdata1)

                                                                                

In [33]:
# Split the data into training and validation sets (30% held out for testing)

trainingData, testData = outdata1.randomSplit([0.7, 0.3], seed = 20)

In [34]:
trainingData.count(), testData.count()



22/10/07 17:05:10 WARN DAGScheduler: Broadcasting large task binary with size 1447.5 KiB




22/10/07 17:06:35 WARN DAGScheduler: Broadcasting large task binary with size 1447.5 KiB


                                                                                

(240534, 103485)

In [35]:
# Train a RandomForest model.
rf = RandomForestRegressor(featuresCol="indexedFeatures")


# Train model.  
model = rf.fit(trainingData)

# Make predictions.
predictions_validation = model.transform(testData)



22/10/07 17:07:57 WARN DAGScheduler: Broadcasting large task binary with size 1447.4 KiB


                                                                                

22/10/07 17:07:58 WARN DAGScheduler: Broadcasting large task binary with size 1447.5 KiB


                                                                                

22/10/07 17:08:00 WARN DAGScheduler: Broadcasting large task binary with size 1451.5 KiB


                                                                                

22/10/07 17:08:02 WARN DAGScheduler: Broadcasting large task binary with size 1564.2 KiB


                                                                                

22/10/07 17:08:09 WARN DAGScheduler: Broadcasting large task binary with size 1625.7 KiB


                                                                                

22/10/07 17:08:12 WARN DAGScheduler: Broadcasting large task binary with size 1746.6 KiB


                                                                                

22/10/07 17:08:16 WARN DAGScheduler: Broadcasting large task binary with size 1985.6 KiB


                                                                                

22/10/07 17:08:21 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB


                                                                                

In [36]:
# Evaluate the validation set 

predictions_validation.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error

evaluator_train_rmse = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse_train = evaluator_train_rmse.evaluate(predictions_validation)
print("Root Mean Squared Error (RMSE) on train data = %g" % rmse_train)

evaluator_train_mae = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="mae")
mae_train = evaluator_train_mae.evaluate(predictions_validation)
print("Mean Absolutee Error (MAE) on train data = %g" % mae_train)



22/10/07 17:09:46 WARN DAGScheduler: Broadcasting large task binary with size 1461.7 KiB


                                                                                

+------------------+-----+--------------------+
|        prediction|label|            features|
+------------------+-----+--------------------+
| 3.995871420804393|    4|(2085,[209,1093,2...|
| 4.105122129941754|    6|(2085,[209,988,20...|
| 4.105122129941754|    4|(2085,[209,976,20...|
| 4.416280714838842|    5|(2085,[209,989,20...|
|4.5805087950826735|    3|(2085,[209,1039,2...|
+------------------+-----+--------------------+
only showing top 5 rows





22/10/07 17:11:10 WARN DAGScheduler: Broadcasting large task binary with size 1452.7 KiB


                                                                                

22/10/07 17:11:12 WARN DAGScheduler: Broadcasting large task binary with size 1453.8 KiB
Root Mean Squared Error (RMSE) on train data = 2.95724




22/10/07 17:12:31 WARN DAGScheduler: Broadcasting large task binary with size 1452.7 KiB


[Stage 2346:>                                                       (0 + 8) / 9]

22/10/07 17:12:33 WARN DAGScheduler: Broadcasting large task binary with size 1453.8 KiB
Mean Absolutee Error (MAE) on train data = 2.0985


                                                                                

In [37]:
def ExtractFeatureImportance(featureImp, dataset, featuresCol):
    list_extract = []
    for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return(varlist.sort_values('score', ascending = False))
  
  
#ExtractFeatureImportance(model.stages[-1].featureImportances, dataset, "features")
dataset_fi = ExtractFeatureImportance(model.featureImportances, predictions_validation, "features")
dataset_fi = spark.createDataFrame(dataset_fi)
display(dataset_fi)

idx,name,score
2081,no_of_customers,0.7728913228246239
2084,total_earnings,0.12018637343537428
2082,income_per_person,0.01819117331506711
0,merchant_name_vec...,0.010906124748100212
2,merchant_name_vec...,0.009304060195289543
2079,males_in_SA2,0.00766412157028098
2066,Month_vec_12,0.006715216883223568
2083,take_rate,0.006413892437917029
1,merchant_name_vec...,0.005814341361763084
2080,females_in_SA2,0.00550944865680363


## Future predictions

In [38]:
latest_year = train.select(max('Year')).collect()[0][0]
agg_month_1 = train.filter(train.Year == latest_year)
latest_month = agg_month_1.select(max('Month')).collect()[0][0]
predicting_data = agg_month_1.filter(train.Month == latest_month)
predicting_data = predicting_data.withColumn("future_customers", lit(0))
predicting_data.limit(5)

AnalysisException: Column 'r' does not exist. Did you mean one of the following? [edit.Year, edit.Month, edit.males, edit.females, edit.SA2_code, rates.category, rates.take_rate, rates.males_in_SA2, edit.merchant_name, edit.total_earnings, rates.females_in_SA2, edit.no_of_customers, rates.revenue_levels, rates.income_per_person];
'Project ['r]
+- Project [merchant_name#193, no_of_customers#2750L, SA2_code#608L, Year#1749, Month#1785, total_earnings#2751, males#2487L, females#2539L, take_rate#3842, revenue_levels#3844, category#3846, males_in_SA2#3848L, females_in_SA2#3850L, income_per_person#3852]
   +- Project [merchant_name#193, no_of_customers#2750L, SA2_code#608L, Year#1749, Month#1785, total_earnings#2751, join_col#2752, m_name#2486, males#2487L, f_name#2538, females#2539L, drop_name#3840, take_rate#3842, revenue_levels#3844, category#3846, males_in_SA2#3848L, females_in_SA2#3850L, income_per_person#3852]
      +- Join Inner, (merchant_name#193 = drop_name#3840)
         :- SubqueryAlias edit
         :  +- View (`edit`, [merchant_name#193,no_of_customers#2750L,SA2_code#608L,Year#1749,Month#1785,total_earnings#2751,join_col#2752,m_name#2486,males#2487L,f_name#2538,females#2539L])
         :     +- Project [merchant_name#193, no_of_customers#2750L, SA2_code#608L, Year#1749, Month#1785, total_earnings#2751, join_col#2752, m_name#2486, males#2487L, f_name#2538, females#2539L]
         :        +- Join Inner, (join_col#2752 = f_name#2538)
         :           :- SubqueryAlias temp2
         :           :  +- View (`temp2`, [merchant_name#193,no_of_customers#2750L,SA2_code#608L,Year#1749,Month#1785,total_earnings#2751,join_col#2752,m_name#2486,males#2487L])
         :           :     +- Project [merchant_name#193, no_of_customers#2750L, SA2_code#608L, Year#1749, Month#1785, total_earnings#2751, join_col#2752, m_name#2486, males#2487L]
         :           :        +- Join Inner, (join_col#2752 = m_name#2486)
         :           :           :- SubqueryAlias gender_join
         :           :           :  +- View (`gender_join`, [merchant_name#193,no_of_customers#2750L,SA2_code#608L,Year#1749,Month#1785,total_earnings#2751,join_col#2752])
         :           :           :     +- Aggregate [merchant_name#193, SA2_code#608L, Year#1749, Month#1785], [merchant_name#193, count(distinct user_id#35L) AS no_of_customers#2750L, SA2_code#608L, Year#1749, Month#1785, sum((dollar_value#41 - percent#1714)) AS total_earnings#2751, concat(merchant_name#193, cast(SA2_code#608L as string), cast(Year#1749 as string), cast(Month#1785 as string)) AS join_col#2752]
         :           :           :        +- SubqueryAlias agg
         :           :           :           +- View (`agg`, [merchant_name#193,take_rate#175,revenue_levels#181,state#19,gender#21,dollar_value#41,user_id#35L,suburb#315,postcodes#316,long#247,lat#248,SA2_code#608L,income_2018-2019#610L,total_males#611L,total_females#612L,total_persons#613L,fraud_probability_consumer#1080,fraud_probability_merchant#1081,category#1561,percent#1714,Year#1749,Month#1785])
         :           :           :              +- Project [merchant_name#193, take_rate#175, revenue_levels#181, state#19, gender#21, dollar_value#41, user_id#35L, suburb#315, postcodes#316, long#247, lat#248, SA2_code#608L, income_2018-2019#610L, total_males#611L, total_females#612L, total_persons#613L, fraud_probability_consumer#1080, fraud_probability_merchant#1081, category#1561, percent#1714, Year#1749, Month#1785]
         :           :           :                 +- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, gender#21, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L, suburb#315, postcodes#316, long#247, lat#248, int_sa2#566, SA2_code#608L, SA2_name#609, income_2018-2019#610L, total_males#611L, ... 12 more fields]
         :           :           :                    +- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, gender#21, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L, suburb#315, postcodes#316, long#247, lat#248, int_sa2#566, SA2_code#608L, SA2_name#609, income_2018-2019#610L, total_males#611L, ... 11 more fields]
         :           :           :                       +- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, gender#21, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L, suburb#315, postcodes#316, long#247, lat#248, int_sa2#566, SA2_code#608L, SA2_name#609, income_2018-2019#610L, total_males#611L, ... 10 more fields]
         :           :           :                          +- SubqueryAlias group
         :           :           :                             +- View (`group`, [merchant_name#193,merchant_abn#31L,categories#170,take_rate#175,revenue_levels#181,name#17,address#18,state#19,gender#21,trans_merchant_abn#214L,dollar_value#41,order_id#42,order_datetime#43,user_id#35L,consumer_id#36L,suburb#315,postcodes#316,long#247,lat#248,int_sa2#566,SA2_code#608L,SA2_name#609,income_2018-2019#610L,total_males#611L,total_females#612L,total_persons#613L,state_code#614L,state_name#615,population_2020#616L,population_2021#617L,fraud_probability_consumer#1080,fraud_probability_merchant#1081,category#1561])
         :           :           :                                +- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, gender#21, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L, suburb#315, postcodes#316, long#247, lat#248, int_sa2#566, SA2_code#608L, SA2_name#609, income_2018-2019#610L, total_males#611L, ... 9 more fields]
         :           :           :                                   +- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, gender#21, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L, suburb#315, postcodes#316, long#247, lat#248, int_sa2#566, SA2_code#608L, SA2_name#609, income_2018-2019#610L, total_males#611L, ... 10 more fields]
         :           :           :                                      +- Join Inner, (merchant_abn#31L = tagged_merchant_abn#1564L)
         :           :           :                                         :- SubqueryAlias join
         :           :           :                                         :  +- View (`join`, [merchant_name#193,merchant_abn#31L,categories#170,take_rate#175,revenue_levels#181,name#17,address#18,state#19,gender#21,trans_merchant_abn#214L,dollar_value#41,order_id#42,order_datetime#43,user_id#35L,consumer_id#36L,suburb#315,postcodes#316,long#247,lat#248,int_sa2#566,SA2_code#608L,SA2_name#609,income_2018-2019#610L,total_males#611L,total_females#612L,total_persons#613L,state_code#614L,state_name#615,population_2020#616L,population_2021#617L,fraud_probability_consumer#1080,fraud_probability_merchant#1081])
         :           :           :                                         :     +- Filter isnotnull(gender#21)
         :           :           :                                         :        +- Filter isnotnull(consumer_id#36L)
         :           :           :                                         :           +- Filter (dollar_value#41 > cast(0 as double))
         :           :           :                                         :              +- Filter isnotnull(merchant_abn#31L)
         :           :           :                                         :                 +- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, gender#21, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L, suburb#315, postcodes#316, long#247, lat#248, int_sa2#566, SA2_code#608L, SA2_name#609, income_2018-2019#610L, total_males#611L, ... 8 more fields]
         :           :           :                                         :                    +- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, gender#21, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L, suburb#315, postcodes#316, long#247, lat#248, int_sa2#566, SA2_code#608L, SA2_name#609, income_2018-2019#610L, total_males#611L, ... 8 more fields]
         :           :           :                                         :                       +- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, gender#21, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L, suburb#315, postcodes#316, long#247, lat#248, int_sa2#566, SA2_code#608L, SA2_name#609, income_2018-2019#610L, total_males#611L, ... 8 more fields]
         :           :           :                                         :                          +- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, gender#21, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L, suburb#315, postcodes#316, long#247, lat#248, int_sa2#566, SA2_code#608L, SA2_name#609, income_2018-2019#610L, total_males#611L, ... 8 more fields]
         :           :           :                                         :                             +- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, gender#21, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L, suburb#315, postcodes#316, long#247, lat#248, int_sa2#566, SA2_code#608L, SA2_name#609, income_2018-2019#610L, total_males#611L, ... 8 more fields]
         :           :           :                                         :                                +- Join LeftOuter, ((merchant_abn#31L = cast(merchant_abn#119 as bigint)) AND (order_datetime#43 = cast(order_datetime#120 as date)))
         :           :           :                                         :                                   :- SubqueryAlias a
         :           :           :                                         :                                   :  +- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, gender#21, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L, suburb#315, postcodes#316, long#247, lat#248, int_sa2#566, SA2_code#608L, SA2_name#609, income_2018-2019#610L, total_males#611L, ... 7 more fields]
         :           :           :                                         :                                   :     +- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, gender#21, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L, suburb#315, postcodes#316, long#247, lat#248, int_sa2#566, SA2_code#608L, SA2_name#609, income_2018-2019#610L, total_males#611L, ... 7 more fields]
         :           :           :                                         :                                   :        +- Join LeftOuter, ((user_id#35L = cast(user_id#96 as bigint)) AND (order_datetime#43 = cast(order_datetime#97 as date)))
         :           :           :                                         :                                   :           :- SubqueryAlias a
         :           :           :                                         :                                   :           :  +- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, gender#21, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L, suburb#315, postcodes#316, long#247, lat#248, int_sa2#566, SA2_code#608L, SA2_name#609, income_2018-2019#610L, total_males#611L, ... 6 more fields]
         :           :           :                                         :                                   :           :     +- Join Inner, (cast(int_sa2#566 as bigint) = SA2_code#608L)
         :           :           :                                         :                                   :           :        :- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, gender#21, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L, suburb#315, postcodes#316, long#247, lat#248, int_sa2#566]
         :           :           :                                         :                                   :           :        :  +- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, gender#21, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L, suburb#315, postcodes#316, sa2#317, long#247, lat#248, cast(sa2#317 as int) AS int_sa2#566]
         :           :           :                                         :                                   :           :        :     +- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, gender#21, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L, suburb#315, postcodes#316, sa2#317, long#247, lat#248]
         :           :           :                                         :                                   :           :        :        +- Join Inner, (postcode#20 = postcodes#316)
         :           :           :                                         :                                   :           :        :           :- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, postcode#20, gender#21, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L]
         :           :           :                                         :                                   :           :        :           :  +- Project [merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181, name#17, address#18, state#19, postcode#20, gender#21, trans_user_id#220L, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43, user_id#35L, consumer_id#36L]
         :           :           :                                         :                                   :           :        :           :     +- Join FullOuter, (merchant_abn#31L = trans_merchant_abn#214L)
         :           :           :                                         :                                   :           :        :           :        :- Project [name#29 AS merchant_name#193, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181]
         :           :           :                                         :                                   :           :        :           :        :  +- Project [name#29, merchant_abn#31L, categories#170, take_rate#175, revenue_levels#181]
         :           :           :                                         :                                   :           :        :           :        :     +- Project [name#29, tags#166, merchant_abn#31L, categories#170, take_rate#175, split(tags#166, [)\]], [\[(], -1)[1] AS revenue_levels#181]
         :           :           :                                         :                                   :           :        :           :        :        +- Project [name#29, tags#166, merchant_abn#31L, categories#170, cast(split(tags#166, [)\]], [\[(]take rate: , -1)[1] as double) AS take_rate#175]
         :           :           :                                         :                                   :           :        :           :        :           +- Project [name#29, tags#166, merchant_abn#31L, split(tags#166, [)\]], [\[(], -1)[0] AS categories#170]
         :           :           :                                         :                                   :           :        :           :        :              +- Project [name#29, regexp_replace(tags#161, [\[(][\[(], , 1) AS tags#166, merchant_abn#31L]
         :           :           :                                         :                                   :           :        :           :        :                 +- Project [name#29, regexp_replace(tags#30, [\])][\])], , 1) AS tags#161, merchant_abn#31L]
         :           :           :                                         :                                   :           :        :           :        :                    +- Relation [name#29,tags#30,merchant_abn#31L] parquet
         :           :           :                                         :                                   :           :        :           :        +- Join Inner, (int_consumer_id#199L = consumer_id#36L)
         :           :           :                                         :                                   :           :        :           :           :- Project [name#17, address#18, state#19, postcode#20, gender#21, int_consumer_id#199L]
         :           :           :                                         :                                   :           :        :           :           :  +- Project [name#17, address#18, state#19, postcode#20, gender#21, consumer_id#22, cast(consumer_id#22 as bigint) AS int_consumer_id#199L]
         :           :           :                                         :                                   :           :        :           :           :     +- Relation [name#17,address#18,state#19,postcode#20,gender#21,consumer_id#22] csv
         :           :           :                                         :                                   :           :        :           :           +- Join Inner, (trans_user_id#220L = user_id#35L)
         :           :           :                                         :                                   :           :        :           :              :- Project [user_id#39L AS trans_user_id#220L, trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43]
         :           :           :                                         :                                   :           :        :           :              :  +- Project [user_id#39L, merchant_abn#40L AS trans_merchant_abn#214L, dollar_value#41, order_id#42, order_datetime#43]
         :           :           :                                         :                                   :           :        :           :              :     +- Sort [order_datetime#43 ASC NULLS FIRST], true
         :           :           :                                         :                                   :           :        :           :              :        +- Deduplicate [order_id#42, order_datetime#43, user_id#39L, dollar_value#41, merchant_abn#40L]
         :           :           :                                         :                                   :           :        :           :              :           +- Union false, false
         :           :           :                                         :                                   :           :        :           :              :              :- Deduplicate [order_id#42, order_datetime#43, user_id#39L, dollar_value#41, merchant_abn#40L]
         :           :           :                                         :                                   :           :        :           :              :              :  +- Union false, false
         :           :           :                                         :                                   :           :        :           :              :              :     :- Relation [user_id#39L,merchant_abn#40L,dollar_value#41,order_id#42,order_datetime#43] parquet
         :           :           :                                         :                                   :           :        :           :              :              :     +- Relation [user_id#49L,merchant_abn#50L,dollar_value#51,order_id#52,order_datetime#53] parquet
         :           :           :                                         :                                   :           :        :           :              :              +- Relation [user_id#59L,merchant_abn#60L,dollar_value#61,order_id#62,order_datetime#63] parquet
         :           :           :                                         :                                   :           :        :           :              +- Relation [user_id#35L,consumer_id#36L] parquet
         :           :           :                                         :                                   :           :        :           +- Deduplicate [postcodes#316]
         :           :           :                                         :                                   :           :        :              +- Filter atleastnnonnulls(5, suburb#315, postcodes#316, sa2#317, long#247, lat#248)
         :           :           :                                         :                                   :           :        :                 +- Project [locality#245 AS suburb#315, postcode#244 AS postcodes#316, SA2_MAINCODE_2016#261 AS sa2#317, long#247, lat#248]
         :           :           :                                         :                                   :           :        :                    +- Relation [id#243,postcode#244,locality#245,state#246,long#247,lat#248,dc#249,type#250,status#251,sa3#252,sa3name#253,sa4#254,sa4name#255,region#256,Lat_precise#257,Long_precise#258,SA1_MAINCODE_2011#259,SA1_MAINCODE_2016#260,SA2_MAINCODE_2016#261,SA2_NAME_2016#262,SA3_CODE_2016#263,SA3_NAME_2016#264,SA4_CODE_2016#265,SA4_NAME_2016#266,... 12 more fields] csv
         :           :           :                                         :                                   :           :        +- LogicalRDD [SA2_code#608L, SA2_name#609, income_2018-2019#610L, total_males#611L, total_females#612L, total_persons#613L, state_code#614L, state_name#615, population_2020#616L, population_2021#617L], false
         :           :           :                                         :                                   :           +- SubqueryAlias b
         :           :           :                                         :                                   :              +- Relation [user_id#96,order_datetime#97,fraud_probability#98] csv
         :           :           :                                         :                                   +- SubqueryAlias b
         :           :           :                                         :                                      +- Relation [merchant_abn#119,order_datetime#120,fraud_probability#121] csv
         :           :           :                                         +- SubqueryAlias tagged
         :           :           :                                            +- View (`tagged`, [tagged_merchant_abn#1564L,category#1561])
         :           :           :                                               +- Project [merchant_abn#1560L AS tagged_merchant_abn#1564L, category#1561]
         :           :           :                                                  +- Relation [merchant_abn#1560L,category#1561] parquet
         :           :           +- SubqueryAlias m
         :           :              +- View (`m`, [m_name#2486,males#2487L])
         :           :                 +- Aggregate [merchant_name#193, SA2_code#2915L, Year#1749, Month#1785], [concat(merchant_name#193, cast(SA2_code#2915L as string), cast(Year#1749 as string), cast(Month#1785 as string)) AS m_name#2486, count(gender#2860) AS males#2487L]
         :           :                    +- Filter (gender#2860 = Male)
         :           :                       +- SubqueryAlias agg
         :           :                          +- View (`agg`, [merchant_name#193,take_rate#175,revenue_levels#181,state#2858,gender#2860,dollar_value#2864,user_id#2877L,suburb#315,postcodes#316,long#2883,lat#2884,SA2_code#2915L,income_2018-2019#2917L,total_males#2918L,total_females#2919L,total_persons#2920L,fraud_probability_consumer#1080,fraud_probability_merchant#1081,category#2932,percent#1714,Year#1749,Month#1785])
         :           :                             +- Project [merchant_name#193, take_rate#175, revenue_levels#181, state#2858, gender#2860, dollar_value#2864, user_id#2877L, suburb#315, postcodes#316, long#2883, lat#2884, SA2_code#2915L, income_2018-2019#2917L, total_males#2918L, total_females#2919L, total_persons#2920L, fraud_probability_consumer#1080, fraud_probability_merchant#1081, category#2932, percent#1714, Year#1749, Month#1785]
         :           :                                +- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, gender#2860, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L, suburb#315, postcodes#316, long#2883, lat#2884, int_sa2#566, SA2_code#2915L, SA2_name#2916, income_2018-2019#2917L, total_males#2918L, ... 12 more fields]
         :           :                                   +- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, gender#2860, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L, suburb#315, postcodes#316, long#2883, lat#2884, int_sa2#566, SA2_code#2915L, SA2_name#2916, income_2018-2019#2917L, total_males#2918L, ... 11 more fields]
         :           :                                      +- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, gender#2860, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L, suburb#315, postcodes#316, long#2883, lat#2884, int_sa2#566, SA2_code#2915L, SA2_name#2916, income_2018-2019#2917L, total_males#2918L, ... 10 more fields]
         :           :                                         +- SubqueryAlias group
         :           :                                            +- View (`group`, [merchant_name#193,merchant_abn#2855L,categories#170,take_rate#175,revenue_levels#181,name#2856,address#2857,state#2858,gender#2860,trans_merchant_abn#214L,dollar_value#2864,order_id#2865,order_datetime#2866,user_id#2877L,consumer_id#2878L,suburb#315,postcodes#316,long#2883,lat#2884,int_sa2#566,SA2_code#2915L,SA2_name#2916,income_2018-2019#2917L,total_males#2918L,total_females#2919L,total_persons#2920L,state_code#2921L,state_name#2922,population_2020#2923L,population_2021#2924L,fraud_probability_consumer#1080,fraud_probability_merchant#1081,category#2932])
         :           :                                               +- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, gender#2860, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L, suburb#315, postcodes#316, long#2883, lat#2884, int_sa2#566, SA2_code#2915L, SA2_name#2916, income_2018-2019#2917L, total_males#2918L, ... 9 more fields]
         :           :                                                  +- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, gender#2860, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L, suburb#315, postcodes#316, long#2883, lat#2884, int_sa2#566, SA2_code#2915L, SA2_name#2916, income_2018-2019#2917L, total_males#2918L, ... 10 more fields]
         :           :                                                     +- Join Inner, (merchant_abn#2855L = tagged_merchant_abn#1564L)
         :           :                                                        :- SubqueryAlias join
         :           :                                                        :  +- View (`join`, [merchant_name#193,merchant_abn#2855L,categories#170,take_rate#175,revenue_levels#181,name#2856,address#2857,state#2858,gender#2860,trans_merchant_abn#214L,dollar_value#2864,order_id#2865,order_datetime#2866,user_id#2877L,consumer_id#2878L,suburb#315,postcodes#316,long#2883,lat#2884,int_sa2#566,SA2_code#2915L,SA2_name#2916,income_2018-2019#2917L,total_males#2918L,total_females#2919L,total_persons#2920L,state_code#2921L,state_name#2922,population_2020#2923L,population_2021#2924L,fraud_probability_consumer#1080,fraud_probability_merchant#1081])
         :           :                                                        :     +- Filter isnotnull(gender#2860)
         :           :                                                        :        +- Filter isnotnull(consumer_id#2878L)
         :           :                                                        :           +- Filter (dollar_value#2864 > cast(0 as double))
         :           :                                                        :              +- Filter isnotnull(merchant_abn#2855L)
         :           :                                                        :                 +- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, gender#2860, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L, suburb#315, postcodes#316, long#2883, lat#2884, int_sa2#566, SA2_code#2915L, SA2_name#2916, income_2018-2019#2917L, total_males#2918L, ... 8 more fields]
         :           :                                                        :                    +- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, gender#2860, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L, suburb#315, postcodes#316, long#2883, lat#2884, int_sa2#566, SA2_code#2915L, SA2_name#2916, income_2018-2019#2917L, total_males#2918L, ... 8 more fields]
         :           :                                                        :                       +- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, gender#2860, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L, suburb#315, postcodes#316, long#2883, lat#2884, int_sa2#566, SA2_code#2915L, SA2_name#2916, income_2018-2019#2917L, total_males#2918L, ... 8 more fields]
         :           :                                                        :                          +- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, gender#2860, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L, suburb#315, postcodes#316, long#2883, lat#2884, int_sa2#566, SA2_code#2915L, SA2_name#2916, income_2018-2019#2917L, total_males#2918L, ... 8 more fields]
         :           :                                                        :                             +- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, gender#2860, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L, suburb#315, postcodes#316, long#2883, lat#2884, int_sa2#566, SA2_code#2915L, SA2_name#2916, income_2018-2019#2917L, total_males#2918L, ... 8 more fields]
         :           :                                                        :                                +- Join LeftOuter, ((merchant_abn#2855L = cast(merchant_abn#2928 as bigint)) AND (order_datetime#2866 = cast(order_datetime#2929 as date)))
         :           :                                                        :                                   :- SubqueryAlias a
         :           :                                                        :                                   :  +- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, gender#2860, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L, suburb#315, postcodes#316, long#2883, lat#2884, int_sa2#566, SA2_code#2915L, SA2_name#2916, income_2018-2019#2917L, total_males#2918L, ... 7 more fields]
         :           :                                                        :                                   :     +- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, gender#2860, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L, suburb#315, postcodes#316, long#2883, lat#2884, int_sa2#566, SA2_code#2915L, SA2_name#2916, income_2018-2019#2917L, total_males#2918L, ... 7 more fields]
         :           :                                                        :                                   :        +- Join LeftOuter, ((user_id#2877L = cast(user_id#2925 as bigint)) AND (order_datetime#2866 = cast(order_datetime#2926 as date)))
         :           :                                                        :                                   :           :- SubqueryAlias a
         :           :                                                        :                                   :           :  +- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, gender#2860, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L, suburb#315, postcodes#316, long#2883, lat#2884, int_sa2#566, SA2_code#2915L, SA2_name#2916, income_2018-2019#2917L, total_males#2918L, ... 6 more fields]
         :           :                                                        :                                   :           :     +- Join Inner, (cast(int_sa2#566 as bigint) = SA2_code#2915L)
         :           :                                                        :                                   :           :        :- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, gender#2860, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L, suburb#315, postcodes#316, long#2883, lat#2884, int_sa2#566]
         :           :                                                        :                                   :           :        :  +- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, gender#2860, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L, suburb#315, postcodes#316, sa2#317, long#2883, lat#2884, cast(sa2#317 as int) AS int_sa2#566]
         :           :                                                        :                                   :           :        :     +- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, gender#2860, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L, suburb#315, postcodes#316, sa2#317, long#2883, lat#2884]
         :           :                                                        :                                   :           :        :        +- Join Inner, (postcode#2859 = postcodes#316)
         :           :                                                        :                                   :           :        :           :- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, postcode#2859, gender#2860, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L]
         :           :                                                        :                                   :           :        :           :  +- Project [merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181, name#2856, address#2857, state#2858, postcode#2859, gender#2860, trans_user_id#220L, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866, user_id#2877L, consumer_id#2878L]
         :           :                                                        :                                   :           :        :           :     +- Join FullOuter, (merchant_abn#2855L = trans_merchant_abn#214L)
         :           :                                                        :                                   :           :        :           :        :- Project [name#2853 AS merchant_name#193, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181]
         :           :                                                        :                                   :           :        :           :        :  +- Project [name#2853, merchant_abn#2855L, categories#170, take_rate#175, revenue_levels#181]
         :           :                                                        :                                   :           :        :           :        :     +- Project [name#2853, tags#166, merchant_abn#2855L, categories#170, take_rate#175, split(tags#166, [)\]], [\[(], -1)[1] AS revenue_levels#181]
         :           :                                                        :                                   :           :        :           :        :        +- Project [name#2853, tags#166, merchant_abn#2855L, categories#170, cast(split(tags#166, [)\]], [\[(]take rate: , -1)[1] as double) AS take_rate#175]
         :           :                                                        :                                   :           :        :           :        :           +- Project [name#2853, tags#166, merchant_abn#2855L, split(tags#166, [)\]], [\[(], -1)[0] AS categories#170]
         :           :                                                        :                                   :           :        :           :        :              +- Project [name#2853, regexp_replace(tags#161, [\[(][\[(], , 1) AS tags#166, merchant_abn#2855L]
         :           :                                                        :                                   :           :        :           :        :                 +- Project [name#2853, regexp_replace(tags#2854, [\])][\])], , 1) AS tags#161, merchant_abn#2855L]
         :           :                                                        :                                   :           :        :           :        :                    +- Relation [name#2853,tags#2854,merchant_abn#2855L] parquet
         :           :                                                        :                                   :           :        :           :        +- Join Inner, (int_consumer_id#199L = consumer_id#2878L)
         :           :                                                        :                                   :           :        :           :           :- Project [name#2856, address#2857, state#2858, postcode#2859, gender#2860, int_consumer_id#199L]
         :           :                                                        :                                   :           :        :           :           :  +- Project [name#2856, address#2857, state#2858, postcode#2859, gender#2860, consumer_id#2861, cast(consumer_id#2861 as bigint) AS int_consumer_id#199L]
         :           :                                                        :                                   :           :        :           :           :     +- Relation [name#2856,address#2857,state#2858,postcode#2859,gender#2860,consumer_id#2861] csv
         :           :                                                        :                                   :           :        :           :           +- Join Inner, (trans_user_id#220L = user_id#2877L)
         :           :                                                        :                                   :           :        :           :              :- Project [user_id#2862L AS trans_user_id#220L, trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866]
         :           :                                                        :                                   :           :        :           :              :  +- Project [user_id#2862L, merchant_abn#2863L AS trans_merchant_abn#214L, dollar_value#2864, order_id#2865, order_datetime#2866]
         :           :                                                        :                                   :           :        :           :              :     +- Sort [order_datetime#2866 ASC NULLS FIRST], true
         :           :                                                        :                                   :           :        :           :              :        +- Deduplicate [order_id#2865, order_datetime#2866, user_id#2862L, dollar_value#2864, merchant_abn#2863L]
         :           :                                                        :                                   :           :        :           :              :           +- Union false, false
         :           :                                                        :                                   :           :        :           :              :              :- Deduplicate [order_id#2865, order_datetime#2866, user_id#2862L, dollar_value#2864, merchant_abn#2863L]
         :           :                                                        :                                   :           :        :           :              :              :  +- Union false, false
         :           :                                                        :                                   :           :        :           :              :              :     :- Relation [user_id#2862L,merchant_abn#2863L,dollar_value#2864,order_id#2865,order_datetime#2866] parquet
         :           :                                                        :                                   :           :        :           :              :              :     +- Relation [user_id#2867L,merchant_abn#2868L,dollar_value#2869,order_id#2870,order_datetime#2871] parquet
         :           :                                                        :                                   :           :        :           :              :              +- Relation [user_id#2872L,merchant_abn#2873L,dollar_value#2874,order_id#2875,order_datetime#2876] parquet
         :           :                                                        :                                   :           :        :           :              +- Relation [user_id#2877L,consumer_id#2878L] parquet
         :           :                                                        :                                   :           :        :           +- Deduplicate [postcodes#316]
         :           :                                                        :                                   :           :        :              +- Filter atleastnnonnulls(5, suburb#315, postcodes#316, sa2#317, long#2883, lat#2884)
         :           :                                                        :                                   :           :        :                 +- Project [locality#2881 AS suburb#315, postcode#2880 AS postcodes#316, SA2_MAINCODE_2016#2897 AS sa2#317, long#2883, lat#2884]
         :           :                                                        :                                   :           :        :                    +- Relation [id#2879,postcode#2880,locality#2881,state#2882,long#2883,lat#2884,dc#2885,type#2886,status#2887,sa3#2888,sa3name#2889,sa4#2890,sa4name#2891,region#2892,Lat_precise#2893,Long_precise#2894,SA1_MAINCODE_2011#2895,SA1_MAINCODE_2016#2896,SA2_MAINCODE_2016#2897,SA2_NAME_2016#2898,SA3_CODE_2016#2899,SA3_NAME_2016#2900,SA4_CODE_2016#2901,SA4_NAME_2016#2902,... 12 more fields] csv
         :           :                                                        :                                   :           :        +- LogicalRDD [SA2_code#2915L, SA2_name#2916, income_2018-2019#2917L, total_males#2918L, total_females#2919L, total_persons#2920L, state_code#2921L, state_name#2922, population_2020#2923L, population_2021#2924L], false
         :           :                                                        :                                   :           +- SubqueryAlias b
         :           :                                                        :                                   :              +- Relation [user_id#2925,order_datetime#2926,fraud_probability#2927] csv
         :           :                                                        :                                   +- SubqueryAlias b
         :           :                                                        :                                      +- Relation [merchant_abn#2928,order_datetime#2929,fraud_probability#2930] csv
         :           :                                                        +- SubqueryAlias tagged
         :           :                                                           +- View (`tagged`, [tagged_merchant_abn#1564L,category#2932])
         :           :                                                              +- Project [merchant_abn#2931L AS tagged_merchant_abn#1564L, category#2932]
         :           :                                                                 +- Relation [merchant_abn#2931L,category#2932] parquet
         :           +- SubqueryAlias f
         :              +- View (`f`, [f_name#2538,females#2539L])
         :                 +- Aggregate [merchant_name#193, SA2_code#3011L, Year#1749, Month#1785], [concat(merchant_name#193, cast(SA2_code#3011L as string), cast(Year#1749 as string), cast(Month#1785 as string)) AS f_name#2538, count(gender#2956) AS females#2539L]
         :                    +- Filter (gender#2956 = Female)
         :                       +- SubqueryAlias agg
         :                          +- View (`agg`, [merchant_name#193,take_rate#175,revenue_levels#181,state#2954,gender#2956,dollar_value#2960,user_id#2973L,suburb#315,postcodes#316,long#2979,lat#2980,SA2_code#3011L,income_2018-2019#3013L,total_males#3014L,total_females#3015L,total_persons#3016L,fraud_probability_consumer#1080,fraud_probability_merchant#1081,category#3028,percent#1714,Year#1749,Month#1785])
         :                             +- Project [merchant_name#193, take_rate#175, revenue_levels#181, state#2954, gender#2956, dollar_value#2960, user_id#2973L, suburb#315, postcodes#316, long#2979, lat#2980, SA2_code#3011L, income_2018-2019#3013L, total_males#3014L, total_females#3015L, total_persons#3016L, fraud_probability_consumer#1080, fraud_probability_merchant#1081, category#3028, percent#1714, Year#1749, Month#1785]
         :                                +- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, gender#2956, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L, suburb#315, postcodes#316, long#2979, lat#2980, int_sa2#566, SA2_code#3011L, SA2_name#3012, income_2018-2019#3013L, total_males#3014L, ... 12 more fields]
         :                                   +- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, gender#2956, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L, suburb#315, postcodes#316, long#2979, lat#2980, int_sa2#566, SA2_code#3011L, SA2_name#3012, income_2018-2019#3013L, total_males#3014L, ... 11 more fields]
         :                                      +- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, gender#2956, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L, suburb#315, postcodes#316, long#2979, lat#2980, int_sa2#566, SA2_code#3011L, SA2_name#3012, income_2018-2019#3013L, total_males#3014L, ... 10 more fields]
         :                                         +- SubqueryAlias group
         :                                            +- View (`group`, [merchant_name#193,merchant_abn#2951L,categories#170,take_rate#175,revenue_levels#181,name#2952,address#2953,state#2954,gender#2956,trans_merchant_abn#214L,dollar_value#2960,order_id#2961,order_datetime#2962,user_id#2973L,consumer_id#2974L,suburb#315,postcodes#316,long#2979,lat#2980,int_sa2#566,SA2_code#3011L,SA2_name#3012,income_2018-2019#3013L,total_males#3014L,total_females#3015L,total_persons#3016L,state_code#3017L,state_name#3018,population_2020#3019L,population_2021#3020L,fraud_probability_consumer#1080,fraud_probability_merchant#1081,category#3028])
         :                                               +- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, gender#2956, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L, suburb#315, postcodes#316, long#2979, lat#2980, int_sa2#566, SA2_code#3011L, SA2_name#3012, income_2018-2019#3013L, total_males#3014L, ... 9 more fields]
         :                                                  +- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, gender#2956, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L, suburb#315, postcodes#316, long#2979, lat#2980, int_sa2#566, SA2_code#3011L, SA2_name#3012, income_2018-2019#3013L, total_males#3014L, ... 10 more fields]
         :                                                     +- Join Inner, (merchant_abn#2951L = tagged_merchant_abn#1564L)
         :                                                        :- SubqueryAlias join
         :                                                        :  +- View (`join`, [merchant_name#193,merchant_abn#2951L,categories#170,take_rate#175,revenue_levels#181,name#2952,address#2953,state#2954,gender#2956,trans_merchant_abn#214L,dollar_value#2960,order_id#2961,order_datetime#2962,user_id#2973L,consumer_id#2974L,suburb#315,postcodes#316,long#2979,lat#2980,int_sa2#566,SA2_code#3011L,SA2_name#3012,income_2018-2019#3013L,total_males#3014L,total_females#3015L,total_persons#3016L,state_code#3017L,state_name#3018,population_2020#3019L,population_2021#3020L,fraud_probability_consumer#1080,fraud_probability_merchant#1081])
         :                                                        :     +- Filter isnotnull(gender#2956)
         :                                                        :        +- Filter isnotnull(consumer_id#2974L)
         :                                                        :           +- Filter (dollar_value#2960 > cast(0 as double))
         :                                                        :              +- Filter isnotnull(merchant_abn#2951L)
         :                                                        :                 +- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, gender#2956, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L, suburb#315, postcodes#316, long#2979, lat#2980, int_sa2#566, SA2_code#3011L, SA2_name#3012, income_2018-2019#3013L, total_males#3014L, ... 8 more fields]
         :                                                        :                    +- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, gender#2956, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L, suburb#315, postcodes#316, long#2979, lat#2980, int_sa2#566, SA2_code#3011L, SA2_name#3012, income_2018-2019#3013L, total_males#3014L, ... 8 more fields]
         :                                                        :                       +- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, gender#2956, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L, suburb#315, postcodes#316, long#2979, lat#2980, int_sa2#566, SA2_code#3011L, SA2_name#3012, income_2018-2019#3013L, total_males#3014L, ... 8 more fields]
         :                                                        :                          +- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, gender#2956, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L, suburb#315, postcodes#316, long#2979, lat#2980, int_sa2#566, SA2_code#3011L, SA2_name#3012, income_2018-2019#3013L, total_males#3014L, ... 8 more fields]
         :                                                        :                             +- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, gender#2956, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L, suburb#315, postcodes#316, long#2979, lat#2980, int_sa2#566, SA2_code#3011L, SA2_name#3012, income_2018-2019#3013L, total_males#3014L, ... 8 more fields]
         :                                                        :                                +- Join LeftOuter, ((merchant_abn#2951L = cast(merchant_abn#3024 as bigint)) AND (order_datetime#2962 = cast(order_datetime#3025 as date)))
         :                                                        :                                   :- SubqueryAlias a
         :                                                        :                                   :  +- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, gender#2956, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L, suburb#315, postcodes#316, long#2979, lat#2980, int_sa2#566, SA2_code#3011L, SA2_name#3012, income_2018-2019#3013L, total_males#3014L, ... 7 more fields]
         :                                                        :                                   :     +- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, gender#2956, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L, suburb#315, postcodes#316, long#2979, lat#2980, int_sa2#566, SA2_code#3011L, SA2_name#3012, income_2018-2019#3013L, total_males#3014L, ... 7 more fields]
         :                                                        :                                   :        +- Join LeftOuter, ((user_id#2973L = cast(user_id#3021 as bigint)) AND (order_datetime#2962 = cast(order_datetime#3022 as date)))
         :                                                        :                                   :           :- SubqueryAlias a
         :                                                        :                                   :           :  +- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, gender#2956, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L, suburb#315, postcodes#316, long#2979, lat#2980, int_sa2#566, SA2_code#3011L, SA2_name#3012, income_2018-2019#3013L, total_males#3014L, ... 6 more fields]
         :                                                        :                                   :           :     +- Join Inner, (cast(int_sa2#566 as bigint) = SA2_code#3011L)
         :                                                        :                                   :           :        :- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, gender#2956, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L, suburb#315, postcodes#316, long#2979, lat#2980, int_sa2#566]
         :                                                        :                                   :           :        :  +- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, gender#2956, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L, suburb#315, postcodes#316, sa2#317, long#2979, lat#2980, cast(sa2#317 as int) AS int_sa2#566]
         :                                                        :                                   :           :        :     +- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, gender#2956, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L, suburb#315, postcodes#316, sa2#317, long#2979, lat#2980]
         :                                                        :                                   :           :        :        +- Join Inner, (postcode#2955 = postcodes#316)
         :                                                        :                                   :           :        :           :- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, postcode#2955, gender#2956, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L]
         :                                                        :                                   :           :        :           :  +- Project [merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181, name#2952, address#2953, state#2954, postcode#2955, gender#2956, trans_user_id#220L, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962, user_id#2973L, consumer_id#2974L]
         :                                                        :                                   :           :        :           :     +- Join FullOuter, (merchant_abn#2951L = trans_merchant_abn#214L)
         :                                                        :                                   :           :        :           :        :- Project [name#2949 AS merchant_name#193, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181]
         :                                                        :                                   :           :        :           :        :  +- Project [name#2949, merchant_abn#2951L, categories#170, take_rate#175, revenue_levels#181]
         :                                                        :                                   :           :        :           :        :     +- Project [name#2949, tags#166, merchant_abn#2951L, categories#170, take_rate#175, split(tags#166, [)\]], [\[(], -1)[1] AS revenue_levels#181]
         :                                                        :                                   :           :        :           :        :        +- Project [name#2949, tags#166, merchant_abn#2951L, categories#170, cast(split(tags#166, [)\]], [\[(]take rate: , -1)[1] as double) AS take_rate#175]
         :                                                        :                                   :           :        :           :        :           +- Project [name#2949, tags#166, merchant_abn#2951L, split(tags#166, [)\]], [\[(], -1)[0] AS categories#170]
         :                                                        :                                   :           :        :           :        :              +- Project [name#2949, regexp_replace(tags#161, [\[(][\[(], , 1) AS tags#166, merchant_abn#2951L]
         :                                                        :                                   :           :        :           :        :                 +- Project [name#2949, regexp_replace(tags#2950, [\])][\])], , 1) AS tags#161, merchant_abn#2951L]
         :                                                        :                                   :           :        :           :        :                    +- Relation [name#2949,tags#2950,merchant_abn#2951L] parquet
         :                                                        :                                   :           :        :           :        +- Join Inner, (int_consumer_id#199L = consumer_id#2974L)
         :                                                        :                                   :           :        :           :           :- Project [name#2952, address#2953, state#2954, postcode#2955, gender#2956, int_consumer_id#199L]
         :                                                        :                                   :           :        :           :           :  +- Project [name#2952, address#2953, state#2954, postcode#2955, gender#2956, consumer_id#2957, cast(consumer_id#2957 as bigint) AS int_consumer_id#199L]
         :                                                        :                                   :           :        :           :           :     +- Relation [name#2952,address#2953,state#2954,postcode#2955,gender#2956,consumer_id#2957] csv
         :                                                        :                                   :           :        :           :           +- Join Inner, (trans_user_id#220L = user_id#2973L)
         :                                                        :                                   :           :        :           :              :- Project [user_id#2958L AS trans_user_id#220L, trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962]
         :                                                        :                                   :           :        :           :              :  +- Project [user_id#2958L, merchant_abn#2959L AS trans_merchant_abn#214L, dollar_value#2960, order_id#2961, order_datetime#2962]
         :                                                        :                                   :           :        :           :              :     +- Sort [order_datetime#2962 ASC NULLS FIRST], true
         :                                                        :                                   :           :        :           :              :        +- Deduplicate [order_id#2961, order_datetime#2962, user_id#2958L, dollar_value#2960, merchant_abn#2959L]
         :                                                        :                                   :           :        :           :              :           +- Union false, false
         :                                                        :                                   :           :        :           :              :              :- Deduplicate [order_id#2961, order_datetime#2962, user_id#2958L, dollar_value#2960, merchant_abn#2959L]
         :                                                        :                                   :           :        :           :              :              :  +- Union false, false
         :                                                        :                                   :           :        :           :              :              :     :- Relation [user_id#2958L,merchant_abn#2959L,dollar_value#2960,order_id#2961,order_datetime#2962] parquet
         :                                                        :                                   :           :        :           :              :              :     +- Relation [user_id#2963L,merchant_abn#2964L,dollar_value#2965,order_id#2966,order_datetime#2967] parquet
         :                                                        :                                   :           :        :           :              :              +- Relation [user_id#2968L,merchant_abn#2969L,dollar_value#2970,order_id#2971,order_datetime#2972] parquet
         :                                                        :                                   :           :        :           :              +- Relation [user_id#2973L,consumer_id#2974L] parquet
         :                                                        :                                   :           :        :           +- Deduplicate [postcodes#316]
         :                                                        :                                   :           :        :              +- Filter atleastnnonnulls(5, suburb#315, postcodes#316, sa2#317, long#2979, lat#2980)
         :                                                        :                                   :           :        :                 +- Project [locality#2977 AS suburb#315, postcode#2976 AS postcodes#316, SA2_MAINCODE_2016#2993 AS sa2#317, long#2979, lat#2980]
         :                                                        :                                   :           :        :                    +- Relation [id#2975,postcode#2976,locality#2977,state#2978,long#2979,lat#2980,dc#2981,type#2982,status#2983,sa3#2984,sa3name#2985,sa4#2986,sa4name#2987,region#2988,Lat_precise#2989,Long_precise#2990,SA1_MAINCODE_2011#2991,SA1_MAINCODE_2016#2992,SA2_MAINCODE_2016#2993,SA2_NAME_2016#2994,SA3_CODE_2016#2995,SA3_NAME_2016#2996,SA4_CODE_2016#2997,SA4_NAME_2016#2998,... 12 more fields] csv
         :                                                        :                                   :           :        +- LogicalRDD [SA2_code#3011L, SA2_name#3012, income_2018-2019#3013L, total_males#3014L, total_females#3015L, total_persons#3016L, state_code#3017L, state_name#3018, population_2020#3019L, population_2021#3020L], false
         :                                                        :                                   :           +- SubqueryAlias b
         :                                                        :                                   :              +- Relation [user_id#3021,order_datetime#3022,fraud_probability#3023] csv
         :                                                        :                                   +- SubqueryAlias b
         :                                                        :                                      +- Relation [merchant_abn#3024,order_datetime#3025,fraud_probability#3026] csv
         :                                                        +- SubqueryAlias tagged
         :                                                           +- View (`tagged`, [tagged_merchant_abn#1564L,category#3028])
         :                                                              +- Project [merchant_abn#3027L AS tagged_merchant_abn#1564L, category#3028]
         :                                                                 +- Relation [merchant_abn#3027L,category#3028] parquet
         +- SubqueryAlias rates
            +- View (`rates`, [drop_name#3840,take_rate#3842,revenue_levels#3844,category#3846,males_in_SA2#3848L,females_in_SA2#3850L,income_per_person#3852])
               +- Aggregate [merchant_name#193], [merchant_name#193 AS drop_name#3840, first(take_rate#175, false) AS take_rate#3842, first(revenue_levels#181, false) AS revenue_levels#3844, first(category#4096, false) AS category#3846, first(total_males#4082L, false) AS males_in_SA2#3848L, first(total_females#4083L, false) AS females_in_SA2#3850L, first(income_per_persons#3652, false) AS income_per_person#3852]
                  +- SubqueryAlias features
                     +- View (`features`, [merchant_name#193,take_rate#175,revenue_levels#181,state#4022,gender#4024,dollar_value#4028,user_id#4041L,suburb#315,postcodes#316,long#4047,lat#4048,SA2_code#4079L,income_2018_2019#3629L,total_males#4082L,total_females#4083L,total_persons#4084L,fraud_probability_consumer#1080,fraud_probability_merchant#1081,category#4096,percent#1714,Year#1749,Month#1785,income_per_persons#3652])
                        +- Project [merchant_name#193, take_rate#175, revenue_levels#181, state#4022, gender#4024, dollar_value#4028, user_id#4041L, suburb#315, postcodes#316, long#4047, lat#4048, SA2_code#4079L, income_2018_2019#3629L, total_males#4082L, total_females#4083L, total_persons#4084L, fraud_probability_consumer#1080, fraud_probability_merchant#1081, category#4096, percent#1714, Year#1749, Month#1785, (cast(income_2018_2019#3629L as double) / cast(total_persons#4084L as double)) AS income_per_persons#3652]
                           +- Project [merchant_name#193, take_rate#175, revenue_levels#181, state#4022, gender#4024, dollar_value#4028, user_id#4041L, suburb#315, postcodes#316, long#4047, lat#4048, SA2_code#4079L, income_2018-2019#4081L AS income_2018_2019#3629L, total_males#4082L, total_females#4083L, total_persons#4084L, fraud_probability_consumer#1080, fraud_probability_merchant#1081, category#4096, percent#1714, Year#1749, Month#1785]
                              +- Project [merchant_name#193, take_rate#175, revenue_levels#181, state#4022, gender#4024, dollar_value#4028, user_id#4041L, suburb#315, postcodes#316, long#4047, lat#4048, SA2_code#4079L, income_2018-2019#4081L, total_males#4082L, total_females#4083L, total_persons#4084L, fraud_probability_consumer#1080, fraud_probability_merchant#1081, category#4096, percent#1714, Year#1749, Month#1785]
                                 +- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, gender#4024, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L, suburb#315, postcodes#316, long#4047, lat#4048, int_sa2#566, SA2_code#4079L, SA2_name#4080, income_2018-2019#4081L, total_males#4082L, ... 12 more fields]
                                    +- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, gender#4024, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L, suburb#315, postcodes#316, long#4047, lat#4048, int_sa2#566, SA2_code#4079L, SA2_name#4080, income_2018-2019#4081L, total_males#4082L, ... 11 more fields]
                                       +- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, gender#4024, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L, suburb#315, postcodes#316, long#4047, lat#4048, int_sa2#566, SA2_code#4079L, SA2_name#4080, income_2018-2019#4081L, total_males#4082L, ... 10 more fields]
                                          +- SubqueryAlias group
                                             +- View (`group`, [merchant_name#193,merchant_abn#4019L,categories#170,take_rate#175,revenue_levels#181,name#4020,address#4021,state#4022,gender#4024,trans_merchant_abn#214L,dollar_value#4028,order_id#4029,order_datetime#4030,user_id#4041L,consumer_id#4042L,suburb#315,postcodes#316,long#4047,lat#4048,int_sa2#566,SA2_code#4079L,SA2_name#4080,income_2018-2019#4081L,total_males#4082L,total_females#4083L,total_persons#4084L,state_code#4085L,state_name#4086,population_2020#4087L,population_2021#4088L,fraud_probability_consumer#1080,fraud_probability_merchant#1081,category#4096])
                                                +- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, gender#4024, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L, suburb#315, postcodes#316, long#4047, lat#4048, int_sa2#566, SA2_code#4079L, SA2_name#4080, income_2018-2019#4081L, total_males#4082L, ... 9 more fields]
                                                   +- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, gender#4024, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L, suburb#315, postcodes#316, long#4047, lat#4048, int_sa2#566, SA2_code#4079L, SA2_name#4080, income_2018-2019#4081L, total_males#4082L, ... 10 more fields]
                                                      +- Join Inner, (merchant_abn#4019L = tagged_merchant_abn#1564L)
                                                         :- SubqueryAlias join
                                                         :  +- View (`join`, [merchant_name#193,merchant_abn#4019L,categories#170,take_rate#175,revenue_levels#181,name#4020,address#4021,state#4022,gender#4024,trans_merchant_abn#214L,dollar_value#4028,order_id#4029,order_datetime#4030,user_id#4041L,consumer_id#4042L,suburb#315,postcodes#316,long#4047,lat#4048,int_sa2#566,SA2_code#4079L,SA2_name#4080,income_2018-2019#4081L,total_males#4082L,total_females#4083L,total_persons#4084L,state_code#4085L,state_name#4086,population_2020#4087L,population_2021#4088L,fraud_probability_consumer#1080,fraud_probability_merchant#1081])
                                                         :     +- Filter isnotnull(gender#4024)
                                                         :        +- Filter isnotnull(consumer_id#4042L)
                                                         :           +- Filter (dollar_value#4028 > cast(0 as double))
                                                         :              +- Filter isnotnull(merchant_abn#4019L)
                                                         :                 +- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, gender#4024, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L, suburb#315, postcodes#316, long#4047, lat#4048, int_sa2#566, SA2_code#4079L, SA2_name#4080, income_2018-2019#4081L, total_males#4082L, ... 8 more fields]
                                                         :                    +- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, gender#4024, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L, suburb#315, postcodes#316, long#4047, lat#4048, int_sa2#566, SA2_code#4079L, SA2_name#4080, income_2018-2019#4081L, total_males#4082L, ... 8 more fields]
                                                         :                       +- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, gender#4024, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L, suburb#315, postcodes#316, long#4047, lat#4048, int_sa2#566, SA2_code#4079L, SA2_name#4080, income_2018-2019#4081L, total_males#4082L, ... 8 more fields]
                                                         :                          +- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, gender#4024, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L, suburb#315, postcodes#316, long#4047, lat#4048, int_sa2#566, SA2_code#4079L, SA2_name#4080, income_2018-2019#4081L, total_males#4082L, ... 8 more fields]
                                                         :                             +- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, gender#4024, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L, suburb#315, postcodes#316, long#4047, lat#4048, int_sa2#566, SA2_code#4079L, SA2_name#4080, income_2018-2019#4081L, total_males#4082L, ... 8 more fields]
                                                         :                                +- Join LeftOuter, ((merchant_abn#4019L = cast(merchant_abn#4092 as bigint)) AND (order_datetime#4030 = cast(order_datetime#4093 as date)))
                                                         :                                   :- SubqueryAlias a
                                                         :                                   :  +- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, gender#4024, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L, suburb#315, postcodes#316, long#4047, lat#4048, int_sa2#566, SA2_code#4079L, SA2_name#4080, income_2018-2019#4081L, total_males#4082L, ... 7 more fields]
                                                         :                                   :     +- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, gender#4024, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L, suburb#315, postcodes#316, long#4047, lat#4048, int_sa2#566, SA2_code#4079L, SA2_name#4080, income_2018-2019#4081L, total_males#4082L, ... 7 more fields]
                                                         :                                   :        +- Join LeftOuter, ((user_id#4041L = cast(user_id#4089 as bigint)) AND (order_datetime#4030 = cast(order_datetime#4090 as date)))
                                                         :                                   :           :- SubqueryAlias a
                                                         :                                   :           :  +- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, gender#4024, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L, suburb#315, postcodes#316, long#4047, lat#4048, int_sa2#566, SA2_code#4079L, SA2_name#4080, income_2018-2019#4081L, total_males#4082L, ... 6 more fields]
                                                         :                                   :           :     +- Join Inner, (cast(int_sa2#566 as bigint) = SA2_code#4079L)
                                                         :                                   :           :        :- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, gender#4024, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L, suburb#315, postcodes#316, long#4047, lat#4048, int_sa2#566]
                                                         :                                   :           :        :  +- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, gender#4024, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L, suburb#315, postcodes#316, sa2#317, long#4047, lat#4048, cast(sa2#317 as int) AS int_sa2#566]
                                                         :                                   :           :        :     +- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, gender#4024, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L, suburb#315, postcodes#316, sa2#317, long#4047, lat#4048]
                                                         :                                   :           :        :        +- Join Inner, (postcode#4023 = postcodes#316)
                                                         :                                   :           :        :           :- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, postcode#4023, gender#4024, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L]
                                                         :                                   :           :        :           :  +- Project [merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181, name#4020, address#4021, state#4022, postcode#4023, gender#4024, trans_user_id#220L, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030, user_id#4041L, consumer_id#4042L]
                                                         :                                   :           :        :           :     +- Join FullOuter, (merchant_abn#4019L = trans_merchant_abn#214L)
                                                         :                                   :           :        :           :        :- Project [name#4017 AS merchant_name#193, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181]
                                                         :                                   :           :        :           :        :  +- Project [name#4017, merchant_abn#4019L, categories#170, take_rate#175, revenue_levels#181]
                                                         :                                   :           :        :           :        :     +- Project [name#4017, tags#166, merchant_abn#4019L, categories#170, take_rate#175, split(tags#166, [)\]], [\[(], -1)[1] AS revenue_levels#181]
                                                         :                                   :           :        :           :        :        +- Project [name#4017, tags#166, merchant_abn#4019L, categories#170, cast(split(tags#166, [)\]], [\[(]take rate: , -1)[1] as double) AS take_rate#175]
                                                         :                                   :           :        :           :        :           +- Project [name#4017, tags#166, merchant_abn#4019L, split(tags#166, [)\]], [\[(], -1)[0] AS categories#170]
                                                         :                                   :           :        :           :        :              +- Project [name#4017, regexp_replace(tags#161, [\[(][\[(], , 1) AS tags#166, merchant_abn#4019L]
                                                         :                                   :           :        :           :        :                 +- Project [name#4017, regexp_replace(tags#4018, [\])][\])], , 1) AS tags#161, merchant_abn#4019L]
                                                         :                                   :           :        :           :        :                    +- Relation [name#4017,tags#4018,merchant_abn#4019L] parquet
                                                         :                                   :           :        :           :        +- Join Inner, (int_consumer_id#199L = consumer_id#4042L)
                                                         :                                   :           :        :           :           :- Project [name#4020, address#4021, state#4022, postcode#4023, gender#4024, int_consumer_id#199L]
                                                         :                                   :           :        :           :           :  +- Project [name#4020, address#4021, state#4022, postcode#4023, gender#4024, consumer_id#4025, cast(consumer_id#4025 as bigint) AS int_consumer_id#199L]
                                                         :                                   :           :        :           :           :     +- Relation [name#4020,address#4021,state#4022,postcode#4023,gender#4024,consumer_id#4025] csv
                                                         :                                   :           :        :           :           +- Join Inner, (trans_user_id#220L = user_id#4041L)
                                                         :                                   :           :        :           :              :- Project [user_id#4026L AS trans_user_id#220L, trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030]
                                                         :                                   :           :        :           :              :  +- Project [user_id#4026L, merchant_abn#4027L AS trans_merchant_abn#214L, dollar_value#4028, order_id#4029, order_datetime#4030]
                                                         :                                   :           :        :           :              :     +- Sort [order_datetime#4030 ASC NULLS FIRST], true
                                                         :                                   :           :        :           :              :        +- Deduplicate [order_id#4029, order_datetime#4030, user_id#4026L, dollar_value#4028, merchant_abn#4027L]
                                                         :                                   :           :        :           :              :           +- Union false, false
                                                         :                                   :           :        :           :              :              :- Deduplicate [order_id#4029, order_datetime#4030, user_id#4026L, dollar_value#4028, merchant_abn#4027L]
                                                         :                                   :           :        :           :              :              :  +- Union false, false
                                                         :                                   :           :        :           :              :              :     :- Relation [user_id#4026L,merchant_abn#4027L,dollar_value#4028,order_id#4029,order_datetime#4030] parquet
                                                         :                                   :           :        :           :              :              :     +- Relation [user_id#4031L,merchant_abn#4032L,dollar_value#4033,order_id#4034,order_datetime#4035] parquet
                                                         :                                   :           :        :           :              :              +- Relation [user_id#4036L,merchant_abn#4037L,dollar_value#4038,order_id#4039,order_datetime#4040] parquet
                                                         :                                   :           :        :           :              +- Relation [user_id#4041L,consumer_id#4042L] parquet
                                                         :                                   :           :        :           +- Deduplicate [postcodes#316]
                                                         :                                   :           :        :              +- Filter atleastnnonnulls(5, suburb#315, postcodes#316, sa2#317, long#4047, lat#4048)
                                                         :                                   :           :        :                 +- Project [locality#4045 AS suburb#315, postcode#4044 AS postcodes#316, SA2_MAINCODE_2016#4061 AS sa2#317, long#4047, lat#4048]
                                                         :                                   :           :        :                    +- Relation [id#4043,postcode#4044,locality#4045,state#4046,long#4047,lat#4048,dc#4049,type#4050,status#4051,sa3#4052,sa3name#4053,sa4#4054,sa4name#4055,region#4056,Lat_precise#4057,Long_precise#4058,SA1_MAINCODE_2011#4059,SA1_MAINCODE_2016#4060,SA2_MAINCODE_2016#4061,SA2_NAME_2016#4062,SA3_CODE_2016#4063,SA3_NAME_2016#4064,SA4_CODE_2016#4065,SA4_NAME_2016#4066,... 12 more fields] csv
                                                         :                                   :           :        +- LogicalRDD [SA2_code#4079L, SA2_name#4080, income_2018-2019#4081L, total_males#4082L, total_females#4083L, total_persons#4084L, state_code#4085L, state_name#4086, population_2020#4087L, population_2021#4088L], false
                                                         :                                   :           +- SubqueryAlias b
                                                         :                                   :              +- Relation [user_id#4089,order_datetime#4090,fraud_probability#4091] csv
                                                         :                                   +- SubqueryAlias b
                                                         :                                      +- Relation [merchant_abn#4092,order_datetime#4093,fraud_probability#4094] csv
                                                         +- SubqueryAlias tagged
                                                            +- View (`tagged`, [tagged_merchant_abn#1564L,category#4096])
                                                               +- Project [merchant_abn#4095L AS tagged_merchant_abn#1564L, category#4096]
                                                                  +- Relation [merchant_abn#4095L,category#4096] parquet


In [None]:
# String indexing the categorical columns

indexer = StringIndexer(inputCols = ['merchant_name', 'SA2_code', 'Year', 'Month', 'revenue_levels','category'],
outputCols = ['merchant_name_num', 'SA2_code_num', 'Year_num', 'Month_num', 'revenue_levels_num','category_num'], handleInvalid="keep")

indexd_data = indexer.fit(predicting_data).transform(predicting_data)


# Applying onehot encoding to the categorical data that is string indexed above
encoder = OneHotEncoder(inputCols = ['merchant_name_num', 'SA2_code_num', 'Year_num', 'Month_num', 'revenue_levels_num','category_num'],
outputCols = ['merchant_name_vec', 'SA2_code_vec', 'Year_vec', 'Month_vec', 'revenue_levels_vec','category_vec'])

onehotdata = encoder.fit(indexd_data).transform(indexd_data)


# Assembling the training data as a vector of features 
assembler1 = VectorAssembler(
inputCols=['merchant_name_vec', 'SA2_code_vec', 'Year_vec', 'Month_vec', 'revenue_levels_vec','category_vec','males_in_SA2','females_in_SA2', 'income_per_person', 'no_of_customers','take_rate', 'total_earnings'],
outputCol= "features" )

outdata1 = assembler1.transform(onehotdata)

# Renaming the target column as label

outdata1 = outdata1.withColumnRenamed(
    "future_customers",
    "label"
)


# Assembling the features as a feature vector 

featureIndexer =\
    VectorIndexer(inputCol="features", 
    outputCol="indexedFeatures").fit(outdata1)

outdata1 = featureIndexer.transform(outdata1)

                                                                                

In [None]:
predictions_test = model.transform(outdata1)

In [None]:
predictions_test.show(1)



22/10/05 12:35:11 WARN DAGScheduler: Broadcasting large task binary with size 1196.9 KiB
+--------------------+---------------+---------+----+-----+------------------+-----+-------+---------+--------------+---------+------------+--------------+-----------------+-----+-----------------+------------+--------+---------+------------------+------------+------------------+------------------+-------------+-------------+------------------+-------------+--------------------+--------------------+------------------+
|       merchant_name|no_of_customers| SA2_code|Year|Month|    total_earnings|males|females|take_rate|revenue_levels| category|males_in_SA2|females_in_SA2|income_per_person|label|merchant_name_num|SA2_code_num|Year_num|Month_num|revenue_levels_num|category_num| merchant_name_vec|      SA2_code_vec|     Year_vec|    Month_vec|revenue_levels_vec| category_vec|            features|     indexedFeatures|        prediction|
+--------------------+---------------+---------+----+-----+--------

                                                                                

In [None]:
predictions_test.createOrReplaceTempView("preds")

pred = spark.sql(""" 

SELECT merchant_name, ROUND(SUM(prediction)) AS total_future_customers
FROM preds
GROUP BY merchant_name

""")

pred.limit(5)

                                                                                

merchant_name,total_future_customers
Dictum Mi Incorpo...,8.0
Dictum Mi Limited,214.0
Donec Luctus Indu...,56.0
Elit Sed Consequa...,182.0
Hendrerit Consect...,36.0


In [None]:
pred.count()

                                                                                

1381

In [None]:
pred_df = pred.toPandas()

                                                                                

In [None]:
pred_df.to_csv("../data/curated/customers.csv")