In [65]:
import pandas as pd
from pyspark.sql import SparkSession, functions as F
import lbl2vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
from pyspark.sql.functions import date_format
import statsmodels.api as sm
from statsmodels.formula.api import ols
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt


In [66]:
# Create a spark session
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

22/10/04 11:15:23 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [67]:
# Read in data from ETL.py file
%run '../scripts/ETL.py' '../scripts/paths.json'
final_join3.limit(5)

22/10/04 11:15:23 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


                                                                                

merchant_name,merchant_abn,categories,take_rate,revenue_levels,name,address,state,gender,trans_merchant_abn,dollar_value,order_id,order_datetime,user_id,consumer_id,postcodes,int_sa2,SA2_code,SA2_name,income_2018-2019,total_males,total_females,total_persons,state_code,state_name,population_2020,population_2021
Egestas Nunc Asso...,11121775571,digital goods: bo...,6.58,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,11121775571,11.28829564583802,2bd2a61d-72e5-42d...,2021-08-20,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Morbi Accumsan In...,19618998054,tent and aWning s...,1.52,c,Christopher Rodri...,30554 Evans Strea...,NSW,Male,19618998054,62.90176609196828,3582b1f8-4577-403...,2021-05-16,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Eu Dolor Egestas PC,94472466107,"cable, satellite,...",6.23,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,94472466107,172.15375126873164,cb05d49f-c2fa-453...,2021-07-22,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Urna Justo Indust...,31472801314,music shops - mus...,6.56,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,31472801314,0.4894787650356477,aeec15c1-67e8-4cb...,2021-05-18,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Eu Sem Pellentesq...,35424691626,"computers, comput...",3.9,b,Christopher Rodri...,30554 Evans Strea...,NSW,Male,35424691626,7.360217018778133,9df473ba-102d-461...,2021-07-04,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694


In [68]:
final_join3.count()

                                                                                

10540181

In [69]:
tagged_merchants = pd.read_csv("../data/curated/tagged_merchants.csv")
tagged_merchants = tagged_merchants.iloc[:,1:]
tagged_merchants.drop(['tags', 'name', 'cleaned_tags', 'store_type'], axis=1, inplace=True)
tagged_merchants.to_parquet("../data/curated/tagged_merchants.parquet")
tagged_merchants_sdf = spark.read.parquet("../data/curated/tagged_merchants.parquet")

In [70]:
tagged_merchants_sdf = tagged_merchants_sdf.withColumnRenamed('merchant_abn',

    'tagged_merchant_abn'
)

In [71]:
tagged_merchants_sdf.show(5)

+-------------------+--------------------+
|tagged_merchant_abn|            category|
+-------------------+--------------------+
|        10023283211|           Furniture|
|        10142254217|         Electronics|
|        10165489824|        Toys and DIY|
|        10187291046|        Toys and DIY|
|        10192359162|Books, Stationary...|
+-------------------+--------------------+
only showing top 5 rows



In [72]:
final_join3.createOrReplaceTempView("join")
tagged_merchants_sdf.createOrReplaceTempView("tagged")

joint = spark.sql(""" 

SELECT *
FROM join
INNER JOIN tagged
ON join.merchant_abn = tagged.tagged_merchant_abn
""")

joint = joint.drop('tagged_merchant_abn')

In [73]:
joint.count()

                                                                                

10109254

In [74]:
joint.createOrReplaceTempView("group")

a = spark.sql(""" 

SELECT *
FROM group
""")

In [75]:
# Extracting the year, month, day from the timestamp

a = a.withColumn("Year", 
date_format('order_datetime', 'yyyy'))

a  = a.withColumn("Month", 
date_format('order_datetime', 'MMMM'))


a = a.withColumn("Day",
date_format(("order_datetime"), "E"))


In [76]:
a.createOrReplaceTempView("agg")

pop_2021 = spark.sql(""" 

SELECT CONCAT(merchant_name, SA2_code, Year, Month, Day) AS pop_name, FIRST(population_2020) AS population
FROM agg
WHERE Year = '2021'
GROUP BY merchant_name, SA2_code, Year, Month, Day
""")

pop_2021.show(5)

pop_2022 = spark.sql(""" 

SELECT CONCAT(merchant_name, SA2_code, Year, Month, Day) AS pop_name, FIRST(population_2021) AS population
FROM agg
WHERE Year = '2022'
GROUP BY merchant_name, SA2_code, Year, Month, Day
""")
pop_2022.show(5)

                                                                                

+--------------------+----------+
|            pop_name|population|
+--------------------+----------+
|Metus Sit Amet In...|      6249|
|Ut Nisi Limited30...|      5126|
|Dolor Dolor Indus...|     21497|
|Ut Molestie Found...|      8321|
|Vivamus Sit LLC30...|      5126|
+--------------------+----------+
only showing top 5 rows





+--------------------+----------+
|            pop_name|population|
+--------------------+----------+
|Suspendisse Non L...|      5114|
|Dignissim Maecena...|      9771|
|Malesuada Vel Con...|      2172|
|Mollis Duis Sit F...|      9980|
|Placerat Eget Ven...|     15239|
+--------------------+----------+
only showing top 5 rows



                                                                                

In [77]:
a = a.drop('merchant_abn', 'categories','name', 'address', 'trans_merchant_abn', 'order_id','order_datetime','user_id','consumer_id','int_sa2',
'SA2_name','state_code','state_name','population_2020', 'population_2021')

In [78]:
 
# Find Count of Null, None, NaN of All DataFrame Columns
from pyspark.sql.functions import col,isnan, when, count
a.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in a.columns]
   ).show()



+-------------+---------+--------------+-----+------+------------+---------+--------+----------------+-----------+-------------+-------------+--------+----+-----+---+
|merchant_name|take_rate|revenue_levels|state|gender|dollar_value|postcodes|SA2_code|income_2018-2019|total_males|total_females|total_persons|category|Year|Month|Day|
+-------------+---------+--------------+-----+------+------------+---------+--------+----------------+-----------+-------------+-------------+--------+----+-----+---+
|            0|        0|             0|    0|     0|           0|        0|       0|               0|          0|            0|            0|       0|   0|    0|  0|
+-------------+---------+--------------+-----+------+------------+---------+--------+----------------+-----------+-------------+-------------+--------+----+-----+---+



                                                                                

In [79]:
a.printSchema()

root
 |-- merchant_name: string (nullable = true)
 |-- take_rate: double (nullable = true)
 |-- revenue_levels: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- postcodes: string (nullable = true)
 |-- SA2_code: long (nullable = true)
 |-- income_2018-2019: long (nullable = true)
 |-- total_males: long (nullable = true)
 |-- total_females: long (nullable = true)
 |-- total_persons: long (nullable = true)
 |-- category: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Day: string (nullable = true)



In [80]:
a.createOrReplaceTempView("agg")

male = spark.sql(""" 

SELECT CONCAT(merchant_name, SA2_code, Year, Month, Day) AS m_name, COUNT(gender) as males
FROM agg
WHERE gender = 'Male'
GROUP BY merchant_name, SA2_code, Year, Month, Day
""")

male.show(5)

female = spark.sql(""" 

SELECT CONCAT(merchant_name, SA2_code, Year, Month, Day) AS f_name, COUNT(gender) as females
FROM agg
WHERE gender = 'Female'
GROUP BY merchant_name, SA2_code, Year, Month, Day
""")
female.show(5)

                                                                                

+--------------------+-----+
|              m_name|males|
+--------------------+-----+
|Pede Nonummy Corp...|    2|
|Suspendisse Dui C...|    1|
|Maecenas Industri...|    1|
|Lorem Ipsum Sodal...|    1|
|Ultricies Digniss...|    1|
+--------------------+-----+
only showing top 5 rows





+--------------------+-------+
|              f_name|females|
+--------------------+-------+
|Eget Company10902...|      2|
|Taciti PC10902117...|      1|
|Est Nunc Consulti...|      1|
|Eget Metus In Cor...|      1|
|Cras Eget Foundat...|      1|
+--------------------+-------+
only showing top 5 rows



                                                                                

In [81]:
a.show(2)



+--------------------+---------+--------------+-----+------+-----------------+---------+---------+----------------+-----------+-------------+-------------+--------------------+----+------+---+
|       merchant_name|take_rate|revenue_levels|state|gender|     dollar_value|postcodes| SA2_code|income_2018-2019|total_males|total_females|total_persons|            category|Year| Month|Day|
+--------------------+---------+--------------+-----+------+-----------------+---------+---------+----------------+-----------+-------------+-------------+--------------------+----+------+---+
|Egestas Nunc Asso...|     6.58|             a|  NSW|  Male|11.28829564583802|     2299|111031231|       242936885|       6412|         6179|        12593|Books, Stationary...|2021|August|Fri|
|Morbi Accumsan In...|     1.52|             c|  NSW|  Male|62.90176609196828|     2299|111031231|       242936885|       6412|         6179|        12593|Books, Stationary...|2021|   May|Sun|
+--------------------+---------+---

                                                                                

In [82]:
a.createOrReplaceTempView("agg")

temp = spark.sql(""" 

SELECT merchant_name, COUNT(merchant_name) AS no_of_transactions, SA2_code, Year, Month, Day, SUM(dollar_value) AS total_income,
    CONCAT(merchant_name, SA2_code, Year, Month, Day) AS join_col
FROM agg
GROUP BY merchant_name, SA2_code, Year, Month, Day
""")

temp.show()


[Stage 2073:>                                                       (0 + 1) / 1]

+--------------------+------------------+---------+----+------+---+------------------+--------------------+
|       merchant_name|no_of_transactions| SA2_code|Year| Month|Day|      total_income|            join_col|
+--------------------+------------------+---------+----+------+---+------------------+--------------------+
|    Euismod Enim LLC|                 1|210021234|2021|August|Fri|23.766022223176805|Euismod Enim LLC2...|
|Tempus Scelerisqu...|                 1|510021267|2021|August|Sun|11.373472011562347|Tempus Scelerisqu...|
|  Aliquam Gravida PC|                 1|315021404|2021|August|Mon| 459.8031699416258|Aliquam Gravida P...|
|  Pede Nonummy Corp.|                 7|315031410|2021|  July|Fri|  143.656026584652|Pede Nonummy Corp...|
|       Vel Institute|                 1|305031120|2021|   May|Fri| 84.56982384549502|Vel Institute3050...|
|Sed Neque Associates|                 1|315031408|2021|  July|Fri| 549.9754130974492|Sed Neque Associa...|
|              Eu LLC|      

                                                                                

In [83]:
pop_2022.limit(5)

                                                                                

pop_name,population
Suspendisse Non L...,5114
Dignissim Maecena...,9771
Malesuada Vel Con...,2172
Mollis Duis Sit F...,9980
Placerat Eget Ven...,15239


In [86]:
temp.createOrReplaceTempView("pop_join")
pop_2021.createOrReplaceTempView("pop2021")

pop2 = spark.sql(""" 

SELECT *
FROM pop_join
INNER JOIN pop2021
ON pop_join.join_col = pop2021.pop_name
WHERE pop_join.Year == '2021'
""")

pop2.limit(5)

                                                                                

merchant_name,no_of_transactions,SA2_code,Year,Month,Day,total_income,join_col,pop_name,population
A Aliquet Ltd,1,111011206,2021,August,Sun,144.92626003254605,A Aliquet Ltd1110...,A Aliquet Ltd1110...,17986
A Aliquet Ltd,1,119011358,2021,March,Fri,170.85311633083847,A Aliquet Ltd1190...,A Aliquet Ltd1190...,17359
A Aliquet Ltd,1,119011572,2021,September,Mon,189.1723204191432,A Aliquet Ltd1190...,A Aliquet Ltd1190...,18209
A Aliquet Ltd,1,119021362,2021,December,Sun,108.62765682595963,A Aliquet Ltd1190...,A Aliquet Ltd1190...,20616
A Aliquet Ltd,1,120031393,2021,March,Tue,313.17032618991084,A Aliquet Ltd1200...,A Aliquet Ltd1200...,17780


In [93]:
temp.createOrReplaceTempView("pop_join")
pop_2022.createOrReplaceTempView("pop2022")

pop3 = spark.sql(""" 

SELECT *
FROM pop_join
INNER JOIN pop2022
ON pop_join.join_col = pop2022.pop_name
WHERE pop_join.Year == '2022'
""")

pop3.limit(5)

                                                                                

merchant_name,no_of_transactions,SA2_code,Year,Month,Day,total_income,join_col,pop_name,population
A Aliquet Ltd,1,111021217,2022,August,Wed,141.8509610912289,A Aliquet Ltd1110...,A Aliquet Ltd1110...,16456
A Aliquet Ltd,1,111031232,2022,May,Mon,105.23461363171435,A Aliquet Ltd1110...,A Aliquet Ltd1110...,8809
A Aliquet Ltd,1,119011572,2022,September,Tue,462.94946210748094,A Aliquet Ltd1190...,A Aliquet Ltd1190...,18500
A Aliquet Ltd,1,121021406,2022,July,Sun,215.2516865063816,A Aliquet Ltd1210...,A Aliquet Ltd1210...,19199
A Aliquet Ltd,1,201011002,2022,May,Fri,345.51374065819294,A Aliquet Ltd2010...,A Aliquet Ltd2010...,12076


In [94]:
total_pop = pop2.union(pop3)

In [95]:
total_pop.limit(3)

                                                                                

merchant_name,no_of_transactions,SA2_code,Year,Month,Day,total_income,join_col,pop_name,population
A Aliquet Ltd,1,111011206,2021,August,Sun,144.92626003254605,A Aliquet Ltd1110...,A Aliquet Ltd1110...,17986
A Aliquet Ltd,1,119011358,2021,March,Fri,170.85311633083847,A Aliquet Ltd1190...,A Aliquet Ltd1190...,17359
A Aliquet Ltd,1,119011572,2021,September,Mon,189.1723204191432,A Aliquet Ltd1190...,A Aliquet Ltd1190...,18209


In [96]:
total_pop.createOrReplaceTempView("check")

g = spark.sql(""" 

SELECT DISTINCT(Year)
FROM check

""")

g.show()



+----+
|Year|
+----+
|2021|
|2022|
+----+



                                                                                

In [97]:
temp.createOrReplaceTempView("gender_join")
male.createOrReplaceTempView("m")
female.createOrReplaceTempView("f")

temp2 = spark.sql(""" 

SELECT *
FROM gender_join
INNER JOIN m
ON gender_join.join_col = m.m_name
""")

temp2.createOrReplaceTempView("temp2")

temp3 = spark.sql(""" 

SELECT *
FROM temp2
INNER JOIN f
ON temp2.join_col = f.f_name
""")

temp3.limit(5)

                                                                                

merchant_name,no_of_transactions,SA2_code,Year,Month,Day,total_income,join_col,m_name,males,f_name,females
A Auctor Non Corp...,2,205031089,2022,August,Wed,165.03443648170074,A Auctor Non Corp...,A Auctor Non Corp...,1,A Auctor Non Corp...,1
A Auctor Non Corp...,2,211051282,2022,October,Wed,121.21960880156465,A Auctor Non Corp...,A Auctor Non Corp...,1,A Auctor Non Corp...,1
A Auctor Non Corp...,2,303041070,2022,May,Mon,47.254634232186206,A Auctor Non Corp...,A Auctor Non Corp...,1,A Auctor Non Corp...,1
A Auctor Non Corp...,3,305011105,2022,January,Sat,353.2219329254256,A Auctor Non Corp...,A Auctor Non Corp...,1,A Auctor Non Corp...,2
A Auctor Non Corp...,2,312021353,2022,August,Tue,122.3548736749258,A Auctor Non Corp...,A Auctor Non Corp...,1,A Auctor Non Corp...,1


In [99]:
total_pop = total_pop.drop('merchant_name', 'no_of_transactions', 'SA2_code', 'Year', 'Month', 'Day', 'total_income', 'pop_name')

In [100]:
temp3.createOrReplaceTempView("join_gender_pop")
total_pop.createOrReplaceTempView("pop_view")

f = spark.sql(""" 

SELECT *
FROM join_gender_pop
INNER JOIN pop_view
ON join_gender_pop.join_col = pop_view.join_col

""")

f.limit(5)

                                                                                

merchant_name,no_of_transactions,SA2_code,Year,Month,Day,total_income,join_col,m_name,males,f_name,females,join_col.1,population
A Auctor Non Corp...,2,205031089,2022,August,Wed,165.03443648170074,A Auctor Non Corp...,A Auctor Non Corp...,1,A Auctor Non Corp...,1,A Auctor Non Corp...,9783
A Auctor Non Corp...,2,211051282,2022,October,Wed,121.21960880156465,A Auctor Non Corp...,A Auctor Non Corp...,1,A Auctor Non Corp...,1,A Auctor Non Corp...,9771
A Auctor Non Corp...,2,303041070,2022,May,Mon,47.254634232186206,A Auctor Non Corp...,A Auctor Non Corp...,1,A Auctor Non Corp...,1,A Auctor Non Corp...,7679
A Auctor Non Corp...,3,305011105,2022,January,Sat,353.2219329254256,A Auctor Non Corp...,A Auctor Non Corp...,1,A Auctor Non Corp...,2,A Auctor Non Corp...,13779
A Auctor Non Corp...,2,312021353,2022,August,Tue,122.3548736749258,A Auctor Non Corp...,A Auctor Non Corp...,1,A Auctor Non Corp...,1,A Auctor Non Corp...,8245


In [101]:
a = a.withColumnRenamed('income_2018-2019',

    'income_2018_2019'    
)

a = a.withColumn('income_per_persons',
    (F.col('income_2018_2019')/F.col('total_persons'))
)


In [78]:
a.show(1)



+--------------------+---------+--------------+-----+------+-----------------+---------+---------+----------------+-----------+-------------+-------------+--------------------+----+------+---+------------------+
|       merchant_name|take_rate|revenue_levels|state|gender|     dollar_value|postcodes| SA2_code|income_2018_2019|total_males|total_females|total_persons|            category|Year| Month|Day|income_per_persons|
+--------------------+---------+--------------+-----+------+-----------------+---------+---------+----------------+-----------+-------------+-------------+--------------------+----+------+---+------------------+
|Egestas Nunc Asso...|     6.58|             a|  NSW|  Male|11.28829564583802|     2299|111031231|       242936885|       6412|         6179|        12593|Books, Stationary...|2021|August|Fri|19291.422615738902|
+--------------------+---------+--------------+-----+------+-----------------+---------+---------+----------------+-----------+-------------+-----------

                                                                                

In [102]:
a.createOrReplaceTempView("features")

e = spark.sql(""" 

SELECT merchant_name AS drop_name, FIRST(take_rate) AS take_rate, FIRST(revenue_levels) AS revenue_levels, FIRST(category) AS category,
    FIRST(total_males) AS males_in_SA2, FIRST(total_females) AS females_in_SA2, FIRST(income_per_persons) AS income_per_person
FROM features
GROUP BY merchant_name
""")

e.show(2)



+---------------+---------+--------------+--------------------+------------+--------------+------------------+
|      drop_name|take_rate|revenue_levels|            category|males_in_SA2|females_in_SA2| income_per_person|
+---------------+---------+--------------+--------------------+------------+--------------+------------------+
|   A Associates|     4.95|             b|Books, Stationary...|        9762|         10846|22526.523772559674|
|A Felis Company|     4.32|             b|Books, Stationary...|        1080|          1051| 33927.61168708765|
+---------------+---------+--------------+--------------------+------------+--------------+------------------+
only showing top 2 rows



                                                                                

In [None]:
f.createOrReplaceTempView("edit")
e.createOrReplaceTempView("rates")

temp5 = spark.sql(""" 

SELECT *
FROM edit
INNER JOIN rates
ON edit.merchant_name = rates.drop_name
""")



In [107]:
train = temp5.drop('m_name', 'f_name', 'drop_name', 'join_col')

train.limit(5)

                                                                                

merchant_name,no_of_transactions,SA2_code,Year,Month,Day,total_income,males,females,population,take_rate,revenue_levels,category,males_in_SA2,females_in_SA2,income_per_person
A Auctor Non Corp...,2,205031089,2022,August,Wed,165.03443648170074,1,1,9783,5.58,a,Furniture,2067,2014,22634.72370679088
A Auctor Non Corp...,2,211051282,2022,October,Wed,121.21960880156465,1,1,9771,5.58,a,Furniture,2067,2014,22634.72370679088
A Auctor Non Corp...,2,303041070,2022,May,Mon,47.254634232186206,1,1,7679,5.58,a,Furniture,2067,2014,22634.72370679088
A Auctor Non Corp...,3,305011105,2022,January,Sat,353.2219329254256,1,2,13779,5.58,a,Furniture,2067,2014,22634.72370679088
A Auctor Non Corp...,2,312021353,2022,August,Tue,122.3548736749258,1,1,8245,5.58,a,Furniture,2067,2014,22634.72370679088


In [112]:
train.createOrReplaceTempView("h")

j = spark.sql(""" 

SELECT DISTINCT(Year)
FROM h

""")

j.show()



+----+
|Year|
+----+
|2022|
+----+



                                                                                

In [108]:
train.printSchema()

root
 |-- merchant_name: string (nullable = true)
 |-- no_of_transactions: long (nullable = false)
 |-- SA2_code: long (nullable = true)
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Day: string (nullable = true)
 |-- total_income: double (nullable = true)
 |-- males: long (nullable = false)
 |-- females: long (nullable = false)
 |-- population: long (nullable = true)
 |-- take_rate: double (nullable = true)
 |-- revenue_levels: string (nullable = true)
 |-- category: string (nullable = true)
 |-- males_in_SA2: long (nullable = true)
 |-- females_in_SA2: long (nullable = true)
 |-- income_per_person: double (nullable = true)



In [110]:
train.count()

                                                                                

254478

In [111]:
# String indexing the categorical columns

indexer = StringIndexer(inputCols = ['merchant_name', 'SA2_code', 'Year', 'Month', 'Day', 'revenue_levels','category'],
outputCols = ['merchant_name_num', 'SA2_code_num', 'Year_num', 'Month_num', 'Day_num', 'revenue_levels_num','category_num'])

indexd_data = indexer.fit(train).transform(train)


# Applying onehot encoding to the categorical data that is string indexed above
encoder = OneHotEncoder(inputCols = ['merchant_name_num', 'SA2_code_num', 'Year_num', 'Month_num', 'Day_num', 'revenue_levels_num','category_num'],
outputCols = ['merchant_name_vec', 'SA2_code_vec', 'Year_vec', 'Month_vec', 'Day_vec', 'revenue_levels_vec','category_vec'])

onehotdata = encoder.fit(indexd_data).transform(indexd_data)


# Assembling the training data as a vector of features 
assembler1 = VectorAssembler(
inputCols=['population','total_income','take_rate', 'merchant_name_vec', 'SA2_code_vec', 'Year_vec', 'Month_vec', 'Day_vec', 'revenue_levels_vec','category_vec','males_in_SA2', 'females_in_SA2', 'income_per_person'],
outputCol= "features" )

outdata1 = assembler1.transform(onehotdata)

                                                                                

IllegalArgumentException: requirement failed: The input column Year_num should have at least two distinct values.

In [None]:
# Renaming the target column as label

outdata1 = outdata1.withColumnRenamed(
    "no_of_transactions",
    "label"
)

In [None]:
# Assembling the features as a feature vector 

featureIndexer =\
    VectorIndexer(inputCol="features", 
    outputCol="indexedFeatures").fit(outdata1)

outdata1 = featureIndexer.transform(outdata1)

[Stage 2126:>(0 + 8) / 14][Stage 2127:>(0 + 0) / 14][Stage 2128:>(0 + 0) / 14]  

22/10/04 09:54:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:54:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:54:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:54:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:54:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:54:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:54:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:54:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

In [86]:
# Split the data into training and validation sets (30% held out for testing)

trainingData, testData = outdata1.randomSplit([0.7, 0.3], seed = 20)

In [87]:
trainingData.count(), testData.count()

[Stage 2170:>(0 + 8) / 14][Stage 2171:>(0 + 0) / 14][Stage 2172:>(0 + 0) / 14]  

22/10/04 09:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:55:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 2170:>(8 + 6) / 14][Stage 2171:>(0 + 2) / 14][Stage 2172:>(0 + 0) / 14]

22/10/04 09:55:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:55:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:55:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:55:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 09:55:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




22/10/04 09:56:08 WARN DAGScheduler: Broadcasting large task binary with size 1309.9 KiB




22/10/04 09:56:53 WARN DAGScheduler: Broadcasting large task binary with size 1309.9 KiB


                                                                                

(355294, 152684)

In [88]:
# Train a RandomForest model.
rf = RandomForestRegressor(featuresCol="indexedFeatures")


# Train model.  
model = rf.fit(trainingData)

# Make predictions.
predictions_validation = model.transform(testData)

[Stage 2317:>                                                       (0 + 8) / 9]

22/10/04 09:57:38 WARN DAGScheduler: Broadcasting large task binary with size 1316.3 KiB


                                                                                

22/10/04 09:57:39 WARN DAGScheduler: Broadcasting large task binary with size 1316.4 KiB


                                                                                

22/10/04 09:57:47 WARN DAGScheduler: Broadcasting large task binary with size 1320.4 KiB


                                                                                

22/10/04 09:57:53 WARN DAGScheduler: Broadcasting large task binary with size 1472.7 KiB


[Stage 2362:>                                                      (0 + 8) / 13]

22/10/04 09:57:57 WARN MemoryStore: Not enough space to cache rdd_5478_2 in memory! (computed 255.3 MiB so far)
22/10/04 09:57:58 WARN MemoryStore: Not enough space to cache rdd_5478_5 in memory! (computed 168.8 MiB so far)
22/10/04 09:57:58 WARN MemoryStore: Not enough space to cache rdd_5478_4 in memory! (computed 74.8 MiB so far)
22/10/04 09:57:58 WARN MemoryStore: Not enough space to cache rdd_5478_1 in memory! (computed 112.5 MiB so far)
22/10/04 09:57:58 WARN MemoryStore: Not enough space to cache rdd_5478_6 in memory! (computed 255.3 MiB so far)
22/10/04 09:57:58 WARN MemoryStore: Not enough space to cache rdd_5478_7 in memory! (computed 168.8 MiB so far)
22/10/04 09:57:58 WARN MemoryStore: Not enough space to cache rdd_5478_3 in memory! (computed 255.3 MiB so far)
22/10/04 09:57:58 WARN BlockManager: Persisting block rdd_5478_4 to disk instead.
22/10/04 09:57:58 WARN BlockManager: Persisting block rdd_5478_2 to disk instead.
22/10/04 09:57:58 WARN BlockManager: Persisting block

                                                                                

22/10/04 09:58:10 WARN DAGScheduler: Broadcasting large task binary with size 1554.2 KiB
22/10/04 09:58:10 WARN MemoryStore: Not enough space to cache rdd_5478_0 in memory! (computed 32.2 MiB so far)
22/10/04 09:58:10 WARN MemoryStore: Not enough space to cache rdd_5478_5 in memory! (computed 32.2 MiB so far)
22/10/04 09:58:10 WARN MemoryStore: Not enough space to cache rdd_5478_7 in memory! (computed 32.2 MiB so far)
22/10/04 09:58:10 WARN MemoryStore: Not enough space to cache rdd_5478_3 in memory! (computed 32.2 MiB so far)
22/10/04 09:58:10 WARN MemoryStore: Not enough space to cache rdd_5478_4 in memory! (computed 49.0 MiB so far)
22/10/04 09:58:10 WARN MemoryStore: Not enough space to cache rdd_5478_6 in memory! (computed 49.0 MiB so far)
22/10/04 09:58:10 WARN MemoryStore: Not enough space to cache rdd_5478_1 in memory! (computed 49.0 MiB so far)


                                                                                

22/10/04 09:58:17 WARN DAGScheduler: Broadcasting large task binary with size 1707.3 KiB
22/10/04 09:58:17 WARN MemoryStore: Not enough space to cache rdd_5478_4 in memory! (computed 32.2 MiB so far)
22/10/04 09:58:17 WARN MemoryStore: Not enough space to cache rdd_5478_5 in memory! (computed 32.2 MiB so far)
22/10/04 09:58:17 WARN MemoryStore: Not enough space to cache rdd_5478_3 in memory! (computed 32.2 MiB so far)
22/10/04 09:58:17 WARN MemoryStore: Not enough space to cache rdd_5478_6 in memory! (computed 49.0 MiB so far)
22/10/04 09:58:17 WARN MemoryStore: Not enough space to cache rdd_5478_7 in memory! (computed 49.0 MiB so far)
22/10/04 09:58:17 WARN MemoryStore: Not enough space to cache rdd_5478_1 in memory! (computed 32.2 MiB so far)
22/10/04 09:58:17 WARN MemoryStore: Not enough space to cache rdd_5478_0 in memory! (computed 49.0 MiB so far)


                                                                                

22/10/04 09:58:24 WARN DAGScheduler: Broadcasting large task binary with size 1963.9 KiB
22/10/04 09:58:24 WARN MemoryStore: Not enough space to cache rdd_5478_1 in memory! (computed 32.2 MiB so far)
22/10/04 09:58:24 WARN MemoryStore: Not enough space to cache rdd_5478_6 in memory! (computed 49.0 MiB so far)
22/10/04 09:58:24 WARN MemoryStore: Not enough space to cache rdd_5478_5 in memory! (computed 49.0 MiB so far)
22/10/04 09:58:24 WARN MemoryStore: Not enough space to cache rdd_5478_4 in memory! (computed 49.0 MiB so far)
22/10/04 09:58:24 WARN MemoryStore: Not enough space to cache rdd_5478_7 in memory! (computed 49.0 MiB so far)
22/10/04 09:58:24 WARN MemoryStore: Not enough space to cache rdd_5478_3 in memory! (computed 32.2 MiB so far)
22/10/04 09:58:25 WARN MemoryStore: Not enough space to cache rdd_5478_0 in memory! (computed 21.2 MiB so far)


                                                                                

22/10/04 09:58:32 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
22/10/04 09:58:32 WARN MemoryStore: Not enough space to cache rdd_5478_6 in memory! (computed 49.0 MiB so far)
22/10/04 09:58:32 WARN MemoryStore: Not enough space to cache rdd_5478_5 in memory! (computed 49.0 MiB so far)
22/10/04 09:58:32 WARN MemoryStore: Not enough space to cache rdd_5478_7 in memory! (computed 49.0 MiB so far)
22/10/04 09:58:32 WARN MemoryStore: Not enough space to cache rdd_5478_0 in memory! (computed 32.2 MiB so far)
22/10/04 09:58:32 WARN MemoryStore: Not enough space to cache rdd_5478_3 in memory! (computed 32.2 MiB so far)
22/10/04 09:58:32 WARN MemoryStore: Not enough space to cache rdd_5478_4 in memory! (computed 32.2 MiB so far)
22/10/04 09:58:32 WARN MemoryStore: Not enough space to cache rdd_5478_1 in memory! (computed 32.2 MiB so far)


                                                                                

In [89]:
# Evaluate the validation set 

predictions_validation.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error

evaluator_train_rmse = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse_train = evaluator_train_rmse.evaluate(predictions_validation)
print("Root Mean Squared Error (RMSE) on train data = %g" % rmse_train)

evaluator_train_mae = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="mae")
mae_train = evaluator_train_mae.evaluate(predictions_validation)
print("Root Mean Squared Error (MAE) on train data = %g" % mae_train)



22/10/04 09:59:16 WARN DAGScheduler: Broadcasting large task binary with size 1324.3 KiB


                                                                                

+-----------------+------------------+--------------------+
|       prediction|             label|            features|
+-----------------+------------------+--------------------+
|228.3845872373052|110.05960880156465|(2835,[0,1,222,17...|
|228.3845872373052| 27.13087366041648|(2835,[0,1,222,17...|
|228.3845872373052| 257.3289665843789|(2835,[0,1,222,17...|
|228.3845872373052|122.97252883408981|(2835,[0,1,222,17...|
| 202.086279904091|417.99304299678636|(2835,[0,1,490,21...|
+-----------------+------------------+--------------------+
only showing top 5 rows



[Stage 2508:>                                                       (0 + 8) / 9]

22/10/04 09:59:54 WARN DAGScheduler: Broadcasting large task binary with size 1321.7 KiB




22/10/04 10:00:02 WARN DAGScheduler: Broadcasting large task binary with size 1322.8 KiB
Root Mean Squared Error (RMSE) on train data = 223.326




22/10/04 10:00:38 WARN DAGScheduler: Broadcasting large task binary with size 1321.6 KiB




22/10/04 10:00:45 WARN DAGScheduler: Broadcasting large task binary with size 1322.7 KiB
Root Mean Squared Error (MAE) on raint data = 120.714


                                                                                