In [44]:
import pandas as pd
from pyspark.sql import SparkSession, functions as F
import lbl2vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

In [45]:
# Create a spark session
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "10g")
    .getOrCreate()
)

22/10/03 22:18:39 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [46]:
# Read in data from ETL.py file
%run '../scripts/ETL.py' '../scripts/paths.json'
final_join3.limit(5)

22/10/03 22:18:39 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


                                                                                

merchant_name,merchant_abn,categories,take_rate,revenue_levels,name,address,state,gender,trans_merchant_abn,dollar_value,order_id,order_datetime,user_id,consumer_id,postcodes,int_sa2,SA2_code,SA2_name,income_2018-2019,total_males,total_females,total_persons,state_code,state_name,population_2020,population_2021
Egestas Nunc Asso...,11121775571,digital goods: bo...,6.58,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,11121775571,11.28829564583802,2bd2a61d-72e5-42d...,2021-08-20,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Morbi Accumsan In...,19618998054,tent and aWning s...,1.52,c,Christopher Rodri...,30554 Evans Strea...,NSW,Male,19618998054,62.90176609196828,3582b1f8-4577-403...,2021-05-16,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Eu Dolor Egestas PC,94472466107,"cable, satellite,...",6.23,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,94472466107,172.15375126873164,cb05d49f-c2fa-453...,2021-07-22,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Urna Justo Indust...,31472801314,music shops - mus...,6.56,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,31472801314,0.4894787650356477,aeec15c1-67e8-4cb...,2021-05-18,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694
Eu Sem Pellentesq...,35424691626,"computers, comput...",3.9,b,Christopher Rodri...,30554 Evans Strea...,NSW,Male,35424691626,7.360217018778133,9df473ba-102d-461...,2021-07-04,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694


In [47]:
final_join3.count()

                                                                                

10540181

In [48]:
tagged_merchants = pd.read_csv("../data/curated/tagged_merchants.csv")

In [49]:
tagged_merchants = tagged_merchants.iloc[:,1:]

In [50]:
tagged_merchants.drop(['tags', 'name', 'cleaned_tags', 'store_type'], axis=1, inplace=True)

In [51]:
tagged_merchants.to_parquet("../data/curated/tagged_merchants.parquet")

In [52]:
tagged_merchants_sdf = spark.read.parquet("../data/curated/tagged_merchants.parquet")

In [53]:
tagged_merchants_sdf = tagged_merchants_sdf.withColumnRenamed('merchant_abn',

    'tagged_merchant_abn'
)

In [54]:
tagged_merchants_sdf.show(5)

+-------------------+--------------------+
|tagged_merchant_abn|            category|
+-------------------+--------------------+
|        10023283211|           Furniture|
|        10142254217|         Electronics|
|        10165489824|        Toys and DIY|
|        10187291046|        Toys and DIY|
|        10192359162|Books, Stationary...|
+-------------------+--------------------+
only showing top 5 rows



In [55]:
final_join3.createOrReplaceTempView("join")
tagged_merchants_sdf.createOrReplaceTempView("tagged")

joint = spark.sql(""" 

SELECT *
FROM join
LEFT JOIN tagged
ON join.merchant_abn = tagged.tagged_merchant_abn
""")



In [43]:
joint = joint.drop('tagged_merchant_abn')

                                                                                

merchant_name,merchant_abn,categories,take_rate,revenue_levels,name,address,state,gender,trans_merchant_abn,dollar_value,order_id,order_datetime,user_id,consumer_id,postcodes,int_sa2,SA2_code,SA2_name,income_2018-2019,total_males,total_females,total_persons,state_code,state_name,population_2020,population_2021,category
Egestas Nunc Asso...,11121775571,digital goods: bo...,6.58,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,11121775571,11.28829564583802,2bd2a61d-72e5-42d...,2021-08-20,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694,"Books, Stationary..."
Morbi Accumsan In...,19618998054,tent and aWning s...,1.52,c,Christopher Rodri...,30554 Evans Strea...,NSW,Male,19618998054,62.90176609196828,3582b1f8-4577-403...,2021-05-16,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694,"Books, Stationary..."
Eu Dolor Egestas PC,94472466107,"cable, satellite,...",6.23,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,94472466107,172.15375126873164,cb05d49f-c2fa-453...,2021-07-22,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694,Electronics
Urna Justo Indust...,31472801314,music shops - mus...,6.56,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,31472801314,0.4894787650356477,aeec15c1-67e8-4cb...,2021-05-18,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694,"Books, Stationary..."
Eu Sem Pellentesq...,35424691626,"computers, comput...",3.9,b,Christopher Rodri...,30554 Evans Strea...,NSW,Male,35424691626,7.360217018778133,9df473ba-102d-461...,2021-07-04,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694,Electronics
Eu Inc.,42355028515,lawn and garden s...,5.97,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,42355028515,76.8823425915479,274dfcce-a369-46c...,2021-03-14,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694,"Beauty, Health, P..."
Vitae Posuere Ind...,27500085887,"watch, clock, and...",4.85,b,Christopher Rodri...,30554 Evans Strea...,NSW,Male,27500085887,116.34204900204628,fc7445b7-3bec-4e4...,2021-04-08,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694,Toys and DIY
At Velit Cras Ass...,93894215354,"opticians, optica...",2.18,c,Christopher Rodri...,30554 Evans Strea...,NSW,Male,93894215354,742.8522684991789,aa4f46ec-abd0-400...,2021-11-27,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694,"Books, Stationary..."
Vestibulum Accums...,86662713230,"watch, clock, aNd...",6.41,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,86662713230,26.15343687898328,d385229b-ba64-416...,2021-11-27,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694,Toys and DIY
Est Nunc Consulting,89726005175,tent and awning s...,6.01,a,Christopher Rodri...,30554 Evans Strea...,NSW,Male,89726005175,25.096011934666333,5b17ed01-73ca-460...,2021-12-03,3698,1175,2299,111031231,111031231,Shortland - Jesmond,242936885,6412,6179,12593,1,New South Wales,12598,12694,"Books, Stationary..."


In [30]:
joint.count()

                                                                                

10540181

In [56]:
joint.createOrReplaceTempView("group")

a = spark.sql(""" 

SELECT *, (dollar_value - take_rate) AS total_earning
FROM group
""")


In [57]:
a.createOrReplaceTempView("group_earnings")

b = spark.sql(""" 

SELECT merchant_name, SUM(total_earning) AS total_revenue, COUNT(merchant_abn) AS no_of_transactions
FROM group_earnings
GROUP BY merchant_name
""")

b.show()



+--------------------+------------------+------------------+
|       merchant_name|     total_revenue|no_of_transactions|
+--------------------+------------------+------------------+
|   Dictum Mi Limited| 897687.9661494899|              9880|
|Volutpat Nulla In...| 836578.8864434501|             10561|
|Mollis Integer Co...|384905.39521080774|             12234|
|Hendrerit Consect...|239274.53116224083|              3010|
|Donec Luctus Indu...|  1063577.03533018|              3113|
|Varius Orci Insti...|147376.27431885863|             19771|
|Elit Sed Consequa...| 5724004.642736054|              9689|
|Dictum Mi Incorpo...| 775536.8639499726|              1236|
|At Augue Corporation|125257.92718248101|              1544|
|      Montes Limited|429378.17366086063|               673|
|     Erat Semper Ltd| 639325.5782926289|              1781|
| Semper Incorporated| 92248.05999937073|               926|
|Porttitor Eros In...|121304.87964357901|              1262|
|   Malesuada Vel Ltd| 1

                                                                                

In [58]:
b.limit(5)

                                                                                

merchant_name,total_revenue,no_of_transactions
Dictum Mi Limited,897687.9661494899,9880
Volutpat Nulla In...,836578.8864434501,10561
Mollis Integer Co...,384905.3952108077,12234
Hendrerit Consect...,239274.53116224083,3010
Donec Luctus Indu...,1063577.03533018,3113
