In [16]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Tutorial 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

# Read the given datasets

In [30]:
ori_transaction = spark.read.parquet('../data/tables/transactions_20210228_20210827_snapshot') #read the parquet 
ori_transaction

user_id,merchant_abn,dollar_value,order_id,order_datetime
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20
3,76819856970,448.529684285612,5ace6a24-cdf0-4aa...,2021-08-20
18479,67609108741,86.4040605836911,d0e180f0-cb06-42a...,2021-08-20
3,34096466752,301.5793450525113,6fb1ff48-24bb-4f9...,2021-08-20
18482,70501974849,68.75486276223054,8505fb33-b69a-412...,2021-08-20
4,49891706470,48.89796461900801,ed11e477-b09f-4ae...,2021-08-20


In [29]:
o = ori_transaction.filter(F.col('dollar_value').isNull())
o


user_id,merchant_abn,dollar_value,order_id,order_datetime


In [19]:
consumer_detail = spark.read.parquet('../data/tables/consumer_user_details.parquet')
consumer_detail

user_id,consumer_id
1,1195503
2,179208
3,1194530
4,154128
5,712975
6,407340
7,511685
8,448088
9,650435
10,1058499


In [32]:
c = consumer_detail.filter(F.col('consumer_id').isNull())
u = consumer_detail.filter(F.col('user_id').isNull())
c.show()
u.show()

+-------+-----------+
|user_id|consumer_id|
+-------+-----------+
+-------+-----------+

+-------+-----------+
|user_id|consumer_id|
+-------+-----------+
+-------+-----------+



In [20]:
tbl_merchants = spark.read.parquet('../data/tables/tbl_merchants.parquet')
tbl_merchants

name,tags,merchant_abn
Felis Limited,"((furniture, home...",10023283211
Arcu Ac Orci Corp...,"([cable, satellit...",10142254217
Nunc Sed Company,"([jewelry, watch,...",10165489824
Ultricies Digniss...,"([wAtch, clock, a...",10187291046
Enim Condimentum PC,([music shops - m...,10192359162
Fusce Company,"[(gift, card, nov...",10206519221
Aliquam Enim Inco...,"[(computers, comP...",10255988167
Ipsum Primis Ltd,"[[watch, clock, a...",10264435225
Pede Ultrices Ind...,([computer progra...,10279061213
Nunc Inc.,"[(furniture, home...",10323485998


In [37]:
import pandas as pd
tbl_consumer = spark.read.option("delimiter", "|").option("header",True).csv("../data/tables/tbl_consumer.csv")
tbl_consumer.count()

499999

In [38]:
n = tbl_consumer.na.drop(subset=["name","address","state","postcode","gender","consumer_id"])
n.count()

499999

## Group by the user id and merchant abn, add average, sum and count columns

In [22]:
grouped_transaction = ori_transaction.groupBy('merchant_abn').agg(F.avg('dollar_value').alias('avg'), F.sum('dollar_value').alias('sum'), F.count('dollar_value').alias('count')).sort('merchant_abn')
grouped_transaction.drop(F.col('order_id'))
grouped_transaction

merchant_abn,avg,sum,count
10023283211,208.7505944404296,172219.24041335442,825
10142254217,40.03977013273878,30870.6627723416,771
10187291046,110.32619101999413,9708.704809759483,88
10192359162,451.1432080236007,48272.32325852528,107
10206519221,38.23439165412656,92986.04050283578,2432
10255988167,389.5552654520502,84923.04786854694,218
10264435225,114.09004324213883,145464.80513372703,1275
10279061213,312.3494031465133,40917.77181219324,131
10323485998,128.96235067166586,339042.0199158096,2629
10342410215,378.0169713940928,89968.03919179407,238


In [23]:
grouped_transaction_pd = grouped_transaction.toPandas()
tbl_merchants_pd = tbl_merchants.toPandas()
merchant = pd.merge(grouped_transaction_pd, tbl_merchants_pd)
merchant

Unnamed: 0,merchant_abn,avg,sum,count,name,tags
0,10023283211,208.750594,172219.240413,825,Felis Limited,"((furniture, home furnishings and equipment sh..."
1,10142254217,40.039770,30870.662772,771,Arcu Ac Orci Corporation,"([cable, satellite, and otHer pay television a..."
2,10187291046,110.326191,9708.704810,88,Ultricies Dignissim Lacus Foundation,"([wAtch, clock, and jewelry repair shops], [b]..."
3,10192359162,451.143208,48272.323259,107,Enim Condimentum PC,"([music shops - musical instruments, pianos, a..."
4,10206519221,38.234392,92986.040503,2432,Fusce Company,"[(gift, card, novelty, and souvenir shops), (a..."
...,...,...,...,...,...,...
3976,99938978285,30.029723,132971.614185,4428,Elit Dictum Eu Ltd,"[(opticians, optical goods, and eyeglasses), (..."
3977,99974311662,319.209608,10533.917066,33,Mollis LLP,"((books, periodicals, and newspapers), (b), (t..."
3978,99976658299,150.166184,907904.749365,6046,Sociosqu Corp.,"((shoe shops), (a), (take rate: 6.57))"
3979,99987905597,353.519570,15908.380664,45,Commodo Hendrerit LLC,"[[motor vehicle Supplies and new parts], [a], ..."


In [24]:
for i in range(int(merchant['tags'].count())):
    merchant['tags'].iloc[i] = merchant['tags'].iloc[i].replace(r'[', r'(').replace(r']', r')')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merchant['tags'].iloc[i] = merchant['tags'].iloc[i].replace(r'[', r'(').replace(r']', r')')


In [25]:
#merchant['tags'].str.split(pat, n=-1, expand=False)
# to split into multiple columns by delimiter
merchant_tags = merchant['tags'].str.split(')', expand=True)

In [26]:
for row in range(int(len(merchant_tags))):
    for col in range(3):
        merchant_tags.iloc[row,col] = merchant_tags.iloc[row,col].replace(r'((', r'').replace(r', (', r'').replace(r'take rate:', r'')
merchant_tags.rename(columns = {0 : 'Store type', 1 : 'Revenue levels', 2 : 'Take rate'}, inplace = True)
merchant_tags = merchant_tags[['Store type', 'Revenue levels', 'Take rate']]

In [27]:
merchant[['Store type', 'Revenue levels', 'Take rate']] = merchant_tags[['Store type', 'Revenue levels', 'Take rate']]
merchant.drop(columns=['tags'])

Unnamed: 0,merchant_abn,avg,sum,count,name,Store type,Revenue levels,Take rate
0,10023283211,208.750594,172219.240413,825,Felis Limited,"furniture, home furnishings and equipment shop...",e,0.18
1,10142254217,40.039770,30870.662772,771,Arcu Ac Orci Corporation,"cable, satellite, and otHer pay television and...",b,4.22
2,10187291046,110.326191,9708.704810,88,Ultricies Dignissim Lacus Foundation,"wAtch, clock, and jewelry repair shops",b,3.29
3,10192359162,451.143208,48272.323259,107,Enim Condimentum PC,"music shops - musical instruments, pianos, and...",a,6.33
4,10206519221,38.234392,92986.040503,2432,Fusce Company,"gift, card, novelty, and souvenir shops",a,6.34
...,...,...,...,...,...,...,...,...
3976,99938978285,30.029723,132971.614185,4428,Elit Dictum Eu Ltd,"opticians, optical goods, and eyeglasses",b,4.50
3977,99974311662,319.209608,10533.917066,33,Mollis LLP,"books, periodicals, and newspapers",b,3.17
3978,99976658299,150.166184,907904.749365,6046,Sociosqu Corp.,shoe shops,a,6.57
3979,99987905597,353.519570,15908.380664,45,Commodo Hendrerit LLC,motor vehicle Supplies and new parts,a,6.82


In [28]:
merchant.filter(F.col('avg').isNull()

SyntaxError: unexpected EOF while parsing (119740809.py, line 1)