In [1]:
import calendar

import pandas as pd
import plotly.express as px
from pyspark.sql import functions as F, SparkSession

In [2]:
# Build (or reuse) the notebook's Spark session. The options live in one
# mapping so they can be scanned and tuned in a single place.
_spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "4g",
}

_session_builder = SparkSession.builder.appName("Project 1")
for _option, _setting in _spark_options.items():
    _session_builder = _session_builder.config(_option, _setting)

spark = _session_builder.getOrCreate()

22/09/07 13:46:34 WARN Utils: Your hostname, DESKTOP-3NQ3PQI resolves to a loopback address: 127.0.1.1; using 172.24.55.147 instead (on interface eth0)
22/09/07 13:46:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/07 13:46:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### consumers

In [None]:
# Consumer tables: the parquet file maps user_id to consumer_id, and the
# pipe-delimited CSV holds the per-consumer details.
consumer_ids_sdf = spark.read.parquet(
    '../data/tables/consumer_user_details.parquet'
)
consumer_details_df = pd.read_csv(
    '../data/tables/tbl_consumer.csv',
    sep="|",
)

# Collect the Spark frame into pandas for quick inspection.
consumer_ids_df = consumer_ids_sdf.toPandas()

In [7]:
# Peek at the first row of the user_id -> consumer_id mapping.
consumer_ids_df.head(n=1)

Unnamed: 0,user_id,consumer_id
0,1,1195503


In [8]:
# Peek at the first row of the consumer details table.
consumer_details_df.head(n=1)

Unnamed: 0,name,address,state,postcode,gender,consumer_id
0,Yolanda Williams,413 Haney Gardens Apt. 742,WA,6935,Female,1195503


Each consumer ID corresponds to a unique consumer. There are 499,999 unique consumers across the dataset, with:
- 8 possible states
- 3167 possible postcodes across Australia
- 3 possible genders

In [9]:
# Row count first, then the number of distinct values in each column.
print(consumer_details_df.shape[0])
consumer_details_df.nunique(axis=0)

499999


name           221377
address        499955
state               8
postcode         3167
gender              3
consumer_id    499999
dtype: int64

### merchants

In [None]:
# Read the merchant table with Spark, then collect it into pandas.
merchants_sdf = spark.read.parquet(
    '../data/tables/tbl_merchants.parquet'
)
merchants_df = merchants_sdf.toPandas()

In [10]:
# Row count, followed by the first merchant record.
print(merchants_df.shape[0])
merchants_df.head(n=1)

4026


Unnamed: 0,name,tags,merchant_abn
0,Felis Limited,"((furniture, home furnishings and equipment sh...",10023283211


Each merchant_abn corresponds to a unique merchant. There are 4026 unique merchants, each with a unique name.

In [11]:
# Row count, followed by the number of distinct values in each column.
print(merchants_df.shape[0])
merchants_df.nunique(axis=0)

4026


name            4026
tags            3954
merchant_abn    4026
dtype: int64

### transactions

Aggregated analysis

In [55]:
# Load the first transaction snapshot, then append the rows of the second one.
transactions_sdf = spark.read.parquet(
    '../data/tables/transactions_20210228_20210827_snapshot/'
)
transactions_sdf = transactions_sdf.union(
    spark.read.parquet('../data/tables/transactions_20210828_20220227_snapshot/')
)

                                                                                

In [56]:
# Show a single transaction row to inspect the available columns.
transactions_sdf.limit(1)

                                                                                

user_id,merchant_abn,dollar_value,order_id,order_datetime
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20


In [57]:
# Per-merchant totals over every loaded transaction (no date filter is applied):
# the summed dollar_value and the count of non-null order_id values.
merchant_totals_sdf = (
    transactions_sdf
    .groupBy('merchant_abn')
    .agg(
        F.sum('dollar_value').alias('total_revenue'),
        F.count('order_id').alias('num_payments'),
    )
)

Revenue is determined by a combination of the size of payments (how much a customer normally spends) and the volume of payments (how many payments customers make).<br><br>

In [58]:
# The five merchants with the largest total revenue.
merchant_totals_sdf.orderBy(F.desc('total_revenue')).limit(5)

                                                                                

merchant_abn,total_revenue,num_payments
79827781481,5654334.880610274,2795
27093785141,5651523.987263663,14947
39649557865,5644907.5891390415,12566
32709545238,5641259.252001086,7506
96680767841,5581215.628938407,17787


In [31]:
# The five merchants with the most payments.
merchant_totals_sdf.orderBy(F.desc('num_payments')).limit(5)

                                                                                

merchant_abn,total_revenue,num_payments
24852446429,2227810.051741692,74153
86578477987,2451541.346494566,69812
64203420245,1930641.260570329,66927
49891706470,1840481.093793815,63590
46804135891,1800530.091160312,59983


In [50]:
# The five single transactions with the highest dollar value.
transactions_sdf.orderBy(F.desc('dollar_value')).limit(5)

                                                                                

user_id,merchant_abn,dollar_value,order_id,order_datetime
3101,91880575299,105193.88578925544,2ab65c8f-11b2-41c...,2021-04-17
11702,53918538787,77134.2146046811,8584d8d1-061b-41c...,2021-07-27
23700,11590404675,65660.03308418578,1d8f2322-d8c8-46c...,2021-08-26
18916,83199298021,59223.92657075801,61fe4fa3-f199-4a1...,2021-08-08
10447,82999039227,58176.53006300072,785f5625-5b2e-4f2...,2021-07-12


Time series analysis

In general, revenue increased steadily across 2021.

In [59]:
# Daily revenue: total dollar_value per order_datetime, sorted by date, with the
# day of week (Spark convention: 1 = Sunday ... 7 = Saturday) attached to each row.
total_sdf = (
    transactions_sdf
    .groupBy('order_datetime')
    .agg(F.sum('dollar_value').alias('total_revenue'))
    .sort('order_datetime')
    .withColumn(
        'dayofweek',
        # 'MM' is month-of-year; lowercase 'mm' means minute-of-hour in Spark's
        # datetime patterns and would not parse the month.
        F.dayofweek(F.to_date(F.col('order_datetime'), "yyyy-MM-dd")),
    )
)

# Collect once and plot the collected frame, so the Spark job is not run twice.
total_df = total_sdf.toPandas()
px.line(total_df, x = 'order_datetime', y = 'total_revenue')

                                                                                

In [62]:
# Weekly revenue: total dollar_value per week-of-year number.
# Stored under its own names so the daily `total_sdf` / `total_df` are not
# overwritten by a frame that has no order_datetime column.
# NOTE(review): the snapshot paths suggest the data spans 2021-02-28 to 2022-02-27;
# if so, equal week numbers from 2021 and 2022 are summed together — confirm
# whether grouping by year as well is wanted.
weekly_total_sdf = (
    transactions_sdf
    .withColumn(
        'weekofyear',
        # 'MM' is month-of-year; lowercase 'mm' means minute-of-hour in Spark's
        # datetime patterns and would not parse the month.
        F.weekofyear(F.to_date(F.col('order_datetime'), "yyyy-MM-dd")),
    )
    .groupBy('weekofyear')
    .agg(F.sum('dollar_value').alias('total_revenue'))
    .sort('weekofyear')
)

# Collect once and plot the collected frame, so the Spark job is not run twice.
weekly_total_df = weekly_total_sdf.toPandas()
px.line(weekly_total_df, x = 'weekofyear', y = 'total_revenue')

                                                                                

In [None]:
# Revenue per day of week (Spark convention: 1 = Sunday ... 7 = Saturday).
# Aggregated straight from the transactions so this cell does not depend on
# which frame `total_sdf` currently refers to.
weekday_total_sdf = (
    transactions_sdf
    .withColumn(
        'dayofweek',
        # 'MM' is month-of-year; lowercase 'mm' means minute-of-hour in Spark's
        # datetime patterns and would not parse the month.
        F.dayofweek(F.to_date(F.col('order_datetime'), "yyyy-MM-dd")),
    )
    .groupBy('dayofweek')
    .agg(F.sum('dollar_value').alias('total_revenue'))
    .sort('dayofweek')
)


In [47]:
# Collect the weekday totals and replace Spark's day-of-week number with its name.
# Spark numbers days 1 = Sunday ... 7 = Saturday, while calendar.day_name is indexed
# 0 = Monday ... 6 = Sunday, hence the shift by two (modulo 7 sends Sunday to index 6).
# `assign` builds the frame in a single step, so re-running the cell is safe.
weekday_total_df = weekday_total_sdf.toPandas().assign(
    dayofweek=lambda frame: frame['dayofweek'].map(
        lambda day_number: calendar.day_name[(day_number - 2) % 7]
    )
)

px.line(weekday_total_df, x = 'dayofweek', y = 'total_revenue')

In [48]:
# Daily revenue for one sample merchant (ABN 24852446429), in date order.
sample_abn_sdf = (
    transactions_sdf
    .filter(F.col('merchant_abn') == 24852446429)
    .groupBy('order_datetime')
    .agg(F.sum('dollar_value'))
    .orderBy('order_datetime')
)
# The aggregate is not aliased, so Spark names the column 'sum(dollar_value)'.
px.line(sample_abn_sdf.toPandas(), x = 'order_datetime', y = 'sum(dollar_value)')

                                                                                

In [51]:
# Daily revenue for a second sample merchant (ABN 86578477987), in date order.
sample_abn_sdf = (
    transactions_sdf
    .filter(F.col('merchant_abn') == 86578477987)
    .groupBy('order_datetime')
    .agg(F.sum('dollar_value'))
    .orderBy('order_datetime')
)
# The aggregate is not aliased, so Spark names the column 'sum(dollar_value)'.
px.line(sample_abn_sdf.toPandas(), x = 'order_datetime', y = 'sum(dollar_value)')

                                                                                