### Analyse Transaction by SA2
1. Merge SA2 to transaction by postcode
2. Check how many unique SA2
3. Check for null SA2 values
4. Per SA2 aggregate: total_population, median age, the transaction average dollar amount (AOV), transaction frequency, and the number of unique customers, use BPNL % (num_unique_cust/total_population), use BPNL by the mode age group % (num_unique_cust/total_population_age_group)
5. groupby month/weeknum over transaction freq/gmv/profit, visualize

In [139]:
import pandas as pd
import numpy as np
import seaborn as sns
import geopandas as gpd
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

import geopandas as gpd
import folium
from pyspark.sql import SparkSession, Window, functions as F
from pyspark.sql.functions import countDistinct, col, date_format
import numpy as np
import pyspark.sql.functions as func
from pyspark.sql.types import (
    StringType,
    LongType,
    DoubleType,
    StructField,
    StructType,
    FloatType
)

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Start Spark Session
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder.appName("MAST30034 Project 2 BNPL")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/07 20:44:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# load BNPL dataset
consumer = spark.read.csv("../data/tables/tbl_consumer.csv", header=True, sep="|")
details = spark.read.parquet("../data/tables/consumer_user_details.parquet")
merchants = spark.read.parquet("../data/tables/tbl_merchants.parquet")

In [4]:
# load all transactions datasets
paths=['../data/tables/transactions_20210228_20210827_snapshot',
       '../data/tables/transactions_20210828_20220227_snapshot']

first = 1
for path in paths:
    if first:
        transactions = spark.read.parquet(path)
        print(f'added {path.split("/")[3]}')
        first = 0
    else:
        append_transactions = spark.read.parquet(path)
        transactions = transactions.union(append_transactions)
        print(f'added {path.split("/")[3]}')

                                                                                

added transactions_20210228_20210827_snapshot


                                                                                

added transactions_20210828_20220227_snapshot


In [5]:
# load poa_to_sa2 dataset
poa_to_sa2 = spark.read.csv("../data/curated/poa_w_sa2.csv", header=True)

In [6]:
transactions.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)



In [7]:
transactions.agg({'order_datetime': 'max'}).show()
transactions.agg({'order_datetime': 'min'}).show()

                                                                                

+-------------------+
|max(order_datetime)|
+-------------------+
|         2022-02-27|
+-------------------+





+-------------------+
|min(order_datetime)|
+-------------------+
|         2021-02-28|
+-------------------+



                                                                                

In [8]:
# rename columns
merchants = merchants.withColumnRenamed('name', 'merchant_name')
consumer = consumer.withColumnRenamed('name', 'consumer_name')


---
#### 1. Merge SA2 to transaction by postcode

In [9]:
# Join consumers with their respective details
consumer_detail = consumer.join(details, on="consumer_id")

# Join consumers with their respective transactions
consumer_trx = consumer_detail.join(transactions, on="user_id")

# Join transactions with the respective merchants
df_trx = consumer_trx.join(merchants, on="merchant_abn")

In [10]:
df_trx.show(5, vertical=True)



-RECORD 0------------------------------
 merchant_abn   | 33064796871          
 user_id        | 7                    
 consumer_id    | 511685               
 consumer_name  | Andrea Jones         
 address        | 122 Brandon Cliff    
 state          | QLD                  
 postcode       | 4606                 
 gender         | Female               
 dollar_value   | 373.0873675184212    
 order_id       | fe188788-b89f-4dd... 
 order_datetime | 2021-08-20           
 merchant_name  | Curabitur Massa C... 
 tags           | ((computer progra... 
-RECORD 1------------------------------
 merchant_abn   | 68435002949          
 user_id        | 7                    
 consumer_id    | 511685               
 consumer_name  | Andrea Jones         
 address        | 122 Brandon Cliff    
 state          | QLD                  
 postcode       | 4606                 
 gender         | Female               
 dollar_value   | 232.5364986739752    
 order_id       | b4a89891-a113-45e... 


                                                                                

In [11]:
df_trx.printSchema()

root
 |-- merchant_abn: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- consumer_id: string (nullable = true)
 |-- consumer_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- merchant_name: string (nullable = true)
 |-- tags: string (nullable = true)



In [12]:
poa_to_sa2.show(5, vertical=True)

-RECORD 0---------------------------------
 poa_code_2016     | 800                  
 poa_name_2016     | 0800                 
 sa2_maincode_2016 | 701011002.0          
 sa2_name_2016     | Darwin City          
 geometry          | POLYGON ((130.834... 
-RECORD 1---------------------------------
 poa_code_2016     | 810                  
 poa_name_2016     | 0810                 
 sa2_maincode_2016 | 701021013.0          
 sa2_name_2016     | Brinkin - Nakara     
 geometry          | POLYGON ((130.863... 
-RECORD 2---------------------------------
 poa_code_2016     | 812                  
 poa_name_2016     | 0812                 
 sa2_maincode_2016 | 701021014.0          
 sa2_name_2016     | Buffalo Creek        
 geometry          | POLYGON ((130.901... 
-RECORD 3---------------------------------
 poa_code_2016     | 815                  
 poa_name_2016     | 0815                 
 sa2_maincode_2016 | 701021013.0          
 sa2_name_2016     | Brinkin - Nakara     
 geometry  

In [13]:
# translate postcodes in transaction to sa2 codes
sa2_cols = ['poa_name_2016', 'sa2_maincode_2016', 'sa2_name_2016', 'geometry']
df_trx_sa2 = (df_trx \
                .join(poa_to_sa2[sa2_cols], 
                     on=[df_trx['postcode'] == poa_to_sa2['poa_name_2016']],
                     how='inner')
                .drop('poa_name_2016')
             )

---
#### 2. Check how many unique SA2

In [14]:
df_trx_sa2.select('sa2_maincode_2016').distinct().count()

                                                                                

1314

In [15]:
df_trx_sa2.printSchema()

root
 |-- merchant_abn: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- consumer_id: string (nullable = true)
 |-- consumer_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- merchant_name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- sa2_maincode_2016: string (nullable = true)
 |-- sa2_name_2016: string (nullable = true)
 |-- geometry: string (nullable = true)



---
#### 3. Check for null in SA2 values

In [16]:
dict_null = {col:df_trx_sa2.filter(df_trx_sa2[col].isNull()).count() 
             for col in df_trx_sa2.columns}
dict_null

                                                                                

{'merchant_abn': 0,
 'user_id': 0,
 'consumer_id': 0,
 'consumer_name': 0,
 'address': 0,
 'state': 0,
 'postcode': 0,
 'gender': 0,
 'dollar_value': 0,
 'order_id': 0,
 'order_datetime': 0,
 'merchant_name': 0,
 'tags': 0,
 'sa2_maincode_2016': 0,
 'sa2_name_2016': 0,
 'geometry': 12090}

### 4. Analyse by State (monthly):
- by Total Dollar Value, 
- Active Merchants, 
- Active Consumers,
- AOV (overall, male vs female), 
- BNPL user % (num_unique_cust/total_population_over_18)


In [19]:
df_trx_sa2.show(5, vertical=True)



-RECORD 0---------------------------------
 merchant_abn      | 23661821077          
 user_id           | 13882                
 consumer_id       | 151968               
 consumer_name     | Scott Dean           
 address           | 09010 Brandi Prairie 
 state             | NSW                  
 postcode          | 2016                 
 gender            | Male                 
 dollar_value      | 51.527409870273424   
 order_id          | f20fdc13-9500-483... 
 order_datetime    | 2021-08-19           
 merchant_name     | Suspendisse Eleif... 
 tags              | ((computer progra... 
 sa2_maincode_2016 | 117031335.0          
 sa2_name_2016     | Redfern - Chippen... 
 geometry          | POLYGON ((151.196... 
-RECORD 1---------------------------------
 merchant_abn      | 88202878932          
 user_id           | 13882                
 consumer_id       | 151968               
 consumer_name     | Scott Dean           
 address           | 09010 Brandi Prairie 
 state     

                                                                                

In [68]:
df_trx_sa2 = df_trx_sa2.withColumn("order_month", 
                     date_format(col("order_datetime"), "M").cast('INT'))

df_trx_sa2 = df_trx_sa2.withColumn("order_year", 
                     date_format(col("order_datetime"), "y").cast('INT'))


In [81]:
state_trx = (df_trx_sa2.groupby(['state', 'order_year', 'order_month'])
             .agg({'order_id':'count', 'dollar_value':'sum'})
             .sort(['state', 'order_year', 'order_month']))
unique_cons = (df_trx_sa2.groupby(['state', 'order_year', 'order_month'])
               .agg(countDistinct('consumer_id'))
               .sort(['state', 'order_year', 'order_month']))
unique_merc = (df_trx_sa2.groupby(['state', 'order_year', 'order_month'])
               .agg(countDistinct('merchant_abn'))
               .sort(['state', 'order_year', 'order_month']))

def join_agg(sdf1, sdf2):
    '''
        take two dataframes and join the two dataframes
    '''
    sdf1 = (sdf1.alias("a") \
               .join(sdf2, 
                     on=['state', 'order_year', 'order_month'], 
                     how='inner')
           )
    return sdf1
state_trx = join_agg(state_trx, unique_cons)
state_trx = join_agg(state_trx, unique_merc)
    
# renaming a few columns
field_name_change = {"sum(dollar_value)": "total_dollar_value", 
                     "count(order_id)": "transaction_freq",
                     "count(consumer_id)": "n_unique_consumer",
                     "count(merchant_abn)": "n_unique_merchant"}
for old, new in field_name_change.items():
    state_trx = state_trx.withColumnRenamed(old, new)

cols = ['state', 'order_year', 'order_month', 'n_unique_consumer', 
        'transaction_freq', 'total_dollar_value', 'n_unique_merchant']
state_trx = state_trx[cols].sort(['state', 'order_year', 'order_month'])

state_trx = (state_trx.
             withColumn('avg_sales_per_consumer', 
                        col("total_dollar_value") / col("n_unique_consumer")))

state_trx = (state_trx.
             withColumn('avg_sales_per_merchant', 
                        col("total_dollar_value") / col("n_unique_merchant")))

In [89]:
state_trx.write.csv('/Users/patrick/Downloads/state_trx.csv')

                                                                                

In [93]:
consumer_trx = (df_trx_sa2.groupby(['consumer_id', 'state', 'postcode', 
                                    'sa2_maincode_2016'])
                 .agg({'order_id':'count', 'dollar_value':'sum'})
                 .sort(['consumer_id']))

consumer_trx = (consumer_trx.
             withColumn('aov_consumer', 
                        col("sum(dollar_value)") / col("count(order_id)")))

In [96]:
# average consumer order value per transaction by state
consumer_trx.groupby('state').agg({'aov_consumer':'mean'})

                                                                                

state,avg(aov_consumer)
ACT,158.7767030922387
SA,158.38187416774943
TAS,159.05346106470913
WA,158.93517295210245
QLD,158.97512088335768
VIC,158.1636956303271
NSW,158.57322324864256


In [104]:
consumer_trx

[Stage 1297:>                                                       (0 + 1) / 1]

+-----+---------+
|state|   median|
+-----+---------+
|  ACT|154.34036|
|   SA|153.94508|
|  TAS|154.93332|
|   WA|155.44766|
|  QLD|154.76787|
|  VIC|153.99507|
|  NSW|154.43105|
+-----+---------+



                                                                                

In [105]:
poa_to_sa2.printSchema()

root
 |-- poa_code_2016: string (nullable = true)
 |-- poa_name_2016: string (nullable = true)
 |-- sa2_maincode_2016: string (nullable = true)
 |-- sa2_name_2016: string (nullable = true)
 |-- geometry: string (nullable = true)



---
### 5. Geospatial Visualization

In [109]:
poa_to_sa2 = pd.read_csv("../data/curated/poa_w_sa2.csv")

In [110]:
poa_to_sa2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2669 entries, 0 to 2668
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   poa_code_2016      2669 non-null   int64  
 1   poa_name_2016      2669 non-null   int64  
 2   sa2_maincode_2016  2669 non-null   float64
 3   sa2_name_2016      2669 non-null   object 
 4   geometry           2664 non-null   object 
dtypes: float64(1), int64(2), object(2)
memory usage: 104.4+ KB


In [124]:
sa2_trx = (df_trx_sa2.groupby(['sa2_maincode_2016', 'order_year', 'order_month'])
             .agg({'order_id':'count', 'dollar_value':'sum'})
             .sort(['sa2_maincode_2016', 'order_year', 'order_month']))
unique_cons = (df_trx_sa2.groupby(['sa2_maincode_2016', 'order_year', 'order_month'])
               .agg(countDistinct('consumer_id'))
               .sort(['sa2_maincode_2016', 'order_year', 'order_month']))
unique_merc = (df_trx_sa2.groupby(['sa2_maincode_2016', 'order_year', 'order_month'])
               .agg(countDistinct('merchant_abn'))
               .sort(['sa2_maincode_2016', 'order_year', 'order_month']))

def join_agg(sdf1, sdf2):
    '''
        take two dataframes and join the two dataframes
    '''
    sdf1 = (sdf1.alias("a") \
               .join(sdf2, 
                     on=['sa2_maincode_2016', 'order_year', 'order_month'], 
                     how='inner')
           )
    return sdf1
sa2_trx = join_agg(sa2_trx, unique_cons)
sa2_trx = join_agg(sa2_trx, unique_merc)
    
# renaming a few columns
field_name_change = {"sum(dollar_value)": "total_dollar_value", 
                     "count(order_id)": "transaction_freq",
                     "count(consumer_id)": "n_unique_consumer",
                     "count(merchant_abn)": "n_unique_merchant"}
for old, new in field_name_change.items():
    sa2_trx = sa2_trx.withColumnRenamed(old, new)

cols = ['sa2_maincode_2016', 'order_year', 'order_month', 'n_unique_consumer', 
        'transaction_freq', 'total_dollar_value', 'n_unique_merchant']
sa2_trx = sa2_trx[cols].sort(['sa2_maincode_2016', 'order_year', 'order_month'])

sa2_trx = (sa2_trx.
             withColumn('avg_sales_per_consumer', 
                        col("total_dollar_value") / col("n_unique_consumer")))

sa2_trx = (sa2_trx.
             withColumn('avg_sales_per_order', 
                        col("total_dollar_value") / col("transaction_freq")))

sa2_trx = (sa2_trx.
             withColumn('avg_sales_per_merchant', 
                        col("total_dollar_value") / col("n_unique_merchant")))


In [114]:
sa2_trx.show(5, vertical=True)

                                                                                

-RECORD 0------------------------------------
 sa2_maincode_2016      | 101021007.0        
 order_year             | 2021               
 order_month            | 2                  
 n_unique_consumer      | 4                  
 transaction_freq       | 6                  
 total_dollar_value     | 314.02321483179026 
 n_unique_merchant      | 6                  
 avg_sales_per_consumer | 78.50580370794756  
 avg_sales_per_merchant | 52.33720247196504  
-RECORD 1------------------------------------
 sa2_maincode_2016      | 101021007.0        
 order_year             | 2021               
 order_month            | 3                  
 n_unique_consumer      | 9                  
 transaction_freq       | 218                
 total_dollar_value     | 29431.817265521724 
 n_unique_merchant      | 157                
 avg_sales_per_consumer | 3270.2019183913026 
 avg_sales_per_merchant | 187.46380423899188 
-RECORD 2------------------------------------
 sa2_maincode_2016      | 10102100

In [159]:
poa_to_sa2 = poa_to_sa2.dropna()

In [160]:
from shapely import wkt

poa_to_sa2['geometry'] = poa_to_sa2['geometry'].astype('str').apply(wkt.loads)
gdf = gpd.GeoDataFrame(poa_to_sa2, crs='epsg:4326')


In [161]:
gdf['geometry'] = gdf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")

In [163]:
# create a JSON 
geoJSON = gdf[['sa2_maincode_2016', 'geometry']].drop_duplicates('sa2_maincode_2016').to_json()

# print the first 300 chars of the json
print(geoJSON[:300])

{"type": "FeatureCollection", "features": [{"id": "0", "type": "Feature", "properties": {"sa2_maincode_2016": 701011002.0}, "geometry": {"type": "Polygon", "coordinates": [[[130.83450871037445, -12.457986119222362], [130.8339087104066, -12.457386119213936], [130.8338087104126, -12.457186119211778], 


In [126]:
sa2_trx = sa2_trx.toPandas()

                                                                                

In [127]:
sa2_trx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17059 entries, 0 to 17058
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   sa2_maincode_2016       17059 non-null  object 
 1   order_year              17059 non-null  int32  
 2   order_month             17059 non-null  int32  
 3   n_unique_consumer       17059 non-null  int64  
 4   transaction_freq        17059 non-null  int64  
 5   total_dollar_value      17059 non-null  float64
 6   n_unique_merchant       17059 non-null  int64  
 7   avg_sales_per_consumer  17059 non-null  float64
 8   avg_sales_per_order     17059 non-null  float64
 9   avg_sales_per_merchant  17059 non-null  float64
dtypes: float64(4), int32(2), int64(3), object(1)
memory usage: 1.2+ MB


In [None]:
# visualize trip_count by do_location id
m = folium.Map(location=[-38.043995, 145.264296], tiles="Stamen Terrain", zoom_start=8)
sa2_trx_filter = sa2_trx.filter(F.col('state') == "VIC")

custom_scale = (sa2_trx_filter['total_dollar_value'].quantile((0,0.2,0.4,0.6,0.8,1))).tolist()
c = folium.Choropleth(
    geo_data=geoJSON, # geoJSON 
    name='choropleth', # name of plot
    data=sa2_trx, # data source
    columns=['sa2_maincode_2016','total_dollar_value'], # the columns required
    key_on='properties.sa2_maincode_2016', # this is from the geoJSON's properties
    fill_color='YlOrRd', # color scheme
    nan_fill_color='black',
    legend_name='Average Sales per Transaction'
)

c.add_to(m)

m