# Summary of Analysis

In [2]:
# Open Spark session 
from pyspark.sql import SparkSession, functions as F


# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("Data_Explorer")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

22/10/05 19:19:36 WARN Utils: Your hostname, Loky-PC resolves to a loopback address: 127.0.1.1; using 192.168.55.225 instead (on interface eth0)
22/10/05 19:19:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/05 19:19:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
import pandas as pd
import re

# Reading raw data


In [4]:
#read internal tables

#dataframes: transaction_sample, merchants_tbl, customer_tbl  

transactions_sample = spark.read.parquet('../data/tables/transactions_20210828_20220227_snapshot')
transactions_sample2 = spark.read.parquet('../data/tables/transactions_20210228_20210827_snapshot')
transactions_sample3 = spark.read.parquet('../data/tables/transactions_20220228_20220828_snapshot')
transactions_sample.unionByName(transactions_sample2, True)
transactions_sample.unionByName(transactions_sample3, True)
consumer_details = spark.read.parquet('../data/tables/consumer_user_details.parquet')
merchants_tbl = spark.read.parquet('../data/tables/tbl_merchants.parquet')
customer_tbl = spark.read.option("delimiter", "|").option("header",True).csv('../data/tables/tbl_consumer.csv')

                                                                                

# parse "tag" of merchants

In [5]:
merchants_pd = merchants_tbl.toPandas()
# this function standardises the tags attribute, creating a list with the 'description', 'revenue band' and 'BNPL service charge'
def tag_extract(tag_string): 
    # first need to preprocess
    string =  re.sub('\[','(', tag_string.lower())
    string = re.sub('\]',')', string)
    # break the string into sections
    string_cut = string.split('),')
    new_string = []
    # first extract the description 
    new_string.append(str(string_cut[0].strip('((')))
    # second extract the band
    new_string.append(str(re.search(r'[a-z]',string_cut[1]).group()))
    # finally the take rate
    new_string.append(float(re.search(r'[0-9]+\.[0-9]+',string_cut[2]).group()))
    return(new_string)
################
# now we can run the algorithm
tags = merchants_pd['tags']
processed_tags = []
for i in tags:
    processed_tags.append(tag_extract(i))

                                                                                

In [6]:
merchant_tag= pd.DataFrame(processed_tags, columns=('Description', 'Earnings_Class', 'BNPL_Fee'))
merchants_pd = pd.concat([merchants_pd, merchant_tag], axis=1)
# drop the tags column 
merchants_pd.drop(columns='tags', inplace=True)

# and convert back to spark dataframe 
merchants_tbl = spark.createDataFrame(merchants_pd)

# Join all the internal datasets

In [7]:
# First lets look at the number of registered customers and merchants without any data in the dataset
customer_tbl = customer_tbl.join(consumer_details, ['consumer_id'])
full_dataset = transactions_sample.join(customer_tbl, on = ['user_id'])
merchants_tbl = merchants_tbl.withColumnRenamed('name','company_name')
full_dataset = full_dataset.join(merchants_tbl, ['merchant_abn'])
full_dataset.createOrReplaceTempView('full')
full_dataset.printSchema()


root
 |-- merchant_abn: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- consumer_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Earnings_Class: string (nullable = true)
 |-- BNPL_Fee: double (nullable = true)



In [8]:
# lets add a day (mon,...), weekly & monthly attribute
import pyspark.sql.functions as F
full_dataset = full_dataset.withColumn('Day', F.dayofweek('order_datetime'))
full_dataset = full_dataset.withColumn('Month', F.month('order_datetime'))
# now we can also add the bnpl revenue from a transaction 
full_dataset = full_dataset.withColumn('BNPL_Revenue', F.col('dollar_value') * 0.01 * F.col('BNPL_Fee'))
full_dataset.createOrReplaceTempView('data')
# we can remove name, location and customerID for now, due to being unnnesesary attributes (although company_name could also be removed)
full_dataset = spark.sql("""
select merchant_abn, user_id, dollar_value, order_id, order_datetime, state, postcode, gender, company_name, 
        Description, Earnings_Class, BNPL_Fee, BNPL_Revenue, Day, Month, weekofyear(order_datetime) as weekofyear from data
""")

In [9]:
# Compare difference in records between joined table and raw data
full = full_dataset.count()
raw = transactions_sample.count()
print(f'Joined count: {full}, raw count: {raw}, difference {raw - full} or {round(100 * ((raw - full) / raw), 2)}%')

                                                                                

Joined count: 4323692, raw count: 4508106, difference 184414 or 4.09%


# number of transactions without valid merchants abns in the merchants table

In [10]:
# First lets look at the number of transactions without valid merchants abns in the merchants table
merchants_tbl.createOrReplaceTempView('merchants')
customer_tbl.createOrReplaceTempView('consumer')
transactions_sample.createOrReplaceTempView('trans')
missing = spark.sql("""
select count(*) from trans
where trans.merchant_abn not in (select merchant_abn from merchants)
""")
print("the number of transactions without valid merchants abns in the merchants table: ")
print(missing.head()[0])

print("percentage of all transactions:")
print(missing.head()[0]/transactions_sample.count())

the number of transactions without valid merchants abns in the merchants table: 


                                                                                

184414
percentage of all transactions:
0.04090720138346347
