In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv
from pyspark.sql import functions as F
from pyspark.sql import Window

# Build (or reuse) the Spark session used by every cell below.
# Settings are collected in one mapping so they can be read and tuned at a glance.
_spark_settings = {
    # Render DataFrames as tables when they are the last expression of a cell.
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "2g",
    "spark.executor.memory": "6g",
}

_builder = SparkSession.builder.appName("analysis")
for _key, _value in _spark_settings.items():
    _builder = _builder.config(_key, _value)

spark = _builder.getOrCreate()

22/10/15 17:30:35 WARN Utils: Your hostname, AryansLaptop resolves to a loopback address: 127.0.1.1; using 172.29.56.17 instead (on interface eth0)
22/10/15 17:30:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/15 17:30:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/15 17:30:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Load the curated transaction table and preview its first rows.
curated_path = "../data/curated/finaldf.parquet/"
sdf = spark.read.parquet(curated_path)
sdf.limit(5)

                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2021-08-21,94729574738,16982,4821,23684,156.3091245999424,407ce815-54e1-4ae...,QLD,Female,Scelerisque Corpo...,computer programm...,4.51,b,3128,468,271,707,947,735,1972,44,45102,50599,0.1,0.1
2022-04-24,45572698303,16982,4821,23684,115.38672513864692,72160f0d-06c5-432...,QLD,Female,Libero Proin Corp.,shoe shops,3.3,b,3128,468,271,707,947,735,1972,44,45102,50599,0.1,0.1
2021-08-21,34695415993,16982,4821,23684,35.730567249104645,d4524ef6-1f75-4f2...,QLD,Female,Ultrices Vivamus ...,"cable, satellite,...",6.02,a,3128,468,271,707,947,735,1972,44,45102,50599,0.1,0.1
2022-04-24,67400260923,16982,4821,23684,73.85811747094266,7acb9a9f-3ef6-4d4...,QLD,Female,Eleifend PC,computer programm...,5.97,a,3128,468,271,707,947,735,1972,44,45102,50599,0.1,0.1
2021-08-19,86578477987,16982,4821,23684,41.45540307953735,e8be544a-a311-432...,QLD,Female,Leo In Consulting,"watch, clock, and...",6.43,a,3128,468,271,707,947,735,1972,44,45102,50599,0.1,0.1


In [3]:
# Number of distinct consumer ids appearing in the transactions.
unique_consumers = sdf.select("consumer_id").distinct()
unique_consumers.count()

20906

20,906 total consumers

Get the number of distinct consumers and the total number of transactions for each merchant

In [4]:
# Per merchant: how many distinct consumers bought from it, and how many
# transactions (rows with a consumer_id) it received in total.
transactions_by_merchant = sdf.groupBy("merchant_abn")
merch_info = transactions_by_merchant.agg(
    F.count_distinct("consumer_id").alias("consumer_count"),
    F.count("consumer_id").alias("transactions_count"),
)

merch_info.limit(5)

                                                                                

merchant_abn,consumer_count,transactions_count
83412691377,9338,12361
38700038932,5080,5838
73256306726,4045,4522
15613631617,1481,1540
12516851436,183,183


For each merchant, get the postcodes and the number of transactions originating from that postcode

In [5]:
# Count transactions for every (merchant, postcode) pair; the result has a `count` column.
merch_postcode = sdf.groupBy(["merchant_abn", "postcode"]).count()
merch_postcode.sort("merchant_abn").limit(5)

                                                                                

merchant_abn,postcode,count
10023283211,4566,2
10023283211,3220,3
10023283211,1422,1
10023283211,3099,2
10023283211,2838,1


For each merchant, get the most frequent postcodes

In [6]:
# Window over all (merchant, postcode) rows belonging to the same merchant.
w = Window.partitionBy("merchant_abn")

# Keep only the postcode(s) whose transaction count equals the merchant's maximum.
# Ties are kept, so a merchant can have several rows here.
merch_postcode = (
    merch_postcode
    .withColumn("maxCount", F.max("count").over(w))
    .filter(F.col("count") == F.col("maxCount"))
    .drop("maxCount", "count")
)

merch_postcode.limit(5)

                                                                                

merchant_abn,postcode
10023283211,2388
10023283211,3275
10023283211,5582
10346855916,5276
10346855916,5291


Join `merch_postcode` with the external population data

In [7]:
# Load the external per-postcode data and keep only the population columns;
# the earner/age/income columns are not used in this notebook.
external_sdf = (
    spark.read.parquet("../data/curated/externaldata.parquet/")
    .drop("num_earners", "median_age", "median_income", "mean_income")
)
external_sdf.limit(5)

postcode,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop
200,2875,6,1528,1292,47,2
800,7679,474,325,3322,2652,906
801,7679,474,325,3322,2652,906
804,2980,350,282,801,1047,500
810,2657,326,274,541,845,386


In [8]:
# Row counts before and after the (inner) join: if they match, every
# merchant postcode found a row in the external data.
rows_before_join = merch_postcode.count()
print(rows_before_join)

merch_demog = merch_postcode.join(external_sdf, on="postcode")

rows_after_join = merch_demog.count()
print(rows_after_join)

                                                                                

33745


[Stage 81:===>                                                    (1 + 16) / 17]

33745


                                                                                

In [9]:
# Preview the joined frame: one row per (merchant, most frequent postcode)
# with that postcode's population columns.
merch_demog.limit(5)

                                                                                

postcode,merchant_abn,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop
2388,10023283211,4656,629,601,751,1463,1212
3275,10023283211,9963,1127,1317,1319,3117,3083
5582,10023283211,4209,330,348,454,1176,1901
5276,10346855916,3943,357,361,542,1156,1527
5291,10346855916,6296,643,734,931,2280,1708


For merchants with several equally frequent postcodes, average the population of each age group across those postcodes

In [10]:
# Collapse to one row per merchant: average each population column over the
# merchant's most frequent postcodes, rounded to 2 decimal places.
population_cols = [
    "total_pop",
    "under10_pop",
    "adolsc_pop",
    "yng_adult_pop",
    "mid_age_pop",
    "old_pop",
]

merch_demog = merch_demog.groupBy("merchant_abn").agg(
    *[F.round(F.avg(name), 2).alias(name) for name in population_cols]
)

merch_demog.limit(5)

                                                                                

merchant_abn,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop
10023283211,6276.0,695.33,755.33,841.33,1918.67,2065.33
10346855916,11618.86,1405.29,1509.57,1881.0,3777.14,3045.86
10385163239,26723.0,2652.0,2696.0,3439.0,7889.0,10047.0
10648956813,7784.0,772.0,969.5,999.0,2407.0,2636.5
10714068705,9525.0,1146.0,1181.0,1283.0,3118.0,2797.0


We now have average consumer demographic statistics for each merchant, based on its most frequent postcodes

***

Classify the demographic statistics into groups based on quartiles

In [11]:
# Every column except the first one (the merchant_abn grouping key).
col_names = merch_demog.columns[1:]
col_names

['total_pop',
 'under10_pop',
 'adolsc_pop',
 'yng_adult_pop',
 'mid_age_pop',
 'old_pop']

In [12]:
# Empty table (all NaN) with one row per population column and one column
# per cut point; it is filled in by the next cell.
quartile_labels = ["LowerQuartile", "Median", "UpperQuartile"]
ext_data_quants_df = pd.DataFrame(index=col_names, columns=quartile_labels)
ext_data_quants_df

Unnamed: 0,LowerQuartile,Median,UpperQuartile
total_pop,,,
under10_pop,,,
adolsc_pop,,,
yng_adult_pop,,,
mid_age_pop,,,
old_pop,,,


In [13]:
# Collect the per-merchant averages to pandas ONCE. `toPandas()` runs the whole
# Spark plan and pulls every row to the driver, so calling it inside the loop
# repeated that work for every column.
merch_demog_pdf = merch_demog.toPandas()

# Fill the 25th / 50th / 75th percentile of each population column.
for ext_data_name in col_names:
    new_vals = list(merch_demog_pdf[ext_data_name].quantile([0.25, 0.5, 0.75]))
    ext_data_quants_df.loc[ext_data_name, ["LowerQuartile", "Median", "UpperQuartile"]] = new_vals

ext_data_quants_df

                                                                                

Unnamed: 0,LowerQuartile,Median,UpperQuartile
total_pop,6264.5425,8717.44,10617.745
under10_pop,651.7075,953.835,1199.98
adolsc_pop,694.0,994.435,1242.66
yng_adult_pop,922.9375,1535.625,2073.9
mid_age_pop,1994.0,2811.29,3448.0
old_pop,1712.355,2243.98,2690.13


Classify values based on quartiles for each external data type
- group 1 : $<$ LowerQuartile
- group 2 : between LowerQuartile and Median
- group 3 : between Median and UpperQuartile
- group 4 : $>$ UpperQuartile

In [14]:
# Replace every population column by its quartile group:
#   1 : value <  LowerQuartile
#   2 : LowerQuartile <= value <  Median
#   3 : Median        <= value <= UpperQuartile
#   4 : value >  UpperQuartile
#  -1 : value is null (no comparison is true)
#
# The previous version used strict comparisons on both sides of every cut
# point, so a value exactly equal to a quartile or the median matched no
# branch and was labelled -1. The branches below are evaluated in order, so
# each one only needs its upper bound; groups 1 and 4 keep their strict
# definitions and the boundary values land in the middle groups.
#
# NOTE: this overwrites the columns of `merch_demog` in place, so re-running
# the cell would classify the group codes themselves; re-run from the
# aggregation cell instead.
for ext_data_name in col_names:
    # Plain Python floats so Spark receives ordinary numeric literals.
    lower_q, median, upper_q = (
        float(ext_data_quants_df.loc[ext_data_name, label])
        for label in ("LowerQuartile", "Median", "UpperQuartile")
    )
    value = F.col(ext_data_name)

    merch_demog = merch_demog.withColumn(
        ext_data_name,
        F.when(value < lower_q, F.lit(1))
        .when(value < median, F.lit(2))
        .when(value <= upper_q, F.lit(3))
        .when(value > upper_q, F.lit(4))
        .otherwise(F.lit(-1)),
    )

merch_demog.limit(5)

                                                                                

merchant_abn,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop
10023283211,2,2,2,1,1,2
10346855916,4,4,4,3,4,4
10385163239,4,4,4,4,4,4
10648956813,2,2,2,2,2,3
10714068705,3,3,3,2,3,4


In [15]:
# Persist the per-merchant population groups, replacing any previous output.
merch_demog.write.parquet("../data/curated/merch_pop.parquet", mode="overwrite")

[Stage 183:>                                                      (0 + 16) / 17]

22/10/15 17:31:22 WARN MemoryManager: Total allocation exceeds 95.00% (1,813,485,955 bytes) of heap memory
Scaling row group sizes to 96.51% for 14 writers
22/10/15 17:31:22 WARN MemoryManager: Total allocation exceeds 95.00% (1,813,485,955 bytes) of heap memory
Scaling row group sizes to 90.08% for 15 writers
22/10/15 17:31:22 WARN MemoryManager: Total allocation exceeds 95.00% (1,813,485,955 bytes) of heap memory
Scaling row group sizes to 84.45% for 16 writers
22/10/15 17:31:22 WARN MemoryManager: Total allocation exceeds 95.00% (1,813,485,955 bytes) of heap memory
Scaling row group sizes to 90.08% for 15 writers
22/10/15 17:31:22 WARN MemoryManager: Total allocation exceeds 95.00% (1,813,485,955 bytes) of heap memory
Scaling row group sizes to 96.51% for 14 writers


                                                                                