In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv
from pyspark.sql import functions as F
from pyspark.sql import Window

# Build (or reuse) the Spark session shared by every cell below.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "2g",
    "spark.executor.memory": "6g",
}

session_builder = SparkSession.builder.appName("analysis")
for setting_name, setting_value in spark_settings.items():
    # each .config() call returns the builder, so the settings accumulate
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

22/10/12 15:49:16 WARN Utils: Your hostname, DESKTOP-LNDD2A2 resolves to a loopback address: 127.0.1.1; using 172.21.140.219 instead (on interface eth0)
22/10/12 15:49:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/12 15:49:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load the curated transactions table and preview its first rows.
sdf = spark.read.format("parquet").load("../data/curated/finaldf.parquet/")
sdf.limit(5)

                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2021-08-21,80682333501,3146,5651,604753,0.3672339667473312,2a59c978-f760-42d...,SA,Male,Orci Corp.,florists supplies...,4.88,b,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1
2021-08-19,99478391356,3146,5651,604753,3035.1695642706595,82e100bc-25c2-4e3...,SA,Male,Orci Quis Foundation,"equipment, tool, ...",1.52,c,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1
2021-08-19,86578477987,3146,5651,604753,61.05946896765003,9e3c8e62-9e8e-4e8...,SA,Male,Leo In Consulting,"watch, clock, and...",6.43,a,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1
2021-08-14,32361057556,3146,5651,604753,155.3456409871304,e4ff9499-e96d-4e6...,SA,Male,Orci In Consequat...,"gift, card, novel...",6.61,a,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1
2021-08-16,20445424481,3146,5651,604753,57.66971365811276,4a36f2ed-7bcc-43d...,SA,Male,Amet Industries,digital goods: bo...,6.29,a,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1


In [3]:
# Split the order date into numeric year / month / day columns.
order_date = F.col("order_datetime")
for part_name, part_pattern in (("year", "yyyy"), ("month", "MM"), ("day", "dd")):
    # format the date part as text, then cast it to a long integer
    sdf = sdf.withColumn(part_name, F.date_format(order_date, part_pattern).cast("long"))

sdf.limit(5)

22/10/12 15:49:43 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%,year,month,day
2021-08-21,80682333501,3146,5651,604753,0.3672339667473312,2a59c978-f760-42d...,SA,Male,Orci Corp.,florists supplies...,4.88,b,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1,2021,8,21
2021-08-19,99478391356,3146,5651,604753,3035.1695642706595,82e100bc-25c2-4e3...,SA,Male,Orci Quis Foundation,"equipment, tool, ...",1.52,c,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1,2021,8,19
2021-08-19,86578477987,3146,5651,604753,61.05946896765003,9e3c8e62-9e8e-4e8...,SA,Male,Leo In Consulting,"watch, clock, and...",6.43,a,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1,2021,8,19
2021-08-14,32361057556,3146,5651,604753,155.3456409871304,e4ff9499-e96d-4e6...,SA,Male,Orci In Consequat...,"gift, card, novel...",6.61,a,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1,2021,8,14
2021-08-16,20445424481,3146,5651,604753,57.66971365811276,4a36f2ed-7bcc-43d...,SA,Male,Amet Industries,digital goods: bo...,6.29,a,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1,2021,8,16


In [4]:
# Number of unique consumer ids present in the transactions.
sdf.select("consumer_id").dropDuplicates().count()

                                                                                

20906

20,906 total consumers

Get number of consumers and total number of transactions for each merchant

In [5]:
# Per merchant: how many distinct consumers bought from it, and how many
# transactions (rows with a consumer id) it received in total.
merch_info = (
    sdf
    .groupBy("merchant_abn")
    .agg(
        F.count_distinct(F.col("consumer_id")).alias("consumer_count"),
        F.count(F.col("consumer_id")).alias("transactions_count"),
    )
)

merch_info.limit(5)

                                                                                

merchant_abn,consumer_count,transactions_count
38700038932,5080,5838
83412691377,9338,12361
73256306726,4045,4522
38986645707,38,38
12516851436,183,183


For each merchant, get the postcodes and the number of transactions originating from that postcode

In [6]:
# Transactions per (merchant, postcode) pair; preview sorted by merchant.
merch_postcode = sdf.groupBy(["merchant_abn", "postcode"]).count()
merch_postcode.sort(F.col("merchant_abn").asc()).limit(5)

                                                                                

merchant_abn,postcode,count
10023283211,2379,1
10023283211,2828,1
10023283211,6942,3
10023283211,5452,2
10023283211,7315,3


For each merchant, get the most frequent postcodes

In [7]:
# Window covering all postcode rows that belong to one merchant.
w = Window.partitionBy("merchant_abn")

# Keep only the postcode(s) whose transaction count equals the merchant's
# maximum. Tied postcodes are all retained, so a merchant may keep several rows.
is_top_postcode = F.col("count") == F.col("maxCount")
merch_postcode = (
    merch_postcode
    .withColumn("maxCount", F.max("count").over(w))
    .filter(is_top_postcode)
    .drop("maxCount", "count")
)

merch_postcode.limit(5)

                                                                                

merchant_abn,postcode
10023283211,3275
10023283211,2388
10023283211,5582
10323485998,4856
10342410215,6237


In [8]:
# Load the curated postcode-level external data and preview it.
external_sdf = spark.read.format("parquet").load("../data/curated/externaldata.parquet/")
external_sdf.limit(5)

postcode,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income
200,2875,6,1528,1292,47,2,548,23,9306,16835
800,7679,474,325,3322,2652,906,5909,33,60937,87791
801,7679,474,325,3322,2652,906,5909,33,60937,87791
804,2980,350,282,801,1047,500,1873,40,75219,98872
810,2657,326,274,541,845,386,1479,39,58753,67299


In [9]:
# Print the row count before and after the inner join on postcode, so any
# merchant/postcode rows without matching external data show up as a drop.
rows_before_join = merch_postcode.count()
print(rows_before_join)
merch_demog = merch_postcode.join(external_sdf, on="postcode", how="inner")
rows_after_join = merch_demog.count()
print(rows_after_join)

                                                                                

33745


[Stage 90:>                                                         (0 + 4) / 4]

33745


                                                                                

In [10]:
# Preview the merchant/postcode rows together with the joined postcode-level columns.
merch_demog.limit(5)

                                                                                

postcode,merchant_abn,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income
3275,10023283211,9963,1127,1317,1319,3117,3083,5765,48,43136,52080
2388,10023283211,4656,629,601,751,1463,1212,2656,46,43821,51530
5582,10023283211,4209,330,348,454,1176,1901,2021,53,36420,46556
4856,10323485998,8083,909,974,1066,2703,2431,4537,47,42112,48671
6237,10342410215,3131,315,543,341,1139,793,1915,47,47678,58959


In [11]:
# Population columns averaged across each merchant's retained postcode rows.
pop_columns = (
    "total_pop",
    "under10_pop",
    "adolsc_pop",
    "yng_adult_pop",
    "mid_age_pop",
    "old_pop",
)

# One row per merchant: the mean of every population column, rounded to
# two decimals and stored under the column's original name.
merch_demog = merch_demog.groupBy("merchant_abn").agg(
    *[F.round(F.avg(pop_column), 2).alias(pop_column) for pop_column in pop_columns]
)

merch_demog.limit(5)

                                                                                

merchant_abn,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop
10023283211,6276.0,695.33,755.33,841.33,1918.67,2065.33
10323485998,8083.0,909.0,974.0,1066.0,2703.0,2431.0
10342410215,4665.67,534.0,589.33,604.0,1668.0,1208.0
10346855916,11618.86,1405.29,1509.57,1881.0,3777.14,3045.86
10385163239,26723.0,2652.0,2696.0,3439.0,7889.0,10047.0


We now have average demographic statistics for each merchant, based on its most frequent consumer postcode(s).

***

Classify the demographic statistics into groups based on quartiles

In [12]:
# Columns to classify: every column of merch_demog except the merchant key.
# The key is excluded by name rather than by position, so the list stays
# correct even if the column order of merch_demog changes.
col_names = [name for name in merch_demog.columns if name != "merchant_abn"]
col_names

['total_pop',
 'under10_pop',
 'adolsc_pop',
 'yng_adult_pop',
 'mid_age_pop',
 'old_pop']

In [13]:
# Empty table (one row per demographic column) to be filled with quartiles.
quartile_labels = ["LowerQuartile", "Median", "UpperQuartile"]
ext_data_quants_df = pd.DataFrame(columns=quartile_labels, index=col_names)
ext_data_quants_df

Unnamed: 0,LowerQuartile,Median,UpperQuartile
total_pop,,,
under10_pop,,,
adolsc_pop,,,
yng_adult_pop,,,
mid_age_pop,,,
old_pop,,,


In [14]:
# Collect the per-merchant averages to pandas ONCE. Calling toPandas() inside
# the loop would re-run the whole Spark job and re-transfer the data for
# every column. Only the columns that are actually needed are collected.
merch_demog_pd = merch_demog.select(*col_names).toPandas()

# Fill one row of the quartile table per demographic column.
for ext_data_name in col_names:
    new_vals = list(merch_demog_pd[ext_data_name].quantile([0.25, 0.5, 0.75]))
    ext_data_quants_df.loc[ext_data_name, ["LowerQuartile", "Median", "UpperQuartile"]] = new_vals

ext_data_quants_df

                                                                                

Unnamed: 0,LowerQuartile,Median,UpperQuartile
total_pop,6264.5425,8717.44,10617.745
under10_pop,651.7075,953.835,1199.98
adolsc_pop,694.0,994.435,1242.66
yng_adult_pop,922.9375,1535.625,2073.9
mid_age_pop,1994.0,2811.29,3448.0
old_pop,1712.355,2243.98,2690.13


Classify values based on quartiles for each external data type
- group 1 : $<$ LowerQuartile
- group 2 : between LowerQuartile and Median
- group 3 : between Median and UpperQuartile
- group 4 : $>$ UpperQuartile

In [15]:
# Replace each averaged demographic value with its quartile group:
#    1 : value <  LowerQuartile
#    2 : LowerQuartile <= value < Median
#    3 : Median        <= value < UpperQuartile
#    4 : value >= UpperQuartile
#   -1 : value is null (no comparison matches)
# The intervals are half-open so that a value exactly equal to a quartile is
# still classified; with strict comparisons on both sides of every boundary
# such values matched no branch and were labelled -1.
#
# NOTE: the columns are overwritten in place, so re-running this cell without
# first re-running the aggregation cell would classify the group labels
# themselves instead of the averages.
for ext_data_name in col_names:
    # float() hands Spark plain Python literals regardless of how pandas
    # stored the quartiles
    lower_q, median_q, upper_q = (
        float(ext_data_quants_df.loc[ext_data_name, label])
        for label in ("LowerQuartile", "Median", "UpperQuartile")
    )
    value = F.col(ext_data_name)

    merch_demog = merch_demog.withColumn(
        ext_data_name,
        # branches are evaluated in order, so each one only sees values that
        # failed every earlier test
        F.when(value < lower_q, F.lit(1))
        .when(value < median_q, F.lit(2))
        .when(value < upper_q, F.lit(3))
        .when(value >= upper_q, F.lit(4))
        .otherwise(F.lit(-1)),
    )

merch_demog.limit(5)

                                                                                

merchant_abn,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop
10023283211,2,2,2,1,1,2
10323485998,2,2,2,2,2,3
10342410215,1,1,1,1,1,1
10346855916,4,4,4,3,4,4
10385163239,4,4,4,4,4,4


In [16]:
# Persist the per-merchant population groups, replacing any earlier output.
merch_demog.write.parquet("../data/curated/merch_pop.parquet", mode="overwrite")

                                                                                