In [97]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv
from pyspark.sql import functions as F
from pyspark.sql import Window

# Build the Spark session shared by every cell below.
# The settings live in one mapping and are applied to the builder in order.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "2g",
    "spark.executor.memory": "6g",
}

session_builder = SparkSession.builder.appName("analysis")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

In [98]:
# Load the curated, already-joined transaction table and preview five rows.
transactions_path = "../data/curated/finaldf.parquet/"
sdf = spark.read.parquet(transactions_path)
sdf.limit(5)

                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2022-07-13,49891706470,22131,6731,58911,25.782245737474312,90498b80-984f-43c...,WA,Male,Non Vestibulum In...,tent and awning s...,5.8,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1
2021-08-19,96680767841,22131,6731,58911,415.9547984625,829843bc-c571-493...,WA,Male,Ornare Limited,motor vehicle sup...,5.91,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1
2022-07-13,43186523025,22131,6731,58911,33.082322196774484,c12d2520-12c8-40f...,WA,Male,Lorem Ipsum Sodal...,florists supplies...,4.47,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1
2021-08-19,66370248931,22131,6731,58911,82.54046571771035,cc8ac5c5-0dfc-4e1...,WA,Male,Morbi Non PC,"cable, satellite,...",3.15,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1
2022-05-06,89726005175,22131,6731,58911,58.46693553052902,b1f179f8-2c15-4f5...,WA,Male,Est Nunc Consulting,tent and awning s...,6.01,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1


In [99]:
# Split the order date into numeric year / month / day columns.
# The dedicated date-part functions are used instead of formatting the date to
# text and casting the text back to a number; the cast keeps the three columns
# as longs, exactly as before.
order_date = F.col("order_datetime")

sdf = sdf.withColumns({
    "year": F.year(order_date).cast("long"),
    "month": F.month(order_date).cast("long"),
    "day": F.dayofmonth(order_date).cast("long"),
})

sdf.limit(5)

                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%,year,month,day
2022-07-13,49891706470,22131,6731,58911,25.782245737474312,90498b80-984f-43c...,WA,Male,Non Vestibulum In...,tent and awning s...,5.8,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1,2022,7,13
2021-08-19,96680767841,22131,6731,58911,415.9547984625,829843bc-c571-493...,WA,Male,Ornare Limited,motor vehicle sup...,5.91,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1,2021,8,19
2022-07-13,43186523025,22131,6731,58911,33.082322196774484,c12d2520-12c8-40f...,WA,Male,Lorem Ipsum Sodal...,florists supplies...,4.47,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1,2022,7,13
2021-08-19,66370248931,22131,6731,58911,82.54046571771035,cc8ac5c5-0dfc-4e1...,WA,Male,Morbi Non PC,"cable, satellite,...",3.15,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1,2021,8,19
2022-05-06,89726005175,22131,6731,58911,58.46693553052902,b1f179f8-2c15-4f5...,WA,Male,Est Nunc Consulting,tent and awning s...,6.01,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1,2022,5,6


In [100]:
# Number of different consumer ids present in the transactions.
unique_consumers = sdf.select("consumer_id").dropDuplicates()
unique_consumers.count()

20906

20,906 total consumers

Get the number of distinct consumers and the total number of transactions for each merchant

In [101]:
# Per-merchant reach: how many distinct consumers bought from the merchant and
# how many transaction rows (with a consumer id) it has in total.
distinct_consumers = F.count_distinct("consumer_id").alias("consumer_count")
transaction_rows = F.count("consumer_id").alias("transactions_count")

merch_info = sdf.groupBy("merchant_abn").agg(distinct_consumers, transaction_rows)

merch_info.limit(5)

                                                                                

merchant_abn,consumer_count,transactions_count
73256306726,4045,4522
83412691377,9338,12361
38700038932,5080,5838
38986645707,38,38
12516851436,183,183


For each merchant, get every consumer postcode and the number of transactions originating from it

In [102]:
# Number of transactions for every (merchant, consumer postcode) pair.
pair_keys = ["merchant_abn", "postcode"]
merch_postcode = sdf.groupBy(*pair_keys).count()
merch_postcode.sort("merchant_abn").limit(5)

                                                                                

merchant_abn,postcode,count
10023283211,4559,1
10023283211,5168,1
10023283211,3415,2
10023283211,2019,1
10023283211,6461,1


For each merchant, get the most frequent postcode(s); ties are kept, so a merchant can have several

In [103]:
# Keep, for every merchant, the postcode(s) with the highest transaction count.
# Ties are kept, so one merchant can contribute several postcodes.
#
# The counts are rebuilt from `sdf` here instead of being read from the previous
# value of `merch_postcode`: this cell overwrites `merch_postcode` and drops its
# `count` column, so reading from it made a second run of the cell fail on the
# missing column.
postcode_counts = sdf.groupBy("merchant_abn", "postcode").count()

w = Window.partitionBy("merchant_abn")

merch_postcode = (
    postcode_counts
    .withColumn("maxCount", F.max("count").over(w))
    .where(F.col("count") == F.col("maxCount"))
    .drop("maxCount", "count")
)

merch_postcode.limit(5)

                                                                                

merchant_abn,postcode
10023283211,3275
10023283211,2388
10023283211,5582
10342410215,6237
10342410215,6335


In [104]:
# Load the postcode-level external (population and income) table.
external_path = "../data/curated/externaldata.parquet/"
external_sdf = spark.read.parquet(external_path)
external_sdf.limit(5)

postcode,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income
200,2875,6,1528,1292,47,2,548,23,9306,16835
800,7679,474,325,3322,2652,906,5909,33,60937,87791
801,7679,474,325,3322,2652,906,5909,33,60937,87791
804,2980,350,282,801,1047,500,1873,40,75219,98872
810,2657,326,274,541,845,386,1479,39,58753,67299


In [105]:
# Attach the postcode demographics to each merchant's top postcodes, printing
# the row count before and after so lost or duplicated rows would be visible.
rows_before_join = merch_postcode.count()
print(rows_before_join)

merch_demog = merch_postcode.join(external_sdf, on="postcode", how="inner")
rows_after_join = merch_demog.count()
print(rows_after_join)

                                                                                

33745


[Stage 1323:>                                                       (0 + 8) / 8]

33745


                                                                                

In [106]:
# Preview the joined table: one row per (merchant, top postcode) with that
# postcode's demographic columns.
merch_demog.limit(5)

                                                                                

postcode,merchant_abn,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income
3275,10023283211,9963,1127,1317,1319,3117,3083,5765,48,43136,52080
2388,10023283211,4656,629,601,751,1463,1212,2656,46,43821,51530
5582,10023283211,4209,330,348,454,1176,1901,2021,53,36420,46556
6237,10342410215,3131,315,543,341,1139,793,1915,47,47678,58959
6335,10342410215,2958,479,295,525,1014,645,1801,44,58087,78139


In [107]:
# Average every population measure over a merchant's top postcodes, rounded to
# two decimals. The output columns keep the names of the input measures.
population_fields = [
    "total_pop",
    "under10_pop",
    "adolsc_pop",
    "yng_adult_pop",
    "mid_age_pop",
    "old_pop",
]
rounded_means = [
    F.round(F.avg(field), 2).alias(field) for field in population_fields
]

merch_demog = merch_demog.groupBy("merchant_abn").agg(*rounded_means)

merch_demog.limit(5)

                                                                                

merchant_abn,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop
10023283211,6276.0,695.33,755.33,841.33,1918.67,2065.33
10342410215,4665.67,534.0,589.33,604.0,1668.0,1208.0
10346855916,11618.86,1405.29,1509.57,1881.0,3777.14,3045.86
10385163239,26723.0,2652.0,2696.0,3439.0,7889.0,10047.0
10648956813,7784.0,772.0,969.5,999.0,2407.0,2636.5


We now have, for each merchant, the average demographic statistics of its most frequent consumer postcodes

***

Classify the demographic statistics into groups based on quartiles

In [108]:
# Every column after the leading merchant key is a demographic measure.
col_names = merch_demog.columns[1:]
col_names

['total_pop',
 'under10_pop',
 'adolsc_pop',
 'yng_adult_pop',
 'mid_age_pop',
 'old_pop']

In [109]:
# Empty table, one row per demographic measure, that will hold the quartile
# cut-offs computed in the next cell.
quartile_labels = ("LowerQuartile", "Median", "UpperQuartile")
ext_data_quants_df = pd.DataFrame(columns=quartile_labels, index=col_names)
ext_data_quants_df

Unnamed: 0,LowerQuartile,Median,UpperQuartile
total_pop,,,
under10_pop,,,
adolsc_pop,,,
yng_adult_pop,,,
mid_age_pop,,,
old_pop,,,


In [110]:
# Fill in the quartile cut-offs of every demographic measure.
# The Spark frame is collected to pandas once, before the loop; converting it
# inside the loop recomputed and re-collected the whole frame for every column.
quartile_probs = [0.25, 0.5, 0.75]
merch_demog_pdf = merch_demog.select(*col_names).toPandas()

for ext_data_name in col_names:
    new_vals = merch_demog_pdf[ext_data_name].quantile(quartile_probs).tolist()
    ext_data_quants_df.loc[ext_data_name, ["LowerQuartile", "Median", "UpperQuartile"]] = new_vals

ext_data_quants_df

                                                                                

Unnamed: 0,LowerQuartile,Median,UpperQuartile
total_pop,6264.5425,8717.44,10617.745
under10_pop,651.7075,953.835,1199.98
adolsc_pop,694.0,994.435,1242.66
yng_adult_pop,922.9375,1535.625,2073.9
mid_age_pop,1994.0,2811.29,3448.0
old_pop,1712.355,2243.98,2690.13


Classify values based on quartiles for each external data type
- group 1 : $<$ LowerQuartile
- group 2 : between LowerQuartile and Median
- group 3 : between Median and UpperQuartile
- group 4 : $>$ UpperQuartile

In [111]:
# Replace each averaged measure by its quartile group:
#   1 : value <  LowerQuartile
#   2 : LowerQuartile <= value <= Median
#   3 : Median < value <= UpperQuartile
#   4 : value >  UpperQuartile
# The branches are evaluated in order, so each one only needs its upper bound.
# Previously every branch used strict inequalities, so a value exactly equal to
# a cut-off matched no branch and was labelled -1. Now -1 is only produced for
# values that cannot be compared with the cut-offs (nulls).
#
# NOTE: the measures are overwritten in place. Re-running this cell on its own
# would classify the group labels themselves; re-run the earlier cells that
# build the averaged `merch_demog` first.
for ext_data_name in col_names:
    # plain floats so the cut-offs are turned into ordinary Spark literals
    lower_cut = float(ext_data_quants_df.loc[ext_data_name, "LowerQuartile"])
    median_cut = float(ext_data_quants_df.loc[ext_data_name, "Median"])
    upper_cut = float(ext_data_quants_df.loc[ext_data_name, "UpperQuartile"])
    measure = F.col(ext_data_name)

    merch_demog = merch_demog.withColumn(
        ext_data_name,
        F.when(measure < lower_cut, F.lit(1))
        .when(measure <= median_cut, F.lit(2))
        .when(measure <= upper_cut, F.lit(3))
        .when(measure > upper_cut, F.lit(4))
        .otherwise(F.lit(-1)),
    )

merch_demog.limit(5)

                                                                                

merchant_abn,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop
10023283211,2,2,2,1,1,2
10342410215,1,1,1,1,1,1
10346855916,4,4,4,3,4,4
10385163239,4,4,4,4,4,4
10648956813,2,2,2,2,2,3


In [112]:
# Persist the per-merchant population groups, replacing any earlier export.
output_path = "../data/curated/merch_pop.parquet"
merch_demog.write.parquet(output_path, mode="overwrite")

                                                                                