In [20]:
import geopandas as gpd
import os
os.sys.path.append("../")
from scripts.geospatial_analysis import *


from pyspark.sql import functions as F, SparkSession

In [21]:
# Create (or reuse) the Spark session used by every cell in this notebook
spark = (
    SparkSession.builder.appName("geospatial analysis")
    # Eager evaluation lets a DataFrame at the end of a cell render as a table
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    # Fixed key: this was misspelled "spark.execturo.memory", which is not a
    # Spark setting, so the executor memory value was never applied
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

In [22]:
# Load the curated inputs: consumer details and per-consumer fraud rates
consumer_info_path = '../data/curated/consumer_info.parquet'
consumer_fraud_rate_path = '../data/curated/consumer_fraud_rate.parquet'

consumer_info = spark.read.parquet(consumer_info_path)
fraudulent_consumer_rate = spark.read.parquet(consumer_fraud_rate_path)

In [23]:
# Attach consumer details to the fraud-rate records.
# The inner join keeps only consumers that appear in both tables.
fraudulent_consumer_with_info = consumer_info.join(
    other=fraudulent_consumer_rate,
    on="consumer_id",
    how="inner",
)

In [24]:
# Number of consumers in each state
consumers_by_state = consumer_info.groupBy("state")
consumers_by_state.agg(F.count("consumer_id"))

                                                                                

state,count(consumer_id)
NT,7764
ACT,4664
SA,54973
TAS,18878
WA,79146
QLD,72861
VIC,117525
NSW,144188


In [25]:
# Number of fraudulent-consumer records in each state
fraudulent_consumers_by_state = fraudulent_consumer_with_info.groupBy("state")
fraudulent_consumers_by_state.agg(F.count("consumer_id"))

state,count(consumer_id)
NT,480
ACT,313
SA,3926
TAS,1332
WA,5538
QLD,5109
VIC,8129
NSW,10037


In [26]:
# Total number of rows in the joined fraud table.
# The commented-out alternatives below give the row count of all consumers and
# the number of distinct consumer_ids in the fraud table, for comparison.
fraudulent_consumer_with_info.count()
#consumer_info.count()
#fraudulent_consumer_with_info.select("consumer_id").distinct().count() # Number of unique fraudulent consumer

34864

In [27]:
# Count the NULL values in every column of the fraudulent-consumer table.
# F.when(...) yields a non-null value only for NULL cells and F.count counts
# only non-null values, so each output column holds that column's NULL count.
null_counts = fraudulent_consumer_with_info.select(
    [
        F.count(F.when(F.col(c).isNull(), c)).alias(c)
        # Fixed: this previously iterated over `consumer_fraud_with_info`, a name
        # that is never defined in this notebook (NameError on a fresh kernel)
        for c in fraudulent_consumer_with_info.columns
    ]
)
null_counts.limit(1)
# null_counts = consumer_info.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in consumer_info.columns])
# null_counts.limit(1)


consumer_id,name,gender,state,postcode,order_datetime,fraud_probability
0,0,0,0,0,0,0


In [28]:
# Number of consumers in each postcode
consumer_info.groupBy("postcode").agg(
    F.count("consumer_id").alias("total_consumer")
)

                                                                                

postcode,total_consumer
5156,173
1238,162
3175,163
6357,161
2122,170
6336,157
6620,177
7253,171
6397,142
2142,150


ABS Digital Boundary files

`Postal Areas - 2021 - Shapefile`

https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files

In [29]:
# Load the shapefile into a GeoDataFrame.
# The path points at the "Postal Areas - 2021" shapefile from the ABS digital
# boundary files linked in the markdown above; it must be downloaded into
# ../data/raw before this cell will run.
shapefile_path = '../data/raw/POA_2021_AUST_GDA2020_SHP/POA_2021_AUST_GDA2020.shp'
gdf = gpd.read_file(shapefile_path)

In [30]:
# Per-postcode counts for all consumers and for the fraudulent-consumer table.
# Both results use the same "total_consumer" column name.
total_consumer_count = F.count("consumer_id").alias("total_consumer")

consumer_group_by_postcode = consumer_info.groupBy("postcode").agg(total_consumer_count)
fraudulent_consumer_group_by_postcode = (
    fraudulent_consumer_with_info.groupBy("postcode").agg(total_consumer_count)
)

In [31]:
# Bring the per-postcode counts into pandas so they can be merged with the
# GeoDataFrame on a string postcode key.
consumer_group_by_postcode = consumer_group_by_postcode.toPandas()
fraudulent_consumer_group_by_postcode = fraudulent_consumer_group_by_postcode.toPandas()

# Zero-pad to 4 characters: a numeric postcode such as 800 would otherwise become
# "800" and fail to match a zero-padded shapefile code such as "0800", silently
# dropping that area from the merged map data.
# NOTE(review): assumes POA_CODE21 holds 4-character codes — confirm against the shapefile.
consumer_group_by_postcode['postcode'] = (
    consumer_group_by_postcode['postcode'].astype(str).str.zfill(4)
)
fraudulent_consumer_group_by_postcode['postcode'] = (
    fraudulent_consumer_group_by_postcode['postcode'].astype(str).str.zfill(4)
)

gdf['POA_CODE21'] = gdf['POA_CODE21'].astype(str)

In [32]:
# Join the per-postcode counts onto the postal-area geometries and serialise
# each result to GeoJSON. Only postcodes present on both sides are kept.
total_consumer_merged_gdf = gdf.merge(
    consumer_group_by_postcode,
    how='inner',
    left_on='POA_CODE21',
    right_on='postcode',
)
fraudulent_consumer_merged_gdf = gdf.merge(
    fraudulent_consumer_group_by_postcode,
    how='inner',
    left_on='POA_CODE21',
    right_on='postcode',
)

total_consumer_geojson = total_consumer_merged_gdf.to_json()
fraudulent_consumer_geojson = fraudulent_consumer_merged_gdf.to_json()

In [None]:
# Map the distribution of all consumers by postcode.
# create_consumer_postcode_map is not defined in this notebook; presumably it
# comes from the star import of scripts.geospatial_analysis — verify.
# The rendered map can make the notebook sluggish, so clear this cell's output
# before saving or sharing.
create_consumer_postcode_map(total_consumer_geojson,consumer_group_by_postcode)

In [None]:
# Map the distribution of fraudulent consumers by postcode.
# The rendered map can make the notebook sluggish, so clear this cell's output
# before saving or sharing.
create_consumer_postcode_map(fraudulent_consumer_geojson,fraudulent_consumer_group_by_postcode)

Other external postcode datasets

`ASGS Edition 3 Structures URL`

`Postal Areas - 2021`

`Mesh Blocks - 2021`

https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/allocation-files

`URL for Australian postcodes`

https://www.matthewproctor.com/australian_postcodes



In [None]:
# Reading these Excel files takes about 4 minutes
# postcode_df = pd.read_excel("../data/raw/postcode.xlsx")
# mesh_blocks = pd.read_excel("../data/raw/MB_2021_AUST.xlsx")


# df = spark.read.csv(
#     "../data/raw/australian_postcodes.csv",
#     header=True, 
#     inferSchema=True, 
#     sep=",", 
#     quote='"', 
#     escape='"',
#     nullValue="NULL" 
# )

In [None]:
# Many suburbs share one postcode, so keep only the first mesh block (MB_CODE_2021) listed for each postcode
# postcode_df = postcode_df.groupby('POA_CODE_2021', as_index=False).agg({
#     'MB_CODE_2021': 'first' 
# })

In [None]:
# mesh_blocks = mesh_blocks[["MB_CODE_2021", "SA2_NAME_2021", "SA3_NAME_2021", "SA4_NAME_2021", "GCCSA_NAME_2021", "STATE_NAME_2021"]]
