In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import os
import re
import glob
import math
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
from datetime import datetime, timedelta
from collections import defaultdict
from multiprocessing import Manager

# Create (or reuse) the Spark session shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('exploration')
    .config("spark.driver.memory", "7g")
    .config("spark.executor.memory", "8g")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)
# spark.conf.set("spark.sql.session.timeZone", "America/New_York")


In [None]:
# Load the three transaction snapshots (Parquet), one DataFrame per snapshot.
transactions1 = spark.read.parquet("../data/transactions/transactions_20210228_20210827_snapshot")
transactions2 = spark.read.parquet("../data/transactions/transactions_20210828_20220227_snapshot")
transactions3 = spark.read.parquet("../data/transactions/transactions_20220228_20220828_snapshot")

# Stack the snapshots by column *name*. DataFrame.union resolves columns purely by
# position, so a snapshot written with a different column order would be merged
# into the wrong columns without any error; unionByName fails loudly instead.
transactions = transactions1.unionByName(transactions2).unionByName(transactions3)
# transactions = transactions.where(F.col("merchant_abn") == 62191208634)

# Preview the combined rows, then print summary statistics for every column.
transactions.show(truncate=False)
transactions.summary().show(truncate=False)

In [None]:
# Count transactions per distinct order_datetime value, ordered by that value,
# then hand the result to pandas-on-Spark so it can be drawn as a line chart.
counts_by_datetime = transactions.groupBy("order_datetime").count().orderBy("order_datetime")
transactions_df = counts_by_datetime.to_pandas_on_spark()
transactions_df.plot.line(x="order_datetime", y="count")

In [None]:
# Consumer fraud probabilities: CSV with a header row; preview it and summarise the columns.
consumer_fraud = spark.read.csv("../data/tables/consumer_fraud_probability.csv", header=True)
consumer_fraud.show()
consumer_fraud.summary().show()

In [None]:
# Consumer user details (Parquet): load and preview the first rows.
consumer_details = spark.read.format("parquet").load("../data/tables/consumer_user_details.parquet")
consumer_details.show()

In [None]:
# Merchant fraud probabilities: CSV with a header row; preview it and summarise the columns.
merchant_fraud = spark.read.csv("../data/tables/merchant_fraud_probability.csv", header=True)
merchant_fraud.show()
merchant_fraud.summary().show()

In [None]:
# Consumer table: pipe-delimited CSV with a header row.
tbl_consumer = (
    spark.read
    .options(header=True, delimiter="|")
    .csv("../data/tables/tbl_consumer.csv")
)
tbl_consumer.show(truncate=False)
tbl_consumer.summary().show()

In [None]:
# Pattern applied to the normalised `tags` string, which has the shape
# "[[...] ... [...] ... [take rate: ...]]":
#   group 1 -> the first inner bracket, brackets included
#   group 2 -> the contents of the second inner bracket
#   group 3 -> the text following "take rate: " in the last inner bracket
TAGS_PATTERN = r'\[(\[[^\]]*\])[^\[]*\[([^\]]*)\][^\[]*\[take rate: ([^\]]*)\]\]'

tbl_merchants = spark.read.parquet("../data/tables/tbl_merchants.parquet")

# Normalise the tags text: "(" -> "[", then ")" -> "]", then lower-case everything,
# so that one bracket-only pattern matches every row.
open_brackets_fixed = F.regexp_replace('tags', r'\(', r'\[')
tbl_merchants = tbl_merchants.withColumn('tags', open_brackets_fixed)
close_brackets_fixed = F.regexp_replace('tags', r'\)', r'\]')
tbl_merchants = tbl_merchants.withColumn('tags', F.lower(close_brackets_fixed))

# One output column per capture group; only the take rate (group 3) is numeric.
for target_column, group_idx in {'tags1': 1, 'tags2': 2, 'tags3': 3}.items():
    extracted = F.regexp_extract('tags', TAGS_PATTERN, group_idx)
    if target_column == 'tags3':
        extracted = extracted.cast(DoubleType())
    tbl_merchants = tbl_merchants.withColumn(target_column, extracted)

tbl_merchants.sort('tags3').show(truncate=False)

In [None]:
# Number of merchants per (tags2, tags3) combination, followed by the correlation
# matrix computed by pandas-on-Spark over the selected columns.
tbl_merchants_counts = tbl_merchants.groupBy('tags2', 'tags3').count()
merchant_counts_ps = tbl_merchants_counts.select('tags2', 'tags3', 'count').to_pandas_on_spark()
merchant_counts_ps.corr()

In [None]:
# https://www.abs.gov.au/census/find-census-data/datapacks?release=2021&product=GCP&geography=SA2&header=S
# Census CSV files to combine, and the columns kept from the combined table.
TABLES = [
    '2021Census_G02_AUST_SA2.csv',
    '2021Census_G04A_AUST_SA2.csv',
    '2021Census_G04B_AUST_SA2.csv',
]
SELECTED_COLUMNS = [
    'SA2_CODE_2021',
    'Median_age_persons',
    'Median_tot_fam_inc_weekly',
    'Median_tot_hhd_inc_weekly',
    'Average_household_size',
    'Tot_M',
    'Tot_P',
    'Tot_F',
]

# Read each table and inner-join it onto the running result by SA2 code,
# keeping a single copy of the key column.
census_df = None
for table in TABLES:
    table_df = spark.read.csv(
        f"../data/landing/2021_GCP_SA2_for_AUS_short-header/2021 Census GCP Statistical Area 2 for AUS/{table}",
        header=True,
    )
    census_df = table_df if census_df is None else census_df.join(table_df, on='SA2_CODE_2021')

# Keep only the selected columns: the SA2 code stays as read, every other
# column is cast to double.
key_column, *measure_columns = SELECTED_COLUMNS
census_df = census_df.select(
    key_column,
    *[F.col(name).cast(DoubleType()).alias(name) for name in measure_columns],
)

census_df.show()
census_df.summary().show()
# Persist the result so it can be re-loaded without repeating the joins.
census_df.write.mode('overwrite').parquet('../data/raw/census2021.parquet')

In [None]:
import folium

# https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files
# Plot choropleth maps for each selected column

# Re-load the census table written earlier, this time as a pandas DataFrame.
census_df = pd.read_parquet('../data/raw/census2021.parquet')

# SA2 boundaries joined to the census figures by SA2 code; any row with a
# missing value in any column is discarded.
sa2_df = (
    gpd.read_file('../data/landing/SA2_2021_AUST_SHP_GDA2020')
    .merge(census_df, left_on='SA2_CODE21', right_on='SA2_CODE_2021')
    .dropna()
)

# Coordinates and zoom level for Australia
m = folium.Map(location=[-25.2744, 133.7751], zoom_start=4)

# for column in SELECTED_COLUMNS[1:]:
# Shade each SA2 region by its Tot_P value; the same frame supplies both the
# geometries and the data, matched on SA2_CODE21.
population_layer = folium.Choropleth(
    geo_data=sa2_df,
    data=sa2_df,
    columns=['SA2_CODE21', 'Tot_P'],
    key_on='feature.properties.SA2_CODE21',
    fill_color='YlGnBu',
    fill_opacity=0.7,
    line_opacity=0.2,
    nan_fill_color='gray',
    legend_name='Total Population',
)
population_layer.add_to(m)
m