# Exploratory Data Analysis

In [2]:
# Initialise a spark session
import pandas as pd
from collections import Counter
import os
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql import functions as F
import geopandas as gpd
import folium


spark = (
    SparkSession.builder.appName("EDA")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "16g")  # Increase driver memory
    .config("spark.executor.memory", "16g")  # Increase executor memory
    .config("spark.executor.instances", "4")  # Increase the number of executor instances
    .config("spark.driver.maxResultSize", "2g")
    .config("spark.sql.shuffle.partitions", "200") \
    .getOrCreate()
)

In [4]:
# Read Files
transactions = spark.read.parquet('../data/curated/flagged_fraud')
sa4_shapefile_path = '../data/external/SA4_2021_AUST_GDA2020.shp'
sa4_shapefile = gpd.read_file(sa4_shapefile_path)

In [5]:
transactions.printSchema()

root
 |-- merchant_abn: long (nullable = true)
 |-- year_week: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- consumer_id: long (nullable = true)
 |-- fraud_probability_consumer: double (nullable = true)
 |-- name_consumer: string (nullable = true)
 |-- address_consumer: string (nullable = true)
 |-- state_consumer: string (nullable = true)
 |-- postcode_consumer: integer (nullable = true)
 |-- gender_consumer: string (nullable = true)
 |-- name_merchant: string (nullable = true)
 |-- fraud_probability_merchant: double (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- order_month_year: string (nullable = true)
 |-- SA4_CODE_2011: string (nullable = true)
 |-- SA4_NAME_2011: string (nullable = true)
 |-- unemployment_rate: string (nullable = true)
 |-- consumer_weekly_transaction: long (nullable = true)
 |-- merchant_weekly_transaction: long (nullable = true)
 |-- is_

## Aggregating Dataset by SA4 Code

In [9]:
# Filter out rows where SA4 code is 'Unregistered'
filtered_transactions = transactions.filter(transactions['SA4_CODE_2011'] != 'Unregistered')

# Group by 'SA4_CODE_2011' and aggregate data
sa4_grouped_df = filtered_transactions.groupBy('SA4_CODE_2011').agg(
    F.count('order_id').alias('total_transactions'),
    F.countDistinct('user_id').alias('total_customers'),
    F.sum(F.when(F.col('is_fraud') == True, 1).otherwise(0)).alias('fraudulent_transactions'),
    F.avg('dollar_value').alias('avg_order_value'),
    F.mode('merchant_category').alias('popular_merchant_category')
)

sa4_grouped_df = sa4_grouped_df.toPandas()

                                                                                

## Generate Maps

In [10]:
# Merge shapefile with transactions data on SA4 code
merged_sa4 = sa4_shapefile.merge(sa4_grouped_df, left_on='SA4_CODE21', right_on='SA4_CODE_2011', how='left')

# Fill NA values with 0 for transaction columns
merged_sa4['total_transactions'].fillna(0, inplace=True)
merged_sa4['total_customers'].fillna(0, inplace=True)
merged_sa4['fraudulent_transactions'].fillna(0, inplace=True)

# Get the centroid of the SA4 map to initialize the folium map
center = merged_sa4.geometry.centroid.unary_union.centroid

  center = merged_sa4.geometry.centroid.unary_union.centroid


### Plotting Areas with Most Transactions

In [19]:
m = folium.Map(location=[center.y, center.x], zoom_start=5)

folium.Choropleth(
    geo_data=merged_sa4.to_json(),
    name='choropleth',
    data=merged_sa4,
    columns=['SA4_CODE21', 'total_transactions'],
    key_on='feature.properties.SA4_CODE21',
    fill_color='YlGnBu', 
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Total Transactions',
).add_to(m)

# Add drop pins for top 3 areas with the most transactions
top_areas = merged_sa4.nlargest(3, 'total_transactions')
for idx, row in top_areas.iterrows():
    folium.Marker(
        location=[row['geometry'].centroid.y, row['geometry'].centroid.x],
        popup=f"Area: {row['SA4_NAME21']}\nTotal Transactions: {row['total_transactions']}",
    ).add_to(m)

folium.LayerControl().add_to(m)

# Comment out to show map
#m

<folium.map.LayerControl at 0x3618ce6d0>

### Plotting Areas with Most Customers

In [18]:
m2 = folium.Map(location=[center.y, center.x], zoom_start=5)

folium.Choropleth(
    geo_data=merged_sa4.to_json(),
    name='choropleth',
    data=merged_sa4,
    columns=['SA4_CODE21', 'total_customers'],
    key_on='feature.properties.SA4_CODE21',
    fill_color='YlGnBu', 
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Total Customers',
).add_to(m2)

# Add drop pins for top 3 areas with the most customers
top_areas = merged_sa4.nlargest(3, 'total_customers')
for idx, row in top_areas.iterrows():
    folium.Marker(
        location=[row['geometry'].centroid.y, row['geometry'].centroid.x],
        popup=f"Area: {row['SA4_NAME21']}\nTotal Customers: {row['total_customers']}",
    ).add_to(m2)

folium.LayerControl().add_to(m2)

# Comment out to show map
#m2

<folium.map.LayerControl at 0x3488a4f10>

### Areas with Most Fraudulent Transactions

In [17]:
m3 = folium.Map(location=[center.y, center.x], zoom_start=5)

folium.Choropleth(
    geo_data=merged_sa4.to_json(),
    name='choropleth',
    data=merged_sa4,
    columns=['SA4_CODE21', 'fraudulent_transactions'],
    key_on='feature.properties.SA4_CODE21',
    fill_color='YlGnBu', 
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Fraudulent Transactions',
).add_to(m3)

# Add drop pins for top 3 areas with the most transactions
top_areas = merged_sa4.nlargest(3, 'fraudulent_transactions')
for idx, row in top_areas.iterrows():
    folium.Marker(
        location=[row['geometry'].centroid.y, row['geometry'].centroid.x],
        popup=f"Area: {row['SA4_NAME21']}\nFraudulent Transactions: {row['fraudulent_transactions']}",
    ).add_to(m3)

folium.LayerControl().add_to(m3)

# Comment out to show map
#m3

<folium.map.LayerControl at 0x329b44ed0>