# Outlier analysis
In this notebook, we analyse the fraud probability of merchants and the distribution of housing and income data for postal areas from the 2021 Census.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import shapefile as shp
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

import sys
sys.path.append('../scripts')
from preprocess_script import count_outliers

In [None]:
# Create (or reuse) the Spark session that runs every Spark job in this notebook.
spark = (
    SparkSession.builder.appName("Outlier Analysis")
    # Evaluate DataFrames eagerly so they render when shown in the notebook.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Memory given to the driver process.
    .config("spark.driver.memory", "9g")
    # Use UTC as the session time zone.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)
# Turn Spark logging off to keep the cell output readable.
spark.sparkContext.setLogLevel("OFF")

## 1. Merchant
1. Merchant fraud probability falls between 0 and 1 and seems reasonable, so no further outlier analysis is needed for it.
2. Further outlier analysis is carried out on the merchant data joined with aggregated transaction data.

In [None]:
# Load the cleaned merchant data, then report its size, schema and a small preview.
merchant_path = "../data/curated/part_1/clean_merchant.parquet"
merchant = spark.read.parquet(merchant_path)

row_count = merchant.count()
print(f"Number of rows: {row_count}")

merchant.printSchema()
merchant.limit(5)

In [None]:
# Print the maximum and then the minimum of each numeric merchant column.
summary_columns = [
    'take_rate',
    'sum(dollar_value)',
    'count(dollar_value)',
    'log_ratio',
    'unscaled_earning',
]
for column_name in summary_columns:
    for statistic in ('max', 'min'):
        print(f"{merchant.agg({column_name: statistic})}")

The minimum and maximum values for take_rate, log_ratio and unscaled_earning seem reasonable. We inspect their distributions further by plotting histograms.

In [None]:
# Bring only the columns to be plotted into pandas.
hist_columns = ['take_rate', 'log_ratio', 'unscaled_earning']
merchant_pd = merchant.select(*hist_columns).toPandas()

# Draw one histogram per column, each on its own figure.
for column_name in hist_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(merchant_pd[column_name], bins=20, alpha=0.7)
    ax.set_title(f'Histogram of {column_name}')
    ax.set_xlabel(column_name)
    ax.set_ylabel('Frequency')
    ax.grid(True)
    plt.show()

`unscaled_earning` is not normally distributed, so we log-transform it to bring it closer to normal before applying the log IQR outlier removal rule.

In [None]:
# Add the natural log of unscaled_earning as a new column and preview the result.
log_earning = F.log(F.col("unscaled_earning"))
merchant = merchant.withColumn("log(unscaled_earning)", log_earning)
merchant.limit(5)

In [None]:
# Re-export to pandas, this time with the log-transformed earning column.
merchant_pd = merchant.select('take_rate', 'log_ratio', 'log(unscaled_earning)').toPandas()

# Histogram of the log-transformed earnings.
col = "log(unscaled_earning)"
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(merchant_pd[col], bins=20, alpha=0.7)
ax.set_title(f'Histogram of {col}')
ax.set_xlabel(col)
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Run the project's outlier counter on each column of interest.
outlier_check_columns = ('take_rate', 'log_ratio', 'log(unscaled_earning)')
for column_name in outlier_check_columns:
    count_outliers(merchant, column_name)

### Conclusion
No confirmed outliers in joined merchant and aggregated transaction data. No removal of records.

## 2. SA2 Datasets

In [None]:
# Load the cleaned G02 SA2 table and show its (rows, columns) shape.
median_mortgage_path = "../data/curated/sa2_dataset/C21_G02_SA2_clean.csv"
median_mortgage = pd.read_csv(median_mortgage_path)
median_mortgage.shape

In [None]:
# Preview the first five rows (head() defaults to n=5).
median_mortgage.head()

In [None]:
# Load the cleaned G33 SA2 table and show its (rows, columns) shape.
# Variable definitions (ABS Census dictionary):
# https://www.abs.gov.au/census/guide-census-data/census-dictionary/2021/variables-topic/income-and-work/total-household-income-weekly-hind
# https://www.abs.gov.au/census/guide-census-data/census-dictionary/2021/variables-topic/household-and-families/household-composition-hhcd
household_income_path = "../data/curated/sa2_dataset/C21_G33_SA2_clean.csv"
household_income_weekly = pd.read_csv(household_income_path)
household_income_weekly.shape

In [None]:
# Preview the first five rows (head() defaults to n=5).
household_income_weekly.head()

In [None]:
# Look at the smallest value, the largest value and every distinct value of the column.
income_values = household_income_weekly['household_income_weekly']
print(income_values.min())
print(income_values.max())

# Distinct values in sorted order
distinct_income_values = sorted(income_values.unique())
print(distinct_income_values)

In [None]:
# Load the cleaned G38 SA2 table and show its (rows, columns) shape.
# Variable definition (ABS Census dictionary):
# https://www.abs.gov.au/census/guide-census-data/census-dictionary/2021/variables-topic/housing/mortgage-repayments-monthly-ranges-mrerd
mortgage_repayments_path = "../data/curated/sa2_dataset/C21_G38_SA2_clean.csv"
monthly_mortgage_repayments_ranges = pd.read_csv(mortgage_repayments_path)
monthly_mortgage_repayments_ranges.shape

In [None]:
# Preview the first five rows (head() defaults to n=5).
monthly_mortgage_repayments_ranges.head()

In [None]:
# Look at the smallest value, the largest value and every distinct value of the column.
repayment_values = monthly_mortgage_repayments_ranges['monthly_mortgage_repayments_ranges']
print(repayment_values.min())
print(repayment_values.max())

# Distinct values in sorted order
distinct_repayment_values = sorted(repayment_values.unique())
print(distinct_repayment_values)

In [None]:
# Load the cleaned G40 SA2 table and show its (rows, columns) shape.
# Variable definition (ABS Census dictionary):
# https://www.abs.gov.au/census/guide-census-data/census-dictionary/2021/variables-topic/housing/rent-weekly-ranges-rntrd
weekly_rent_path = "../data/curated/sa2_dataset/C21_G40_SA2_clean.csv"
weekly_rent_range = pd.read_csv(weekly_rent_path)
weekly_rent_range.shape

In [None]:
# Preview the first five rows (head() defaults to n=5).
weekly_rent_range.head()

In [None]:
# Look at the smallest value, the largest value and every distinct value of the column.
rent_values = weekly_rent_range['weekly_rent_range']
print(rent_values.min())
print(rent_values.max())

# Distinct values in sorted order
distinct_rent_values = sorted(rent_values.unique())
print(distinct_rent_values)

Conclusion: External datasets have categories instead of numeric values. No outlier analysis needed.