# Merge datasets

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import geopandas as gpd

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import geopandas as gpd
import requests
import os

# Create the Spark session

In [None]:
# Build (or reuse) the Spark session that runs every job in this notebook.
# Options are applied one by one in the order listed.
builder = SparkSession.builder.appName("Data Merge")
for option, setting in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.driver.memory", "9g"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
):
    builder = builder.config(option, setting)
spark = builder.getOrCreate()

# Join merchant & consumer data

## Use clean dataset

In [None]:
# Curated merchant table (parquet)
merchant = spark.read.parquet("../data/curated/part_1/tbl_merchants.parquet")

# Consumer user details (parquet)
consumer_ud = spark.read.parquet("../data/tables/part_1/consumer_user_details.parquet")

# The CSV sources are parsed with pandas first, then converted to Spark DataFrames.

# Merchant fraud probabilities (csv)
merchant_fp = spark.createDataFrame(
    pd.read_csv("../data/tables/part_1/merchant_fraud_probability.csv")
)

# Consumer list (pipe-delimited csv)
consumer = spark.createDataFrame(
    pd.read_csv("../data/tables/part_1/tbl_consumer.csv", delimiter="|")
)

# Consumer fraud probabilities (csv)
consumer_fp = spark.createDataFrame(
    pd.read_csv("../data/tables/part_1/consumer_fraud_probability.csv")
)

**Join customer data**

`tbl_consumer` to `consumer_user_details`

In [None]:
# Enrich each consumer row with its user details, matched on consumer_id.
# A left join keeps consumers that have no details record.
consumer = consumer.join(consumer_ud, "consumer_id", "left")
consumer.show()

In [None]:
# Keep only the identifiers and postcode needed by the later joins.
consumer = consumer.select(["user_id", "consumer_id", "postcode"])

**Join customers and transaction data**

In [None]:
# Read the three transaction parts (parquet directories).
transaction1 = spark.read.parquet("../data/tables/part_2")
transaction2 = spark.read.parquet("../data/tables/part_3")
transaction3 = spark.read.parquet("../data/tables/part_4")

# Stack the parts by column *name*. DataFrame.union matches columns purely by
# position, so a differing column order between parts would silently put
# values under the wrong columns; unionByName aligns on names and raises if
# the column sets differ.
transaction = transaction1.unionByName(transaction2).unionByName(transaction3)

In [None]:
# Attach consumer attributes to every transaction, matched on user_id.
# The left join keeps transactions that have no consumer record.
consumer_transaction = transaction.join(consumer, "user_id", "left")

# Transactions whose user_id has no match in `consumer` (left anti join).
dropped_consumer_transaction = transaction.join(consumer, "user_id", "left_anti")

In [None]:
# Preview the first five joined rows; as the last expression in the cell it is
# rendered by the notebook (eager evaluation is enabled on the Spark session).
consumer_transaction.limit(5)

## Joining customer transactions to merchant and consumer fraud probabilities

In [None]:
# Attach merchant fraud records, matched on merchant_abn and order_datetime.
# The left join keeps transactions that have no merchant fraud record.
final_df = consumer_transaction.join(
    merchant_fp,
    ["merchant_abn", "order_datetime"],
    "left",
)
final_df.show()

In [None]:
# Attach consumer fraud records, matched on user_id and order_datetime.
# The left join keeps transactions that have no consumer fraud record.
# NOTE(review): if merchant_fp and consumer_fp both carry a same-named
# non-key column, the result has duplicate column names and the parquet
# write that follows may fail - confirm against the two CSV schemas.
final_df = final_df.join(
    consumer_fp,
    ["user_id", "order_datetime"],
    "left",
)
final_df.show()

In [None]:
# Persist the merged table as parquet, replacing any previous output.
final_df.write.parquet('../data/curated/fraud_watch/', mode='overwrite')

# Join external datasets

In [None]:
# Load the cleaned SA2 statistics table with pandas. The pivot in the next
# cell relies on its sa2_code, type_of_value_code and obs_value columns.
medians = pd.read_csv("../data/curated/sa2_dataset/C21_G02_SA2_clean.csv")

In [None]:
# Readable column name for each type_of_value_code (codes run from 1 to 8)
variables = dict(
    enumerate(
        [
            "median_age",
            "median_total_personal_income",
            "median_total_family_income",
            "median_total_household_income",
            "median_mortgage_repayment",
            "median_rent",
            "avg_people_per_bedroom",
            "avg_household_size",
        ],
        start=1,
    )
)

# Reshape from long to wide: one row per sa2_code, one column per value code
medians = medians.pivot(
    index='sa2_code',
    columns=['type_of_value_code'],
    values='obs_value',
)
medians = medians.reset_index()
medians = medians.rename(columns=variables)

# Clear the leftover columns-axis name from the pivot
medians.columns.name = None

# Store SA2 codes as strings
medians['sa2_code'] = medians['sa2_code'].astype(str)

medians.head(5)

In [None]:
# Load the cleaned SA2 boundary shapefile
sa2_boundary_gdf = gpd.read_file(
    "../data/curated/sa2_boundary/SA2_2021_AUST_GDA2020_clean.shp"
)

# Code -> name lookup for SA2 zones, with both columns cast to string dtype
name_columns = ['sa2_code21', 'sa2_name21']
sa2_names = sa2_boundary_gdf[name_columns].astype('string')

In [None]:
# Flag SA2 rows where at least one median/average statistic is missing
has_missing = medians.isna().any(axis=1)

# Attach the SA2 names to those rows so the affected zones can be identified
null_regions = medians.loc[has_missing].merge(
    sa2_names,
    left_on='sa2_code',
    right_on='sa2_code21',
)

# Show only the last two columns of the merged result
null_regions.iloc[:, -2:]

In [None]:
# Download the postcode CSV, pinned to a specific commit.
# Use the raw-content host: the github.com ".../blob/..." address serves the
# HTML file-viewer page, not the CSV itself.
url = (
    "https://raw.githubusercontent.com/matthewproctor/australianpostcodes/"
    "92a036281ee4009be03cca3ab0b8b1a49b21dca7/australian_postcodes.csv"
)
base_path = "../data/tables/poa_dataset"

# Create the target directory if it doesn't exist
os.makedirs(base_path, exist_ok=True)

# Stream the body to disk in chunks. The timeout stops the cell from hanging
# forever on a stalled connection, and the `with` block releases the
# connection even if the download fails part-way.
with requests.get(
    url, headers={'accept': 'text/csv'}, stream=True, timeout=30
) as response:
    response.raise_for_status()
    with open(os.path.join(base_path, "postcodes_to_sa2.csv"), 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                file.write(chunk)

In [None]:
# get sa2 - postcodes
# write code to come up with useful statistics per 