# Join customer/merchant/transaction data

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd
import requests
import os

In [None]:
# Create a spark session
# Settings are applied to the builder one by one, in the order listed here.
spark_conf = [
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.driver.memory", "12g"),
    ("spark.driver.maxResultSize", "16G"),
    ("spark.executor.memory", "16G"),
    ("spark.dynamicAllocation.maxExecutors", "8"),
    ("spark.sql.files.maxPartitionBytes", "64MB"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ("spark.network.timeout", "600s"),
]

builder = SparkSession.builder.appName("Data Merge")
for conf_key, conf_value in spark_conf:
    builder = builder.config(conf_key, conf_value)

# Reuses an already-running session if one exists, otherwise starts a new one
spark = builder.getOrCreate()

In [None]:
# Load in datasets
# Parquet sources are read directly with Spark; CSV sources are parsed with
# pandas first and then converted into Spark DataFrames.

# Merchant data (parquet)
merchant = spark.read.parquet("../data/curated/part_1/clean_merchant.parquet")

# Consumer user details (parquet)
consumer_ud = spark.read.parquet("../data/tables/part_1/consumer_user_details.parquet")

# Merchant fraud probabilities (csv)
merchant_fp = spark.createDataFrame(
    pd.read_csv("../data/tables/part_1/merchant_fraud_probability.csv")
)

# Consumer list (pipe-delimited csv)
consumer_cid = spark.createDataFrame(
    pd.read_csv("../data/tables/part_1/tbl_consumer.csv", delimiter="|")
)

# Consumer fraud probabilities (csv)
consumer_fp = spark.createDataFrame(
    pd.read_csv("../data/tables/part_1/consumer_fraud_probability.csv")
)

**Join customer data**

`tbl_consumer` to `consumer_user_details`

In [None]:
# Joining user id to customers
# Normalise postcode to a string. If pandas inferred a numeric dtype when the
# CSV was read, leading zeros were dropped (e.g. "0800" became 800), which
# would stop those consumers matching string postcodes in later joins.
# Only values shorter than 4 characters are padded; longer values and nulls
# pass through unchanged.
postcode_str = F.col("postcode").cast("string")
consumer_cid = consumer_cid.withColumn(
    "postcode",
    F.when(F.length(postcode_str) < 4, F.lpad(postcode_str, 4, "0")).otherwise(postcode_str),
)

# Left join keeps every row of the consumer list, even without user details
consumer = consumer_cid.join(consumer_ud, on="consumer_id", how="left")

# Lookup of user_id -> postcode used when joining to transactions
# (postcode is already a string here, so no second cast is needed)
consumer_list = consumer.select("user_id", "postcode")
consumer.show(5)

**Join customers and transaction data**

In [None]:
# Read transaction dataset
transaction1 = spark.read.parquet("../data/tables/part_2")
transaction2 = spark.read.parquet("../data/tables/part_3")
transaction3 = spark.read.parquet("../data/tables/part_4")

# Stack the three parts. unionByName matches columns by name rather than by
# position, so a different column order in one part cannot silently misalign
# values; a missing/extra column raises an error instead.
transaction = transaction1.unionByName(transaction2).unionByName(transaction3)
transaction.show(5)

In [None]:
# Join customers to transactions
# Left join: every transaction is kept, with the consumer's postcode attached
# where the user_id is found in consumer_list.
transaction_consumer = transaction.join(
    consumer_list,
    on=["user_id"],
    how="left",
)
transaction_consumer.show(n=5)

In [None]:
# Consumers whose user_id never appears in the transaction data (anti join)
consumer_no_transaction = consumer_list.join(
    transaction,
    on=["user_id"],
    how="left_anti",
)
n_without_transaction = consumer_no_transaction.count()
print(f"Number of consumers that have not made a transaction: {n_without_transaction:,}")

## Joining customer transaction to merchant 

In [None]:
# Add consumer fraud to transactions by user and date
final_df = (
    transaction_consumer
    .join(consumer_fp, on=['user_id', 'order_datetime'], how='left')
    .withColumnRenamed('fraud_probability', 'consumer_fraud')
)
no_fraud = final_df.where(F.col("consumer_fraud").isNull()).count()
print(f"Number of transactions with no consumer fraud: {no_fraud:,}")

# Add merchant fraud to transactions by merchant and date
final_df = (
    final_df
    .join(merchant_fp, on=['merchant_abn', 'order_datetime'], how='left')
    .withColumnRenamed('fraud_probability', 'merchant_fraud')
)
no_fraud = final_df.where(F.col("merchant_fraud").isNull()).count()
print(f"Number of transactions with no merchant fraud: {no_fraud:,}")

# Impute all null fraud probabilities as 0
final_df = final_df.fillna(0, subset=['merchant_fraud', 'consumer_fraud'])
both_zero = (F.col("consumer_fraud") == 0) & (F.col("merchant_fraud") == 0)
no_fraud = final_df.where(both_zero).count()
print(f"Number of transactions with no merchant fraud or consumer fraud: {no_fraud:,}")

# final_df.show(5) # hidden to prevent crash

In [9]:
# Repartition the DataFrame to increase the number of tasks (partitions)
# final_df.repartition(200).write.mode('overwrite').option("maxRecordsPerFile", 50000).parquet('../data/curated/fraud_watch/')
# TODO: fix the error that prevents saving the customers-transactions DataFrame (final_df) to file

# Join external datasets
Here, we estimate weekly disposable income as the difference between median total personal income and the average amount spent on rent or mortgage repayments per week. The calculation uses weekly variables as follows.
$$\text{weekly disposable income} = \text{total personal income} - (\text{median rent} \times \text{proportion of renters}) - (\text{median mortgage repayment} \times \text{proportion of mortgage holders})$$

In [None]:
# Download housing data
file_path = '../data/tables/sa2_dataset/main/C21_G37_SA2.csv'
url = "https://api.data.abs.gov.au/data/C21_G37_SA2/1+2+R_T+_T...SA2..2021.?detail=full"
headers = {'accept': 'text/csv'}

# Make sure the target folder exists before writing
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Download file
# - timeout: avoid hanging forever if the server stops responding
# - raise_for_status: fail loudly instead of saving an HTTP error page as CSV
# - stream into a temporary file and rename at the end, so an interrupted
#   download never leaves a truncated CSV at file_path
tmp_path = file_path + '.part'
with requests.get(url, headers=headers, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(tmp_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                file.write(chunk)
os.replace(tmp_path, file_path)

# Read in data and cast types
# REGION and TENLLD are codes, not numbers: read both as strings so that
# tenure codes such as "1"/"2" always match the string keys used below.
data = pd.read_csv(file_path, dtype={"REGION": str, "TENLLD": str})

# Maps tenure codes (and the region column) to readable column names
variables = {"R_T":'renting',
             "2":'owned_mortgage',
             "1": 'owned_outright',
             "_T":'total_responses',
             "REGION": 'sa2_code'}

# Aggregate to ignore the 'dwelling type' feature
# Only OBS_VALUE is summed; the other columns are not needed downstream.
tenure_data = data.groupby(['REGION', 'TENLLD'], as_index=False)['OBS_VALUE'].sum()

# One row per region, one column per tenure code
tenure_data = tenure_data.pivot(index='REGION', columns='TENLLD', values='OBS_VALUE').reset_index()
tenure_data = tenure_data.rename(variables, axis=1)
tenure_data.head(5)

In [None]:
# Apply calculations
# Share of responses in each region that have a mortgage / are renting
tenure_data['percent_mortgage'] = tenure_data['owned_mortgage'] / tenure_data['total_responses']
tenure_data['percent_rent'] = tenure_data['renting'] / tenure_data['total_responses']

# Investigate number of records with missing data
zero_responses = tenure_data[tenure_data.isna().any(axis=1)]
print('Number of regions with no reponses for housing section: ', len(zero_responses))

# Handle missing null values by setting to zero
# Columns are selected by name (not position) so the result does not depend on
# column order, and the scalar fill is applied without axis=1, which is not
# needed for a scalar and avoids altering the numeric column dtypes.
percentage_tenure = tenure_data[['sa2_code', 'percent_rent', 'percent_mortgage']].fillna(0)

# Preview the affected regions (last expression, so the notebook renders it)
zero_responses.head(5)

In [None]:
# Read in dataset with median statistics
# Maps each type_of_value_code to a readable column name
variables = {
    1: "median_age",
    2: "median_total_personal_income",
    3: "median_total_family_income",
    4: "median_total_household_income",
    5: "median_mortgage_repayment",
    6: "median_rent",
    7: "avg_people_per_bedroom",
    8: "avg_household_size",
}

# Read in data
medians = pd.read_csv("../data/curated/sa2_dataset/C21_G02_SA2_clean.csv")

# Restructure table: one row per SA2 code, one column per statistic
medians = medians.pivot(index='sa2_code', columns='type_of_value_code', values='obs_value')
medians = medians.reset_index()
medians = medians.rename(columns=variables)

# Drop the leftover axis label from the pivot and store SA2 codes as strings
medians.columns.name = None
medians['sa2_code'] = medians['sa2_code'].astype(str)
medians.head(3)

Before joining housing data to this table of medians, we investigate the records and notice that there are records that have null median summaries.

In [None]:
# Read in list of SA2 codes and associated names
col_types = {"POSTCODE": str, "SA2_CODE_2021":str, "RATIO_FROM_TO": float}
correspondence = pd.read_excel("../data/tables/correspondence/CG_POSTCODE_2021_SA2_2021.xlsx", converters=col_types)
name_columns = ['SA2_CODE_2021', 'SA2_NAME_2021', 'POSTCODE']

# Find records with null columns
# (matched against the full correspondence so no SA2 region is missed)
null_regions = medians[medians.isna().any(axis=1)]
null_regions = null_regions.merge(correspondence[name_columns], left_on='sa2_code', right_on='SA2_CODE_2021')

# Keep a single SA2 per postcode for the postcode -> SA2 mapping.
# NOTE(review): the correspondence can list several SA2 regions for one
# postcode; joining transactions on postcode against such a table would
# duplicate transaction rows. Here the SA2 with the largest RATIO_FROM_TO is
# kept for each postcode (ties broken by SA2 code so the result is
# deterministic) -- confirm this "dominant SA2" choice suits the analysis.
sa2_names = (
    correspondence
    .sort_values(['POSTCODE', 'RATIO_FROM_TO', 'SA2_CODE_2021'], ascending=[True, False, True])
    .drop_duplicates(subset='POSTCODE', keep='first')
    [name_columns]
    .reset_index(drop=True)
)

# Show the name of regions associated with null values 
null_regions[['SA2_NAME_2021', 'POSTCODE']]

We see that the external dataset is only missing values for SA2 regions marked as having "no usual address", and this is ok since they are special purpose codes that don't correspond with a postcode. We can avoid any joining issues by using inner joins.

In [None]:
WEEKS_IN_MONTH = 4.345

# Inner merge: only SA2 codes present in both tables are kept
abs_df = medians.merge(percentage_tenure, on='sa2_code')

# Calculate average weekly spending on housing per SA2 zone
# note: monthly mortgage repayment converted to weekly by dividing by # weeks in a month
weekly_rent_part = abs_df['median_rent'] * abs_df['percent_rent']
weekly_mortgage_part = abs_df['median_mortgage_repayment'] * (abs_df['percent_mortgage'] / WEEKS_IN_MONTH)
abs_df['avg_housing_weekly'] = weekly_rent_part + weekly_mortgage_part
abs_df['weekly_personal_disposable'] = abs_df['median_total_personal_income'] - abs_df['avg_housing_weekly']

# Associate SA2 zones to postcodes using the sa2_names lookup, then
# rename columns for consistency
abs_df = (
    abs_df
    .merge(sa2_names, left_on='sa2_code', right_on='SA2_CODE_2021')
    .drop(columns='SA2_CODE_2021')
    .rename(columns={"SA2_NAME_2021": 'sa2_name', "POSTCODE": 'postcode'})
)
abs_df.head(5)

In [15]:
# Save to file
# Persist the pandas table as parquet at the curated SA2 location
abs_df.to_parquet(path='../data/curated/sa2_dataset/abs_medians.parquet')

# Join ABS and customer data

In [None]:
# Read in abs demographic data and customer/transaction data
# (abs_df is rebound here from the pandas table to a Spark DataFrame)
abs_df = spark.read.parquet("../data/curated/sa2_dataset/abs_medians.parquet")

# Inner join on postcode: transactions without a matching postcode are dropped.
# NOTE(review): if abs_df holds more than one row per postcode, each matching
# transaction is repeated once per row -- confirm postcode is unique in abs_df.
customer_details_abs = final_df.join(abs_df, on=['postcode'], how='inner')

# Write the joined data out as 200 partitions, replacing any previous output
(
    customer_details_abs
    .repartition(200)
    .write
    .mode('overwrite')
    .parquet('../data/curated/all_details/')
)