# Join customer/merchant/transaction data

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import geopandas as gpd

In [None]:
# Build (or reuse) the Spark session shared by every cell in this notebook.
# Options are listed once here and applied to the builder in order.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.driver.memory": "9g",
    "spark.sql.session.timeZone": "Etc/UTC",
}

session_builder = SparkSession.builder.appName("Data Merge")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

In [None]:
# Load the input datasets.
# Parquet sources are read directly by Spark; CSV sources are parsed with
# pandas and converted to Spark DataFrames in a single step, so each name
# only ever refers to a Spark DataFrame.

# Merchant data (parquet)
merchant = spark.read.parquet("../data/curated/part_1/clean_merchant.parquet")

# Merchant fraud probabilities (csv)
merchant_fp = spark.createDataFrame(
    pd.read_csv("../data/tables/part_1/merchant_fraud_probability.csv")
)

# Consumer list (pipe-delimited csv)
consumer_cid = spark.createDataFrame(
    pd.read_csv("../data/tables/part_1/tbl_consumer.csv", delimiter="|")
)

# Consumer fraud probabilities (csv)
consumer_fp = spark.createDataFrame(
    pd.read_csv("../data/tables/part_1/consumer_fraud_probability.csv")
)

# Consumer user details (parquet)
consumer_ud = spark.read.parquet("../data/tables/part_1/consumer_user_details.parquet")

**Join customer data**

`tbl_consumer` to `consumer_user_details`

In [None]:
# Match each consumer record with its user details on consumer_id.
# The left join keeps every row of consumer_cid, matched or not.
consumer = consumer_cid.join(
    consumer_ud,
    on="consumer_id",
    how="left",
)

# Slim lookup (user_id, postcode) used when joining onto transactions.
consumer_list = consumer.select("user_id", "postcode")

consumer.show(5)

**Join customers and transaction data**

In [None]:
# Read the three transaction parts (part_2 .. part_4) and stack them into a
# single DataFrame.
transaction1 = spark.read.parquet("../data/tables/part_2")
transaction2 = spark.read.parquet("../data/tables/part_3")
transaction3 = spark.read.parquet("../data/tables/part_4")

# unionByName aligns columns by name. DataFrame.union aligns them by position,
# so it would silently mix up values if any part stored its columns in a
# different order; unionByName raises instead when the column names differ.
transaction = transaction1.unionByName(transaction2).unionByName(transaction3)
transaction.show(5)

In [None]:
# Add the consumer lookup columns to every transaction. The left join on
# user_id keeps transactions even when the user is absent from consumer_list.
transaction_consumer = transaction.join(
    consumer_list,
    on="user_id",
    how="left",
)
transaction_consumer.show(5)

In [None]:
# Consumers without any transaction: the anti join keeps only the
# consumer_list rows whose user_id has no match in `transaction`.
consumer_no_transaction = consumer_list.join(
    transaction,
    on="user_id",
    how="left_anti",
)
inactive_consumers = consumer_no_transaction.count()
print(f"Number of consumers that have not made a transaction: {inactive_consumers:,}")

## Joining customer transactions to consumer and merchant fraud data

In [None]:
# Step 1: attach the consumer fraud probability, matched on user and order date.
with_consumer_fraud = transaction_consumer.join(
    consumer_fp,
    on=["user_id", "order_datetime"],
    how="left",
).withColumnRenamed("fraud_probability", "consumer_fraud")

no_fraud = with_consumer_fraud.filter(col("consumer_fraud").isNull()).count()
print(f"Number of transactions with no consumer fraud: {no_fraud:,}")

# Step 2: attach the merchant fraud probability, matched on merchant and order date.
# The consumer column was renamed above, so the incoming `fraud_probability`
# column does not clash with it.
with_all_fraud = with_consumer_fraud.join(
    merchant_fp,
    on=["merchant_abn", "order_datetime"],
    how="left",
).withColumnRenamed("fraud_probability", "merchant_fraud")

no_fraud = with_all_fraud.filter(col("merchant_fraud").isNull()).count()
print(f"Number of transactions with no merchant fraud: {no_fraud:,}")

# Step 3: a missing fraud record is treated as a probability of 0.
final_df = with_all_fraud.fillna(0, subset=["merchant_fraud", "consumer_fraud"])

both_zero = (col("consumer_fraud") == 0) & (col("merchant_fraud") == 0)
no_fraud = final_df.filter(both_zero).count()
print(f"Number of transactions with no merchant fraud or consumer fraud: {no_fraud:,}")

final_df.show(5)

In [None]:
# Persist the fraud-annotated transactions as parquet, replacing any output
# already present at this location.
final_df.write.parquet("../data/curated/fraud_watch/", mode="overwrite")

# Join external datasets

In [11]:
# Long-format SA2 statistics table; the following cell pivots it using the
# sa2_code, type_of_value_code and obs_value columns.
medians = pd.read_csv(
    "../data/curated/sa2_dataset/C21_G02_SA2_clean.csv"
)

In [None]:
# Readable column names for each `type_of_value_code`.
variables = {
    1: "median_age",
    2: "median_total_personal_income",
    3: "median_total_family_income",
    4: "median_total_household_income",
    5: "median_mortgage_repayment",
    6: "median_rent",
    7: "avg_people_per_bedroom",
    8: "avg_household_size",
}

# Reshape from long to wide: one row per sa2_code, one column per statistic.
# NOTE: this cell overwrites `medians`, so re-running it without re-running
# the load cell fails (the pivoted frame no longer has `type_of_value_code`).
medians_wide = medians.pivot(
    index="sa2_code",
    columns=["type_of_value_code"],
    values="obs_value",
)
medians = medians_wide.reset_index().rename(columns=variables)

# Drop the leftover columns-axis name produced by the pivot.
medians.columns.name = None

# Store the SA2 code as a string.
medians["sa2_code"] = medians["sa2_code"].astype(str)

medians.head()

In [None]:
# Identify SA2 zones where at least one median/average statistic is missing.
rows_with_nulls = medians.isna().any(axis=1)

# Attach SA2 names to those zones (default inner merge: zones without a match
# in sa2_names are dropped).
# NOTE(review): `sa2_names` is not defined in any cell visible above; confirm
# it is loaded before this cell runs on a fresh kernel.
# TODO: correct this by using files in correspondence
null_regions = medians.loc[rows_with_nulls].merge(
    sa2_names,
    left_on="sa2_code",
    right_on="sa2_code21",
)

# Show only the last two columns of the merged frame.
null_regions.iloc[:, -2:]