Fraud Imputation and Segmenting Merchants 

In [1]:
import glob
import math
import os
import re
from collections import defaultdict
from datetime import datetime, timedelta
from multiprocessing import Manager

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import pyspark.sql.functions as F
import seaborn as sns
import sklearn
import statsmodels.api as sm
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Spark session for this notebook. Settings are applied one by one to the
# builder: eager REPL evaluation, parquet metadata caching, UTC session time
# zone, and driver/executor memory sizes.
_spark_settings = [
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ('spark.driver.memory', '4g'),
    ('spark.executor.memory', '2g'),
]

_builder = SparkSession.builder.appName('Fraud Imputation')
for _key, _value in _spark_settings:
    _builder = _builder.config(_key, _value)

spark = _builder.getOrCreate()

your 131072x1 screen size is bogus. expect trouble
24/10/03 04:31:29 WARN Utils: Your hostname, DESKTOP-F216TKE resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/10/03 04:31:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/03 04:31:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the curated transaction dataset as a Spark DataFrame. The schema printed
# further down includes geographic/census-style columns (e.g. `SHAPE_Area`,
# `Median_age_persons`) alongside the transaction fields.
transactions = spark.read.parquet(".././data/curated/transaction_external")

In [4]:
# Preview the loaded rows (Spark's show() prints the first 20 rows by default).
transactions.show()

24/10/03 04:31:34 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-----+----+--------+--------------+------------+-------+-----------+------------------+--------------------+------------------+-----+-----------+-----------------+--------------------+--------------------+----+---------+--------------+----------+--------------+----------------+------------------+-------------------+------------------+-------------------------+-------------------------+----------------------+------+-------+-------+------------+------------+------------+-----------+-----------+------------+-----------+------------+--------------+------------+------------+------------+-----------+-----------+------------+-----------+------------+--------------+
|month|year|postcode|order_datetime|merchant_abn|user_id|consumer_id|      dollar_value|            order_id|     consumer_name|state|     gender|fraud_probability|       merchant_name|                tags|type|take_rate|merchant_fraud|AREASQKM21|    SHAPE_Leng|      SHAPE_Area|               lon|                lat|Median_age_per

In [5]:
# Inspect column names and types; note that `merchant_fraud` is read as a string.
transactions.printSchema()

root
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- merchant_abn: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- consumer_id: string (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- consumer_name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- fraud_probability: double (nullable = true)
 |-- merchant_name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- type: string (nullable = true)
 |-- take_rate: double (nullable = true)
 |-- merchant_fraud: string (nullable = true)
 |-- AREASQKM21: double (nullable = true)
 |-- SHAPE_Leng: double (nullable = true)
 |-- SHAPE_Area: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- Median_age_persons: double (nullable = true)
 |-- M

In [6]:
# `merchant_fraud` is loaded as a string column (see the printed schema), so cast
# it to double before any numeric use. This builds on `transactions`, the frame
# loaded above; the previously referenced `full_transactions` is never defined in
# this notebook and raised a NameError on a fresh kernel.
transactions = transactions.withColumn('merchant_fraud', F.col('merchant_fraud').cast(DoubleType()))

In [7]:
# Add the order date as a Unix timestamp (seconds). This extends `transactions`
# rather than the undefined `full_transactions`, so columns added or changed by
# earlier cells are preserved instead of being discarded.
transactions = transactions.withColumn('order_time', F.unix_timestamp('order_datetime'))

In [None]:
# Merchant fraud imputation model: fit a KNN regressor on the transactions whose
# merchant fraud probability is known and evaluate it on a 10% hold-out.
#
# NOTE(review): the schema printed above lists `merchant_fraud`, not
# `merchant_fraud_probability`; an earlier revision of this cell read
# "../data/curated/transactions.parquet" instead. Confirm which input this cell
# is meant to run against.

# All tag values seen in the data; fixing the one-hot categories up front keeps
# the encoded width the same for every frame passed through the pipeline.
categories = transactions.select('tags').distinct().toPandas().dropna().astype("string")['tags'].to_list()

# From here on `transactions` is a pandas DataFrame of labelled rows only.
transactions = transactions.where(F.col("merchant_fraud_probability").isNotNull()).toPandas()

X = transactions.drop('merchant_fraud_probability', axis=1)
y = transactions['merchant_fraud_probability']
# Fixed seed so the hold-out split (and the reported RMSE) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Columns are selected by name rather than by position, so the fitted pipeline
# can be applied to any DataFrame that contains these four columns, regardless
# of column order or extra columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('tags', Pipeline([
            ('onehot', OneHotEncoder(categories=[categories], sparse_output=False, handle_unknown='ignore')),
        ]), ['tags']),
        ('state', Pipeline([
            ('onehot', OneHotEncoder(categories=[['NSW', 'VIC', 'TAS', 'QLD', 'WA', 'SA', 'NT', 'ACT']], sparse_output=False, handle_unknown='ignore')),
        ]), ['state']),
        ('dollar_value', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
        ]), ['dollar_value']),
        ('take_rate', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
        ]), ['take_rate']),
    ]
)

# Distance-weighted 3-NN using Manhattan distance (p=1).
knn = KNeighborsRegressor(n_neighbors=3, weights='distance', p=1)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('knn', knn)
])

pipeline.fit(X_train, y_train)

# Predict once and reuse the result for both the plot and the metric.
y_pred = pipeline.predict(X_test)

# Display graph of predicted vs actual
plt.scatter(y_pred, y_test)
plt.xlabel("Predicted merchant fraud probability")
plt.ylabel("Actual merchant fraud probability")
print(f"RMSE: {root_mean_squared_error(y_test, y_pred)}")

In [None]:
# Merchant fraud probability against transaction value for the labelled rows,
# drawn on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(transactions, x='dollar_value', y='merchant_fraud_probability', alpha=0.5, color='blue', ax=ax)
ax.set_title("Merchant Fraud Against Transaction Value")
ax.set_ylabel('Merchant Fraud Probability (%)')
ax.set_xlabel("Transaction Value ($)")
ax.set_xlim((0, 110000))
ax.set_ylim((0, 95))
ax.grid()
plt.show()

# Number of labelled rows used for modelling, followed by the total row count
# of the curated transactions file.
print(len(X))
print(spark.read.parquet("../data/curated/transactions.parquet").count())

In [None]:
# JointGrid creates and sizes its own figure (via `height`), so no plt.figure()
# call is made here; a preceding plt.figure() only produced an extra empty figure
# in the cell output.
g = sns.JointGrid(data=transactions, x='dollar_value', y='merchant_fraud_probability', height=12)

# Scatter on the joint axes, KDE on the marginal axes.
g.plot(sns.scatterplot, sns.kdeplot)

g.ax_joint.set_title("Merchant Fraud Against Transaction Value")
g.ax_joint.set_xlabel("Transaction Value ($)")
g.ax_joint.set_ylabel("Merchant Fraud Probability (%)")
g.ax_joint.set_xlim(0, 110000)
g.ax_joint.set_ylim(0, 95)
g.ax_joint.grid()

# Drop the top (x) marginal so only the fraud-probability marginal remains.
g.ax_marg_x.remove()

plt.show()

In [None]:
# sns.jointplot(data=transactions, x='dollar_value', y='merchant_fraud_probability')
# Same relationship as above, with the joint and marginal layers drawn
# separately so the marginal KDE can use a wider bandwidth (bw_adjust=6).
g = sns.JointGrid(data=transactions, x="dollar_value", y="merchant_fraud_probability")
g.plot_joint(sns.scatterplot, color='blue', alpha=0.5)
g.plot_marginals(sns.kdeplot, color='blue', fill=True, bw_adjust=6)

# Keep only the y (fraud probability) marginal.
g.ax_marg_x.remove()

joint_ax = g.ax_joint
joint_ax.set_xlim(0, 110000)
joint_ax.set_ylim(0, 95)
joint_ax.set_title("Merchant Fraud Against Transaction Value")
joint_ax.set_xlabel("Transaction Value ($)")
joint_ax.set_ylabel("Merchant Fraud Probability (%)")

In [None]:
# Refit on every labelled row, then fill in the missing merchant fraud
# probabilities and save the completed dataset.
pipeline.fit(X, y)
transactions = pd.read_parquet("../data/curated/transactions.parquet")

mask = transactions['merchant_fraud_probability'].isna()

# Predicting on an empty frame raises in scikit-learn, so only predict when
# there is at least one row to impute.
if mask.any():
    X_to_predict = transactions.loc[mask].drop('merchant_fraud_probability', axis=1)
    transactions.loc[mask, 'merchant_fraud_probability'] = pipeline.predict(X_to_predict)

# Post-condition: nothing downstream should see a missing merchant fraud value.
assert transactions['merchant_fraud_probability'].notna().all(), "merchant fraud imputation left missing values"

transactions.to_parquet("../data/curated/transactions1.parquet")

In [None]:
# Consumer fraud model: fit a KNN regressor on transactions with a known
# consumer `fraud_probability` and evaluate it on a 10% hold-out.
transactions = spark.read.parquet("../data/curated/transactions1.parquet")

# All tag values seen in the data, used as the fixed one-hot category list.
categories = transactions.select('tags').distinct().toPandas().dropna().astype("string")['tags'].to_list()

# From here on `transactions` is a pandas DataFrame of labelled rows only.
transactions = transactions.where(F.col("fraud_probability").isNotNull()).toPandas()

X = transactions.drop('fraud_probability', axis=1)
y = transactions['fraud_probability']
# Single, seeded split (the cell previously split twice with no seed, so the
# first split was discarded and results changed on every run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Columns are selected by name so the pipeline does not depend on column order.
preprocessor = ColumnTransformer(
    transformers=[
        ('tags', Pipeline([
            ('onehot', OneHotEncoder(categories=[categories], sparse_output=False, handle_unknown='ignore')),
        ]), ['tags']),
        ('state', Pipeline([
            ('onehot', OneHotEncoder(categories=[['NSW', 'VIC', 'TAS', 'QLD', 'WA', 'SA', 'NT', 'ACT']], sparse_output=False, handle_unknown='ignore')),
        ]), ['state']),
        ('dollar_value', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
        ]), ['dollar_value']),
        ('take_rate', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
        ]), ['take_rate']),
    ]
)

# k = sqrt(n) / 2, clamped to at least 1 because n_neighbors must be positive
# (the unclamped value is 0 for fewer than 4 labelled rows).
n_neighbors = max(1, int(math.sqrt(len(transactions)) / 2))
knn = KNeighborsRegressor(n_neighbors=n_neighbors, weights='distance', p=1)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('knn', knn)
])

pipeline.fit(X_train, y_train)

# Predict once; reuse for the plot and the metric.
y_pred = pipeline.predict(X_test)

# Work on a copy so X_test itself is not mutated with the extra
# 'predicted'/'actual' columns.
combined = X_test.copy()
combined['predicted'] = y_pred
combined['actual'] = y_test
plt.figure(figsize=(20, 20))
sns.scatterplot(combined, x='predicted', y='actual', hue='tags')

print(f"RMSE: {root_mean_squared_error(y_test, y_pred)}")

In [None]:
# Add features to get transactions ready to pass to model
# NOTE(review): no visible cell writes transactions2.parquet (the cells above
# only write transactions1.parquet) — confirm where this file is produced.
transactions = spark.read.parquet("../data/curated/transactions2.parquet")

# Order in which the per-state `when` branches are evaluated.
STATES = ["NSW", "VIC", "QLD", "SA", "WA", "TAS", "NT", "ACT"]

# Dates flagged as holidays (late-November and Christmas 2021 dates).
HOLIDAY_DATES = [
    '2021-11-26',
    '2021-11-27',
    '2021-11-28',
    '2021-11-29',
    '2021-12-24',
    '2021-12-25',
    '2021-12-26',
]


def state_specific(suffix):
    """Return a Column holding `<state>_<suffix>` for each row's own state.

    Builds one `when` branch per entry of STATES, comparing the `state` column
    to the state code and selecting the matching `<state>_<suffix>` column.
    Rows whose `state` matches none of the codes get null.
    """
    first_state, *other_states = STATES
    chain = F.when(F.col("state") == first_state, F.col(f"{first_state}_{suffix}"))
    for state in other_states:
        chain = chain.when(F.col("state") == state, F.col(f"{state}_{suffix}"))
    return chain.otherwise(None)


# Expected revenue: transaction value discounted by both fraud probabilities and
# multiplied by the take rate; all three rates are divided by 100 here.
transactions = transactions.withColumn('expected_revenue',
                                       transactions.dollar_value
                                       * (1.0 - transactions.merchant_fraud_probability / 100.0)
                                       * (1.0 - transactions.fraud_probability / 100.0)
                                       * transactions.take_rate / 100.0)

# Calendar features derived from the order date.
transactions = transactions.withColumn('day_of_week', F.dayofweek('order_datetime'))
transactions = transactions.withColumn('day_of_month', F.dayofmonth('order_datetime'))
transactions = transactions.withColumn('day_of_year', F.dayofyear('order_datetime'))
transactions = transactions.withColumn('month', F.month('order_datetime'))

transactions = transactions.withColumn(
    "is_holiday",
    F.when(F.col("order_datetime").isin(HOLIDAY_DATES), F.lit(1)).otherwise(F.lit(0))
)

# Collapse the eight per-state sales columns into one column per measure.
transactions = transactions.withColumn("seasonal_sales", state_specific("seasonal"))
transactions = transactions.withColumn("original_sales", state_specific("original"))

# Rename the national totals first so they survive the suffix-based drop below,
# then drop every remaining per-state `*_original` / `*_seasonal` column.
transactions = transactions.withColumnRenamed("Total_seasonal", "total_seasonal_sales")
transactions = transactions.withColumnRenamed("Total_original", "total_original_sales")
transactions = transactions.select(
    [col for col in transactions.columns if not col.endswith(('original', 'seasonal'))]
)

transactions.write.mode('overwrite').parquet("../data/curated/transactions3.parquet")

# Spot-check one of the flagged holiday dates.
transactions.where(F.col('order_datetime') == '2021-12-26').show(truncate=False)

In [None]:
from pyspark.sql.window import Window
from pyspark.sql import DataFrame

# Reload the feature-enriched transactions written by the previous step.
transactions = spark.read.parquet("../data/curated/transactions3.parquet")

def skewness_udf(column):
    """Build an aggregate expression for (mean - median) / stddev of `column`.

    Despite the name this is not a Spark UDF: it only combines the built-in
    aggregate functions `F.mean`, `F.median` and `F.stddev` into a single
    Column expression, intended for use inside `.agg(...)`.
    """
    mean_minus_median = F.mean(column) - F.median(column)
    spread = F.stddev(column)
    return mean_minus_median / spread

# Per-merchant summary used for segmentation: approximate medians, the
# (mean - median) / stddev skew measure, transaction count and mean value.
# (The unused `window_spec` window definition was removed; nothing referenced it.)
agg_df = transactions.groupBy("merchant_abn").agg(
    F.expr("percentile_approx(fraud_probability, 0.5)").alias("fraud_probability_median"),
    skewness_udf(F.col("fraud_probability")).alias("fraud_probability_skewness"),
    F.expr("percentile_approx(dollar_value, 0.5)").alias("dollar_value_median"),
    F.expr("percentile_approx(expected_revenue, 0.5)").alias("expected_revenue_median"),
    skewness_udf(F.col("dollar_value")).alias("dollar_value_skewness"),
    F.count("merchant_abn").alias("record_count"),
    # F.first picks an arbitrary row's value per group; this assumes tags and
    # take_rate are constant within a merchant — TODO confirm.
    F.first("tags").alias("tags"),
    F.first("take_rate").alias("take_rate"),
    F.mean("merchant_fraud_probability").alias("merchant_fraud_probability"),
    F.mean("dollar_value").alias("mean_dollar_value"),
)

agg_df.show()

Segment Data According to Sales and Revenue 

In [None]:
# Segment merchants with KMeans on log10(number of sales) and
# log10(mean transaction value).
y = 'mean_dollar_value'
df = agg_df.toPandas()
df = df.fillna(0)

# Both clustering features are put on a log10 scale.
df['record_count'] = np.log10(df['record_count'])
df[y] = np.log10(df[y])

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=3, random_state=0))
])

cluster_ids = pd.Series(
    pipeline.fit_predict(df[['record_count', y]]),
    index=df.index,
)

# KMeans label numbers carry no meaning and can change with the data or the
# library version, so segment names are assigned from the clusters themselves:
# clusters are ordered by their mean log transaction value, lowest first.
# NOTE(review): this assumes price and sales volume are inversely ordered across
# the three clusters, as the segment names imply — confirm against the plot.
SEGMENT_NAMES = ["Low price, High sales", "Med price, Med sales", "High price, Low sales"]
ids_by_price = df[y].groupby(cluster_ids).mean().sort_values().index
segment_of = dict(zip(ids_by_price, SEGMENT_NAMES))

# Map ids to names in one step instead of writing strings into an integer
# column through .loc, which pandas deprecates.
df['cluster'] = cluster_ids.map(segment_of)

# Only the two needed columns are converted to Spark.
clusters = spark.createDataFrame(df[['merchant_abn', 'cluster']])
clusters.show()


In [None]:
# Scatter of the merchant segments on the log10 axes used for clustering,
# drawn on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(df, x='record_count', y=y, hue='cluster', palette=sns.color_palette(), alpha=0.5, s=30, ax=ax)
ax.set_title("Merchant Segmentation")
ax.set_ylabel('Log10(Average Transaction Value)')
ax.set_xlabel("Log10(Number of Sales)")
plt.show()

The graph above shows that merchants can be segmented into three categories based on their number of sales and their average transaction value.

In [None]:
# Interactive view of the same log-log scatter, coloured by merchant tag.
# The title previously said "by KMeans Cluster" although points are coloured by
# `tags`; it now describes what is actually plotted.
fig = px.scatter(
    df,
    x='record_count',
    y=y,
    color='tags',
    title='log(mean dollar value) vs log(sales) by Merchant Tag',
    labels={
        'record_count': 'log(sales)',
        y: 'log(mean dollar value)'
    },
    hover_data=['merchant_abn']
)

# Display plot
fig.show()