# Fraud detection (Prelim analysis)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import os
from functools import reduce

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import countDistinct, col


In [2]:
sp = SparkSession.builder.appName("Fraud detection").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/03 23:47:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
sp

In [3]:
def merge_folder(trans_group: str):
    """
    Function to merge everything within yellow or green or fhvhv
    """
    dir = "../data/tables/" + trans_group +"/"
    folder_locs = os.listdir(dir)

    group_list = []
    for folder in folder_locs:
        path = dir + "/" + folder
        if os.path.isdir(path):
            # print("At current path", path)
            group_list.append(sp.read.parquet(path))

    return reduce(DataFrame.unionAll, group_list)

In [4]:
dir = "../data/tables/"
groups = ["transactions_20210228_20210827_snapshot/", "transactions_20210828_20220227_snapshot/", "transactions_20220228_20220828_snapshot/"]

final_list = []
for g in groups:
    print("Started group: ", g)
    final_list.append(sp.read.parquet(dir + g))

transactions = reduce(DataFrame.unionAll, final_list)

Started group:  transactions_20210228_20210827_snapshot/


                                                                                

Started group:  transactions_20210828_20220227_snapshot/


                                                                                

Started group:  transactions_20220228_20220828_snapshot/


                                                                                

In [5]:
transactions.count()

                                                                                

14195505

In [6]:
# To make it faster to visualize
sample = transactions.sample(0.01)
sample.show(5)
sample.count()

+-------+------------+------------------+--------------------+--------------+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|
+-------+------------+------------------+--------------------+--------------+
|     28| 77679081913| 23.22284906762641|8b054e4e-feb5-42a...|    2021-08-20|
|  18548| 81219314324|38.951008117009515|24384170-104c-41d...|    2021-08-20|
|  18557| 32361057556| 156.5087214804617|133074c4-f0f0-451...|    2021-08-20|
|     83| 43186523025| 27.15071417863618|d2a61e9c-9c6d-4c4...|    2021-08-20|
|  18580| 40555823280|42.962360805817724|0f6d49ef-0741-46f...|    2021-08-20|
+-------+------------+------------------+--------------------+--------------+
only showing top 5 rows



                                                                                

141612

In [5]:
cFraud = sp.read.option("header", True).csv("../data/tables/consumer_fraud_probability.csv")
# cFraud.show(3)
# cFraud.count()

In [8]:
mFraud = sp.read.option("header", True).csv("../data/tables/merchant_fraud_probability.csv")
# mFraud.show(3)
# mFraud.count()

+------------+--------------+------------------+
|merchant_abn|order_datetime| fraud_probability|
+------------+--------------+------------------+
| 19492220327|    2021-11-28|44.403658647495355|
| 31334588839|    2021-10-02| 42.75530083865367|
| 19492220327|    2021-12-22|38.867790051131095|
+------------+--------------+------------------+
only showing top 3 rows



114

In [None]:
customer_fraud = pd.read_csv("../data/tables/consumer_fraud_probability.csv")
customer_fraud.head()

In [None]:
sample.select("user_id").distinct().count()

In [None]:
customer_fraud.count()

In [None]:
merchant_fraud = pd.read_csv("../data/tables/merchant_fraud_probability.csv")
merchant_fraud.head()

In [None]:
merchant_fraud.count()

In [9]:
transactions.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)



In [10]:
joined = cFraud.join(transactions, 
                    ["user_id", "order_datetime"])

In [None]:
joined.count()

In [None]:
pie_chart = joined.groupBy("user_id").count().withColumnRenamed("count", "Num Trans").groupBy("Num Trans").count().toPandas()

In [None]:
pie_chart.sort_values(by='Num Trans', inplace=True)

In [None]:
plt.pie(x=pie_chart["count"], labels=pie_chart["Num Trans"])
plt.show()

In [None]:
cFraud.select(countDistinct('order_datetime')).show()

# Transactions with unidentified merchants

In [6]:
merch = sp.read.parquet("../data/tables/tbl_merchants.parquet")
merch.printSchema()

root
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- merchant_abn: long (nullable = true)



In [12]:
print("Distinct merchants in the merchand fraud data:")
mFraud.select(countDistinct('merchant_abn')).show()

print("Distinct merchants in the transactions data:")
transactions.select(countDistinct('merchant_abn')).show()

print("Distinct merchants in merchant table:")
merch.select(countDistinct('merchant_abn')).show()

Distinct merchants in the merchand fraud data:
+----------------------------+
|count(DISTINCT merchant_abn)|
+----------------------------+
|                          61|
+----------------------------+

Distinct merchants in the transactions data:


                                                                                

+----------------------------+
|count(DISTINCT merchant_abn)|
+----------------------------+
|                        4422|
+----------------------------+

Distinct merchants in merchant table:
+----------------------------+
|count(DISTINCT merchant_abn)|
+----------------------------+
|                        4026|
+----------------------------+



In [13]:
diff = transactions.select(col('merchant_abn')).subtract(merch.select(col('merchant_abn')))
diff.count()

                                                                                

396

In [25]:
diff.printSchema()

root
 |-- merchant_abn: long (nullable = true)



In [14]:
diff_list = diff.rdd.map(lambda x: x.merchant_abn).collect()
diff_df = transactions[transactions.merchant_abn.isin(diff_list)]

                                                                                

In [15]:
# transactions with unidentified merchants
print(diff_df.count())
diff_df.show(3)

                                                                                

580830
+-------+------------+------------------+--------------------+--------------+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|
+-------+------------+------------------+--------------------+--------------+
|      8| 29566626791| 74.15732460440282|71a81652-cc91-4bf...|    2021-08-20|
|  18490| 32234779638|107.14809429376949|20149572-a55b-41f...|    2021-08-20|
|     20| 67202032418| 55.46394975814555|a29071b4-29b3-4f2...|    2021-08-20|
+-------+------------+------------------+--------------------+--------------+
only showing top 3 rows



In [16]:
merch.filter(col("merchant_abn") == "29566626791").show(truncate=False)

+----+----+------------+
|name|tags|merchant_abn|
+----+----+------------+
+----+----+------------+



In [17]:
missing_abn = cFraud.join(diff_df, 
                    ["user_id", "order_datetime"])
missing_abn.count()

                                                                                

8744

In [18]:
missing = missing_abn.toPandas()
missing.head(5)

                                                                                

Unnamed: 0,user_id,order_datetime,fraud_probability,merchant_abn,dollar_value,order_id
0,3116,2021-08-20,8.80907072005723,29323795999,165.655935,138b20f4-41b9-4334-b14a-f938dff2fdb2
1,6690,2021-08-20,13.47586739405831,11240426404,42.037352,c801e4d9-ce81-4de3-92a7-dae7c5f3217c
2,7495,2021-08-20,12.103125937017689,34967436738,6593.43619,1b569d4c-6dda-4ca3-9309-030342eb4562
3,15466,2021-08-20,14.343889426678276,20206252973,8780.885221,582518f3-a5e2-48d0-b708-b1c3db7b5240
4,15625,2021-08-20,8.87757203937859,74648589246,1851.188954,bd9755f3-ef05-4d4a-9bf3-9459fed396ab


In [19]:
sns.set_style("darkgrid")
sns.set_palette("autumn")

sns.displot(missing["fraud_probability"])

<seaborn.axisgrid.FacetGrid at 0x7fab60e9ef40>

Error in callback <function flush_figures at 0x7fab62931af0> (for post_execute):


KeyboardInterrupt: 

In [20]:
missing["dollar_value"] = missing["dollar_value"].apply(np.floor)

missing["fraud_probability"] = missing["fraud_probability"].apply(pd.to_numeric)
missing["fraud_probability"] = missing["fraud_probability"].apply(np.ceil)

In [21]:
missing[missing["dollar_value"] == 157.0]

Unnamed: 0,user_id,order_datetime,fraud_probability,merchant_abn,dollar_value,order_id
202,3972,2021-07-22,9.0,74648589246,157.0,f6fe567a-ff22-4b14-9823-bc65e2144f24
1764,14070,2021-12-26,45.0,63966446164,157.0,720835bd-a4ec-4a68-926a-002b108a686a
3100,14530,2021-11-11,20.0,20562405782,157.0,7e23577e-418d-48d2-b631-a2e49312e869
4276,4935,2021-11-07,42.0,93360876880,157.0,6d417dbe-ad4f-4673-b3b5-a195ca0127a0


In [None]:
print("Max dollar value: ", max(missing.dollar_value))
print("Min dollar value: ", min(missing.dollar_value))

# Transactions with legit marchants but unidentified customers

In [7]:
cust = sp.read.option("header", True).option("delimiter", "|").csv("../data/tables/tbl_consumer.csv")
cust.printSchema()

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- consumer_id: string (nullable = true)



In [26]:
# transactions with legit merchants
sus = transactions[transactions.merchant_abn.isin(diff_list) == False]
sus.count()

                                                                                

13614675

In [24]:
# total transactions with unidentified customers
cust_diff = transactions.select(col('user_id')).subtract(cust.select(col('consumer_id')))
cust_diff.count()

                                                                                

16030

In [28]:
cust_diff.show(3)



+-------+
|user_id|
+-------+
|  18556|
|  18574|
|  18634|
+-------+
only showing top 3 rows



                                                                                

In [30]:
transactions.filter(col("user_id") == "18556").show(truncate=False)

+-------+------------+------------------+------------------------------------+--------------+
|user_id|merchant_abn|dollar_value      |order_id                            |order_datetime|
+-------+------------+------------------+------------------------------------+--------------+
|18556  |35424691626 |19.21701380329463 |94bd1ad3-cfff-4b54-87ef-b6776f67d576|2021-08-20    |
|18556  |17060583346 |53.24328160024506 |7f0a24f0-874b-4f3f-bc98-e0703ed5df49|2021-08-20    |
|18556  |94493496784 |43.389284533931544|98148779-3e44-4773-8f62-d91701e2672f|2021-08-20    |
|18556  |58476363584 |42.50663992381587 |1c415b1b-c568-45c4-b85d-93b9b50255fc|2021-08-14    |
|18556  |70038202360 |6.913717324864915 |a3df2318-d349-43e5-94f2-89d4a0c6dce7|2021-08-15    |
|18556  |91923722701 |20.296505638610977|66eae238-5ef5-4d4c-ae5a-dfcb37a32f77|2021-07-15    |
|18556  |86182863634 |472.4435638205792 |81de9685-de50-480f-8f89-00fa27978671|2021-08-27    |
|18556  |87639550697 |15.096006399854591|305ada32-1513-4877-

In [31]:
fraud_cust_missing = cFraud.select(col('user_id')).subtract(cust.select(col('consumer_id')))
fraud_cust_missing.count()

13408

In [34]:
fraud_cust_missing.show(5)

+-------+
|user_id|
+-------+
|   7252|
|   1512|
|   9586|
|  21248|
|  17401|
+-------+
only showing top 5 rows



In [32]:
fraud_cust_missing2 = cFraud.join(cust, cFraud.user_id == cust.consumer_id).drop("consumer_id")
fraud_cust_missing2.count()

11605

In [37]:
fraud_cust_missing2.where(col("user_id")==21248).show()

+-------+--------------+-----------------+----+-------+-----+--------+------+
|user_id|order_datetime|fraud_probability|name|address|state|postcode|gender|
+-------+--------------+-----------------+----+-------+-----+--------+------+
+-------+--------------+-----------------+----+-------+-----+--------+------+



In [43]:
dlist = cust.rdd.map(lambda x: x.consumer_id).collect()

                                                                                

In [46]:
# transactions with registered merchants and unregistered customers
new = sus.select(col('user_id')).subtract(cust.select(col('consumer_id')))
new.count()

                                                                                

16030

In [48]:
new.select("user_id").distinct().count()

                                                                                

16030

In [50]:
# transactions w unregistered customers but no fraud data
new.select(col('user_id')).subtract(cFraud.select(col('user_id'))).count()

                                                                                

2622

In [None]:
# analyse the fraud probabilities of transactions with unregistered customers
joined2 = cFraud.join(new, cFraud.user_id == cust.consumer_id).drop("consumer_id")

# Feature: Unregistered customers

In [13]:
def unregistered_customers(merchants, customers, transactions):
    # list of registered merchant ABNs
    abn_list = merchants.rdd.map(lambda x: x.merchant_abn).collect()

    # transactions with registered merchant ABNs
    reg_merchant_trans = transactions[transactions.merchant_abn.isin(abn_list)]

    # total transactions with unidentified customers
    unknown_cust = (transactions.select('user_id').distinct()) \
                    .subtract(cust.select(col('consumer_id')))
    unknown_cust_list = unknown_cust.rdd.map(lambda x: x.user_id).collect()

    # transactions with registered merchant ABNs but unknown customer IDs
    return reg_merchant_trans[reg_merchant_trans.user_id.isin(unknown_cust_list)]

def create_columns(unknown_cust_trans, merch):
    trans_count = unknown_cust_trans.groupBy("merchant_abn").count() \
                    .withColumnRenamed("count", "unknown_users_trans")

    users_count = unknown_cust_trans.groupBy("merchant_abn") \
                    .agg(countDistinct("user_id")) \
                    .withColumnRenamed("count(user_id)", "unknown_users_count")

    merch = merch.join(trans_count, ["merchant_abn"])
    merch = merch.join(users_count, ["merchant_abn"])
    return merch

In [14]:
unreg_custs = unregistered_customers(merch, cust, transactions, cFraud)

                                                                                

In [15]:
unreg_custs.count()

                                                                                

9061903

In [16]:
final = create_columns(unreg_custs, merch)

In [18]:
final.show(5)

22/10/04 00:00:06 WARN DAGScheduler: Broadcasting large task binary with size 1205.5 KiB
22/10/04 00:00:07 WARN DAGScheduler: Broadcasting large task binary with size 1200.3 KiB




22/10/04 00:00:20 WARN DAGScheduler: Broadcasting large task binary with size 1202.9 KiB


[Stage 44:>                                                         (0 + 8) / 9]

22/10/04 00:00:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 00:00:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 00:00:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 00:00:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 00:00:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 00:00:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 00:00:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 00:00:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




22/10/04 00:00:28 WARN DAGScheduler: Broadcasting large task binary with size 1198.3 KiB


                                                                                

22/10/04 00:00:29 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
+------------+--------------------+--------------------+-------------------+-------------------+
|merchant_abn|                name|                tags|unknown_users_trans|unknown_users_count|
+------------+--------------------+--------------------+-------------------+-------------------+
| 38700038932|Etiam Bibendum In...|[(tent and awning...|               4715|               4090|
| 19839532017|Pellentesque Habi...|([cable, Satellit...|                494|                484|
| 35344855546|Quis Tristique Ac...|[(watch, clock, a...|                984|                957|
| 83412691377|Suspendisse Sagit...|([watch, clock, a...|               9408|               7146|
| 15613631617|     Ante Industries|[[motor vehicle s...|               1211|               1162|
+------------+--------------------+--------------------+-------------------+-------------------+
only showing top 5 rows

