In [1]:
from pyspark.sql import SparkSession, Window, functions as F

In [3]:
# Build (or reuse) the Spark session shared by every cell in this notebook.
# Session options are kept in one table so they are easy to scan and tweak.
session_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.driver.memory": "4g",
    "spark.sql.session.timeZone": "Etc/UTC",
}

session_builder = SparkSession.builder.appName("Debug_etl")
for option_key, option_value in session_options.items():
    session_builder = session_builder.config(option_key, option_value)

spark = session_builder.getOrCreate()

In [14]:
# Keep only the join keys and the two fraud-probability columns from the
# curated data set; nothing else is needed to audit the join.
master_columns = [
    'merchant_abn',
    'order_datetime',
    'user_id',
    'merchant_fraud_probability',
    'user_fraud_probability',
]
master = spark.read.parquet('../../../data/curated/process_data.parquet').select(master_columns)

In [5]:
# Raw consumer fraud probabilities; the first row is a header and column
# types are inferred from the data.
consumer = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('../../../data/tables/consumer_fraud_probability.csv')
)

In [6]:
# Raw merchant fraud probabilities; the first row is a header and column
# types are inferred from the data.
merchant = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('../../../data/tables/merchant_fraud_probability.csv')
)

Check distinct counts in consumer fraud probability (user_id + order_datetime) csv

In [8]:
# Number of distinct (user_id, order_datetime) keys in the consumer fraud file.
consumer.select(['user_id', 'order_datetime']).distinct().count()

                                                                                

34765

In the consumer fraud probabilities dataset we have:
* 34765 distinct user_id/order_datetime combinations

In [9]:
# Peek at a few of the distinct (user_id, order_datetime) keys counted above.
consumer_keys = consumer.select('user_id', 'order_datetime').distinct()
consumer_keys.show(10)

+-------+--------------+
|user_id|order_datetime|
+-------+--------------+
|   5515|    2021-10-09|
|   7137|    2021-09-06|
|  16142|    2021-11-12|
|  21664|    2021-12-25|
|   7745|    2021-10-08|
|   4790|    2021-10-13|
|  19806|    2021-09-23|
|   7790|    2021-11-01|
|  15718|    2021-05-07|
|  17456|    2021-11-15|
+-------+--------------+
only showing top 10 rows



Do the same for merchant fraud probability (merchant_abn + order_datetime)

In [11]:
# Number of distinct (merchant_abn, order_datetime) keys in the merchant fraud file.
merchant.select(['merchant_abn', 'order_datetime']).distinct().count()

                                                                                

114

In the merchant fraud probability set we have:
* 114 distinct merchant_abn/order_datetime combinations

In [12]:
# Peek at a few of the distinct (merchant_abn, order_datetime) keys counted above.
merchant_keys = merchant.select('merchant_abn', 'order_datetime').distinct()
merchant_keys.show(10)

+------------+--------------+
|merchant_abn|order_datetime|
+------------+--------------+
| 31334588839|    2021-12-26|
| 27093785141|    2021-11-27|
| 31334588839|    2021-10-02|
| 80518954462|    2021-11-27|
| 79827781481|    2021-11-29|
| 11470993597|    2021-09-28|
| 23686790459|    2021-12-10|
| 19492220327|    2021-11-18|
| 90918180829|    2022-02-05|
| 31334588839|    2021-12-10|
+------------+--------------+
only showing top 10 rows



Compare with the master dataset, restricted to rows where:
* `user_fraud_probability` (Ufp) is not null
* `merchant_fraud_probability` (Mfp) is not null

Get the processed_data where there are fraud probabilities

In [15]:
# get the master data set with probabilities
# Subsets of master that carry a value for each fraud probability:
#   ufp_exists - rows with a user_fraud_probability
#   mfp_exists - rows with a merchant_fraud_probability
ufp_exists = master.dropna(subset=['user_fraud_probability'])
mfp_exists = master.dropna(subset=['merchant_fraud_probability'])


In [16]:
# Distinct (user_id, order_datetime) keys in master that have a user fraud
# probability; compare with the distinct-key count of the consumer fraud file.
ufp_keys = ufp_exists.select('user_id', 'order_datetime').distinct()
ufp_keys.count()

                                                                                

7097756

From the set, where user_fraud_probability is not null we have:
* 7,097,756 unique user_id/order_datetime combinations

How can this be possible when the consumer fraud file only contains 34,765 distinct user_id/order_datetime combinations?

In [17]:
# Total number of rows in master, for scale against the distinct-key count above.
master.count()

                                                                                

13614156

In [25]:
# Rows with a user fraud probability, lowest probabilities first.
ufp_ascending = F.col('user_fraud_probability').asc_nulls_first()
ufp_exists.sort(ufp_ascending).show(20)



+------------+--------------+-------+--------------------------+----------------------+
|merchant_abn|order_datetime|user_id|merchant_fraud_probability|user_fraud_probability|
+------------+--------------+-------+--------------------------+----------------------+
| 92773401740|    2021-10-17|   2780|                      null|                  0.08|
| 70172340121|    2022-08-19|  13529|                      null|                  0.08|
| 57757792876|    2021-11-08|   4711|                      null|                  0.08|
| 24852446429|    2021-11-08|   4711|                      null|                  0.08|
| 72472909171|    2021-11-03|   6245|                      null|                  0.08|
| 46987545043|    2021-11-03|   6245|                      null|                  0.08|
| 48465277903|    2021-11-03|   6245|                      null|                  0.08|
| 83412691377|    2022-03-22|  13529|                      null|                  0.08|
| 11944993446|    2022-10-17|  1

                                                                                

Investigate merchant fraud

In [19]:
# Distinct (merchant_abn, order_datetime) keys in master that have a merchant
# fraud probability; compare with the distinct-key count of the merchant fraud file.
mfp_keys = mfp_exists.select('merchant_abn', 'order_datetime').distinct()
mfp_keys.count()

                                                                                

16983

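This shouldn't be 16,983: the merchant fraud file only contains 114 distinct merchant_abn/order_datetime combinations, so we would expect at most 114 here.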

In [20]:
# Rows with a merchant fraud probability, lowest probabilities first.
mfp_ascending = F.col('merchant_fraud_probability').asc_nulls_first()
mfp_exists.sort(mfp_ascending).show(10)



+------------+--------------+-------+--------------------------+----------------------+
|merchant_abn|order_datetime|user_id|merchant_fraud_probability|user_fraud_probability|
+------------+--------------+-------+--------------------------+----------------------+
| 67264251405|    2021-10-19|  22239|                      0.18|                  0.95|
| 83199298021|    2021-12-14|   5450|                      0.23|                  0.21|
| 83199298021|    2021-12-14|   1050|                      0.23|                  0.29|
| 83199298021|    2021-12-14|    587|                      0.23|                  0.66|
| 83199298021|    2022-01-04|   5554|                      0.24|                  0.74|
| 83199298021|    2022-01-04|   6509|                      0.24|                  0.39|
| 83199298021|    2021-12-30|  15790|                      0.24|                  0.72|
| 83199298021|    2021-12-30|  15496|                      0.24|                   0.2|
| 19492220327|    2022-01-29|  1

                                                                                

In [21]:
# Sample of the (merchant_abn, order_datetime) keys in master that carry a
# merchant fraud probability, used to pick an ABN to investigate.
mfp_exists.select('merchant_abn', 'order_datetime').distinct().show(10)



+------------+--------------+
|merchant_abn|order_datetime|
+------------+--------------+
| 93558142492|    2021-12-27|
| 94493496784|    2022-09-27|
| 48534649627|    2021-12-26|
| 94493496784|    2021-09-20|
| 94493496784|    2022-01-10|
| 94493496784|    2021-07-11|
| 79827781481|    2022-02-02|
| 18158387243|    2022-10-06|
| 21439773999|    2021-05-17|
| 80518954462|    2021-08-13|
+------------+--------------+
only showing top 10 rows



                                                                                

Investigate abn for 18158387243 -> example to understand what is going on

In [22]:
# Worked example: every row the merchant fraud file holds for ABN 18158387243.
merchant.filter(F.col('merchant_abn') == 18158387243).show(10)

+------------+--------------+------------------+
|merchant_abn|order_datetime| fraud_probability|
+------------+--------------+------------------+
| 18158387243|    2021-11-26|29.102418916409913|
| 18158387243|    2021-11-29|28.956947892226466|
+------------+--------------+------------------+



The merchant fraud file has only two dates for this merchant.

In [23]:
# Inspect the column types of master before filtering on merchant_abn below.
master.printSchema()

root
 |-- merchant_abn: long (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- user_id: long (nullable = true)
 |-- merchant_fraud_probability: double (nullable = true)
 |-- user_fraud_probability: double (nullable = true)



In [24]:
# Same merchant (ABN 18158387243) in master: which dates carry a merchant
# fraud probability?
master.filter(F.col('merchant_abn') == 18158387243).show(10)

+------------+--------------+-------+--------------------------+----------------------+
|merchant_abn|order_datetime|user_id|merchant_fraud_probability|user_fraud_probability|
+------------+--------------+-------+--------------------------+----------------------+
| 18158387243|    2021-05-18|    183|        0.2899999999999993|                 0.098|
| 18158387243|    2022-07-28|    837|        0.2899999999999993|   0.10333333333333333|
| 18158387243|    2022-02-02|   1032|        0.2899999999999993|                  null|
| 18158387243|    2021-05-11|   1142|        0.2899999999999993|                  null|
| 18158387243|    2021-09-14|   1142|        0.2899999999999993|                  null|
| 18158387243|    2022-09-06|   1634|        0.2899999999999993|                   0.1|
| 18158387243|    2021-12-10|   2375|        0.2899999999999993|    0.1366666666666667|
| 18158387243|    2022-06-07|   2375|        0.2899999999999993|    0.1366666666666667|
| 18158387243|    2022-02-08|   

We have multiple dates that aren't included in the merchant fraud file but have merchant fraud probabilities.

Something is wrong with the join...

It looks like order_datetime is being ignored in the join.

Investigate user: 12323

In [26]:
# e.g. user_id = 12323

consumer.where(consumer['user_id'] == 12323).show(10)

+-------+--------------+-----------------+
|user_id|order_datetime|fraud_probability|
+-------+--------------+-----------------+
|  12323|    2022-01-29|34.76753477952562|
|  12323|    2021-11-30|9.217917220764651|
+-------+--------------+-----------------+



The consumer fraud file has only 2 dates for this user.

In [30]:
# Same user (12323) in master: which dates carry a user fraud probability?
master.filter(F.col('user_id') == 12323).show(10)



+------------+--------------+-------+--------------------------+----------------------+
|merchant_abn|order_datetime|user_id|merchant_fraud_probability|user_fraud_probability|
+------------+--------------+-------+--------------------------+----------------------+
| 48723861274|    2022-02-20|  12323|                      null|    0.2633333333333333|
| 10648956813|    2021-03-19|  12323|                      null|    0.2633333333333333|
| 10648956813|    2022-02-14|  12323|                      null|    0.2633333333333333|
| 10648956813|    2021-05-27|  12323|                      null|    0.2633333333333333|
| 18303734841|    2021-11-28|  12323|                      null|    0.2633333333333333|
| 45899477665|    2022-06-17|  12323|                      null|    0.2633333333333333|
| 90568944804|    2021-04-03|  12323|       0.30671428571428505|    0.2633333333333333|
| 46877655150|    2022-06-27|  12323|                      null|    0.2633333333333333|
| 45480771376|    2021-10-24|  1

                                                                                

Again, we've got lots more dates than in the user fraud probability file, yet they have fraud probabilities