In [1]:
# from pyspark.sql import SparkSession
# import pyspark.pandas as ps
import random
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

# spark = (
#     SparkSession.builder.appName("BNPL")
#     .config("spark.sql.repl.eagerEval.enabled", True) 
#     .config("spark.sql.parquet.cacheMetadata", "true")
#     .config("spark.driver.memory", "8g")
#     .getOrCreate()
# )

# Input locations, all derived from the data root (relative to this notebook).
data_directory = "../data/"
tables_directory = f"{data_directory}tables/"
merchant_fraud_directory = f"{tables_directory}merchant_fraud_probability.csv"
consumer_fraud_directory = f"{tables_directory}consumer_fraud_probability.csv"

# Seed both the stdlib and the NumPy global random generators with the same value.
seed = 42
np.random.seed(seed)
random.seed(seed)

In [2]:
# Load merchant and transaction data, dropping transactions whose merchant is not listed in the merchants table
def normalise_tags(merchants):
    """Add parsed ``sector_tags``, ``revenue_band`` and ``take_rate`` columns.

    Every row's ``tags`` value is parsed with ``process_tags`` and the three
    resulting pieces are written to new columns. The sector tags are then
    lower-cased, runs of spaces are collapsed to a single space, and
    leading/trailing whitespace is stripped.

    The frame passed in is modified in place and is also returned.
    """
    def _parse_row(row):
        return process_tags(row.tags)

    parsed = merchants.apply(_parse_row, axis='columns', result_type='expand')
    merchants[["sector_tags", "revenue_band", "take_rate"]] = parsed

    sector = merchants["sector_tags"].str.lower()
    sector = sector.str.replace(' +', ' ', regex=True)
    merchants["sector_tags"] = sector.str.strip()
    return merchants
# Three bracketed groups separated by ", " and wrapped in an outer bracket pair;
# round and square brackets are accepted interchangeably.
#   group 1: sector description (may itself contain commas)
#   group 2: revenue band, a single lowercase letter
#   group 3: the number after "<label>: ", an integer or a decimal
_TAG_PATTERN = re.compile(
    r'^[\[\(]{2}(.+?(?:, ?.+)*)[\]\)], [\[\(]([a-z])[\]\)], [\(\[].+: (\d+(?:\.\d+)?)[\)\]]{2}$'
)


def process_tags(tag):
    """Split a merchant ``tags`` string into its three components.

    Parameters
    ----------
    tag : str
        Raw tag text such as
        ``"((furniture, home furnishings), (e), (take rate: 0.18))"``.

    Returns
    -------
    tuple of str
        ``(sector_tags, revenue_band, take_rate)``. All three are returned as
        the matched text; the take rate is not converted to a number.

    Raises
    ------
    ValueError
        If ``tag`` does not follow the expected three-group layout.
    """
    result = _TAG_PATTERN.search(tag)
    if result is None:
        # Fail with the offending value instead of an AttributeError on None.
        raise ValueError(f"Unrecognised merchant tag format: {tag!r}")
    return result.group(1), result.group(2), result.group(3)

# Load the merchants table (with its tags parsed into separate columns) and all
# transaction snapshots. Paths are built from `tables_directory` so the data
# location is configured in one place.
merchants = normalise_tags(pd.read_parquet(tables_directory + "tbl_merchants.parquet"))
transactions_directories = [
    tables_directory + "transactions_20210228_20210827_snapshot",
    tables_directory + "transactions_20210828_20220227_snapshot",
    tables_directory + "transactions_20220228_20220828_snapshot",
]
transactions = pd.concat([pd.read_parquet(trans_dir) for trans_dir in transactions_directories])

# Keep only transactions whose merchant appears in the merchants table.
# NOTE(review): this assumes the merchants frame is indexed by merchant ABN — confirm.
# `.copy()` detaches the filtered result so that later column assignments on
# `transactions` do not trigger SettingWithCopyWarning.
transactions = transactions[transactions["merchant_abn"].isin(merchants.index.to_numpy())].copy()
transactions

Unnamed: 0,user_id,merchant_abn,dollar_value,order_id,order_datetime
0,1,28000487688,133.226894,0c37b3f7-c7f1-48cb-bcc7-0a58e76608ea,2021-02-28
1,18485,62191208634,79.131400,9e18b913-0465-4fd4-92fd-66d15e65d93c,2021-02-28
2,1,83690644458,30.441348,40a2ff69-ea34-4657-8429-df7ca957d6a1,2021-02-28
3,18488,39649557865,962.813341,f4c1a5ae-5b76-40d0-ae0f-cb9730ac325a,2021-02-28
4,2,80779820715,48.123977,cd09bdd6-f56d-489f-81ea-440f4bda933c,2021-02-28
...,...,...,...,...,...
6044127,11135,49891706470,24.174809,0c955d44-b77b-4cfb-be2f-b5a2d19386ef,2022-10-26
6044129,11136,24852446429,5.384811,ba2ce936-ae49-4b70-a4b1-e8f8bacaefb3,2022-10-26
6044130,11136,63685007785,48.129887,a6194c61-ae7a-4037-a425-2dd7cda42fb7,2022-10-26
6044131,11137,85502310765,4.768752,72a3639e-25d5-4bcc-9e8b-26e737b8c0e4,2022-10-26


In [3]:
# Read the merchant-level and consumer-level fraud probability tables.
merchant_fraud_prob = pd.read_csv(merchant_fraud_directory)
consumer_fraud_prob = pd.read_csv(consumer_fraud_directory)

# Convert order_datetime to a datetime dtype in the two fraud tables and in the transactions.
for _frame in (merchant_fraud_prob, consumer_fraud_prob, transactions):
    _frame["order_datetime"] = pd.to_datetime(_frame["order_datetime"])

In [4]:
# Transactions that have a matching merchant fraud record (same merchant, same date).
merchant_fraud_total = pd.merge(
    transactions,
    merchant_fraud_prob,
    how='inner',
    on=['merchant_abn', 'order_datetime'],
)
# Transactions that have a matching consumer fraud record (same user, same date).
consumer_fraud_total = pd.merge(
    transactions,
    consumer_fraud_prob,
    how='inner',
    on=['user_id', 'order_datetime'],
)

Consumer fraud. First we need to derive the right feature: the normalised difference of each consumer's daily spend from that consumer's median.



In [5]:
# Sum dollar_value over every (user_id, order_datetime) pair, giving one row
# per consumer per date with the total in `total_dollar`.
consumer_transactions_day = (
    transactions
    .groupby(['user_id', 'order_datetime'])
    .agg(total_dollar=('dollar_value', 'sum'))
    .reset_index()
)
consumer_transactions_day

Unnamed: 0,user_id,order_datetime,total_dollar
0,1,2021-02-28,163.668243
1,1,2021-03-02,301.677498
2,1,2021-03-04,5.904951
3,1,2021-03-06,18.135592
4,1,2021-03-09,64.670772
...,...,...,...
8756833,24081,2022-10-19,575.305082
8756834,24081,2022-10-20,705.340737
8756835,24081,2022-10-21,5.876394
8756836,24081,2022-10-24,68.147276


Now that we have grouped the transactions by day for each customer, we will find the transactions that appear in the fraud data, identify the outliers using the IQR, and record each day's (normalised) distance from the median.



In [6]:
# Median and quantile (IQR) scaling of each consumer's daily totals:
#     (x - median) / (Q3 - Q1), computed separately per user_id.
# `transform` returns a result aligned to the frame's own index, so it can be
# assigned straight back as a column. `groupby(...).apply` with a like-indexed
# result prepends the group key to the index on pandas >= 2.0, which makes the
# column assignment fail.
# Note: a consumer whose Q1 equals Q3 has an IQR of 0, so the division yields
# inf or NaN for that consumer's rows.
consumer_transactions_day['normal_total_dollar'] = (
    consumer_transactions_day
    .groupby('user_id')['total_dollar']
    .transform(lambda x: (x - x.median()) / (x.quantile(0.75) - x.quantile(0.25)))
)

In [7]:
# Display the per-consumer daily totals together with the new scaled column.
consumer_transactions_day

Unnamed: 0,user_id,order_datetime,total_dollar,normal_total_dollar
0,1,2021-02-28,163.668243,0.302918
1,1,2021-03-02,301.677498,0.969702
2,1,2021-03-04,5.904951,-0.459306
3,1,2021-03-06,18.135592,-0.400214
4,1,2021-03-09,64.670772,-0.175382
...,...,...,...,...
8756833,24081,2022-10-19,575.305082,1.838081
8756834,24081,2022-10-20,705.340737,2.362365
8756835,24081,2022-10-21,5.876394,-0.457770
8756836,24081,2022-10-24,68.147276,-0.206703


After median and quantile (IQR) scaling, we treat any consumer-day whose normalised total dollar value is greater than 2 as an outlier. Now we will build a linear model relating the given fraud probabilities to the scaled distance of the transactions.

In [8]:
# Attach the supplied consumer fraud probability to each scaled consumer-day;
# the inner join keeps only (user_id, order_datetime) pairs present in both frames.
fraud_consumer_norm_prob = pd.merge(
    consumer_transactions_day,
    consumer_fraud_prob,
    how='inner',
    on=['user_id', 'order_datetime'],
)
fraud_consumer_norm_prob

Unnamed: 0,user_id,order_datetime,total_dollar,normal_total_dollar,fraud_probability
0,1,2022-02-20,2479.076338,11.489677,9.805431
1,2,2021-08-30,2132.002253,9.594187,9.599514
2,2,2021-09-25,2236.461666,10.089482,10.069851
3,3,2021-11-03,2334.493717,10.065918,8.300636
4,4,2021-10-09,2325.039260,10.299494,9.633302
...,...,...,...,...,...
32762,24079,2021-10-12,4708.648444,19.229422,14.948165
32763,24079,2021-11-08,2816.251072,11.329474,8.940524
32764,24079,2021-11-26,2784.152043,11.195474,8.838622
32765,24081,2021-10-08,4320.715449,16.939012,14.343772


In [9]:
# Target: the supplied fraud probability. Feature: the scaled daily total.
y = fraud_consumer_norm_prob.loc[:, 'fraud_probability']
X = fraud_consumer_norm_prob.loc[:, 'normal_total_dollar']

# Ordinary least-squares regressor, fitted in a later cell.
lm = linear_model.LinearRegression()

In [10]:
# Build the (n_samples, 1) feature matrix directly from the source frame rather
# than reshaping `X` in place. Re-running this cell then always starts from
# the same input instead of failing on an already-converted array.
X = fraud_consumer_norm_prob[['normal_total_dollar']].to_numpy()

# Hold out 10% of the labelled consumer-days for evaluation; the split is
# seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=seed)

In [11]:
# Fit the linear model on the training portion.
lm.fit(X=X_train, y=y_train)

In [12]:
# Fitted slope for the single feature (the scaled daily total).
lm.coef_

array([0.3404328])

In [13]:
# Predictions for the held-out rows, used by the metric cells below.
y_pred = lm.predict(X=X_test)

In [14]:
# Coefficient of determination on the held-out rows.
metrics.r2_score(y_true=y_test, y_pred=y_pred)

0.5268505911379817

In [15]:
# Mean squared error on the held-out rows.
metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)

39.21100713371064

In [16]:
# Mean absolute error on the held-out rows.
metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

3.9038433267980843

We now have a fraud model that predicts the fraud probability to within about ±3.9 percentage points on average on the held-out test set (MAE ≈ 3.90, RMSE ≈ 6.26, R² ≈ 0.53). Next we will find all the outliers in the transactions, give each one a predicted fraud probability, and remove it at random with that probability.



In [17]:
# Scaled values above this threshold are treated as outliers and scored by the model.
OUTLIER_THRESHOLD = 2
# Fraud probability (in percent) given to every non-outlier consumer-day.
BASELINE_FRAUD_PERCENT = 0.00001

scaled_spend = consumer_transactions_day['normal_total_dollar']
is_outlier = scaled_spend > OUTLIER_THRESHOLD  # NaN compares False, so NaN rows keep the baseline

# Start every row at the baseline, then overwrite the outliers with a single
# batched prediction instead of calling the model once per row.
fraud_percent = pd.Series(BASELINE_FRAUD_PERCENT, index=consumer_transactions_day.index, dtype='float64')
if is_outlier.any():  # predict() cannot be called on an empty array
    fraud_percent.loc[is_outlier] = lm.predict(scaled_spend.loc[is_outlier].to_numpy().reshape(-1, 1))

# The model outputs percentages; convert to a 0-1 probability.
consumer_transactions_day['fraud_prob'] = fraud_percent / 100
consumer_transactions_day

Unnamed: 0,user_id,order_datetime,total_dollar,normal_total_dollar,fraud_prob
0,1,2021-02-28,163.668243,0.302918,1.000000e-07
1,1,2021-03-02,301.677498,0.969702,1.000000e-07
2,1,2021-03-04,5.904951,-0.459306,1.000000e-07
3,1,2021-03-06,18.135592,-0.400214,1.000000e-07
4,1,2021-03-09,64.670772,-0.175382,1.000000e-07
...,...,...,...,...,...
8756833,24081,2022-10-19,575.305082,1.838081,1.000000e-07
8756834,24081,2022-10-20,705.340737,2.362365,9.568140e-02
8756835,24081,2022-10-21,5.876394,-0.457770,1.000000e-07
8756836,24081,2022-10-24,68.147276,-0.206703,1.000000e-07


In [18]:
# One draw from [0, 1) per row, taken from NumPy's global random generator.
row_count = len(consumer_transactions_day.index)
consumer_transactions_day['generated_prob'] = np.random.random_sample(size=row_count)

In [19]:
# Display the frame with the random draw column added.
consumer_transactions_day

Unnamed: 0,user_id,order_datetime,total_dollar,normal_total_dollar,fraud_prob,generated_prob,remove
0,1,2021-02-28,163.668243,0.302918,1.000000e-07,0.374540,
1,1,2021-03-02,301.677498,0.969702,1.000000e-07,0.950714,
2,1,2021-03-04,5.904951,-0.459306,1.000000e-07,0.731994,
3,1,2021-03-06,18.135592,-0.400214,1.000000e-07,0.598658,
4,1,2021-03-09,64.670772,-0.175382,1.000000e-07,0.156019,
...,...,...,...,...,...,...,...
8756833,24081,2022-10-19,575.305082,1.838081,1.000000e-07,0.579625,
8756834,24081,2022-10-20,705.340737,2.362365,9.568140e-02,0.837765,
8756835,24081,2022-10-21,5.876394,-0.457770,1.000000e-07,0.007949,
8756836,24081,2022-10-24,68.147276,-0.206703,1.000000e-07,0.053546,


In [20]:
# Mark a consumer-day for removal when its random draw is below its fraud probability.
random_draw = consumer_transactions_day['generated_prob']
consumer_transactions_day['remove'] = random_draw.lt(consumer_transactions_day['fraud_prob'])
consumer_transactions_day

Unnamed: 0,user_id,order_datetime,total_dollar,normal_total_dollar,fraud_prob,generated_prob,remove
0,1,2021-02-28,163.668243,0.302918,1.000000e-07,0.374540,False
1,1,2021-03-02,301.677498,0.969702,1.000000e-07,0.950714,False
2,1,2021-03-04,5.904951,-0.459306,1.000000e-07,0.731994,False
3,1,2021-03-06,18.135592,-0.400214,1.000000e-07,0.598658,False
4,1,2021-03-09,64.670772,-0.175382,1.000000e-07,0.156019,False
...,...,...,...,...,...,...,...
8756833,24081,2022-10-19,575.305082,1.838081,1.000000e-07,0.579625,False
8756834,24081,2022-10-20,705.340737,2.362365,9.568140e-02,0.837765,False
8756835,24081,2022-10-21,5.876394,-0.457770,1.000000e-07,0.007949,False
8756836,24081,2022-10-24,68.147276,-0.206703,1.000000e-07,0.053546,False


In [21]:
# Keep only the consumer-days that were not marked for removal.
keep_mask = consumer_transactions_day['remove'].eq(False)
consumer_transactions_removed_fraud = consumer_transactions_day.loc[keep_mask]
consumer_transactions_removed_fraud

Unnamed: 0,user_id,order_datetime,total_dollar,normal_total_dollar,fraud_prob,generated_prob,remove
0,1,2021-02-28,163.668243,0.302918,1.000000e-07,0.374540,False
1,1,2021-03-02,301.677498,0.969702,1.000000e-07,0.950714,False
2,1,2021-03-04,5.904951,-0.459306,1.000000e-07,0.731994,False
3,1,2021-03-06,18.135592,-0.400214,1.000000e-07,0.598658,False
4,1,2021-03-09,64.670772,-0.175382,1.000000e-07,0.156019,False
...,...,...,...,...,...,...,...
8756833,24081,2022-10-19,575.305082,1.838081,1.000000e-07,0.579625,False
8756834,24081,2022-10-20,705.340737,2.362365,9.568140e-02,0.837765,False
8756835,24081,2022-10-21,5.876394,-0.457770,1.000000e-07,0.007949,False
8756836,24081,2022-10-24,68.147276,-0.206703,1.000000e-07,0.053546,False


We removed only 8756838 - 8660571 = 96267 rows, which is 1.099% of the data. This additional removal should improve our model: by removing likely-fraudulent transactions as far as we can, we can generate a more accurate ranking system for the merchants.