In [145]:
# from pyspark.sql import SparkSession
# import pyspark.pandas as ps
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

# spark = (
#     SparkSession.builder.appName("BNPL")
#     .config("spark.sql.repl.eagerEval.enabled", True) 
#     .config("spark.sql.parquet.cacheMetadata", "true")
#     .config("spark.driver.memory", "8g")
#     .getOrCreate()
# )

# Reproducibility: a single seed initialises both Python's and NumPy's
# global random generators.
seed = 42
random.seed(seed)
np.random.seed(seed)

# Input locations, expressed relative to the notebook's working directory.
data_directory = "../data/"
tables_directory = f"{data_directory}tables/"
merchant_fraud_directory = f"{tables_directory}merchant_fraud_probability.csv"
consumer_fraud_directory = f"{tables_directory}consumer_fraud_probability.csv"
transactions_directory = "../data/curated/transactions.parquet"

In [96]:
# Read the two labelled fraud-probability tables and the curated transactions.
# `order_datetime` in the CSV tables is used exactly as read (no datetime parsing).
consumer_fraud_prob = pd.read_csv(filepath_or_buffer=consumer_fraud_directory)
merchant_fraud_prob = pd.read_csv(filepath_or_buffer=merchant_fraud_directory)
transactions = pd.read_parquet(path=transactions_directory)

In [97]:
# Keep only the transactions that have a labelled fraud probability:
# merchants are matched on (merchant_abn, order_datetime), consumers on
# (user_id, order_datetime).
merchant_fraud_total = pd.merge(
    transactions,
    merchant_fraud_prob,
    on=['merchant_abn', 'order_datetime'],
    how='inner',
)
consumer_fraud_total = pd.merge(
    transactions,
    consumer_fraud_prob,
    on=['user_id', 'order_datetime'],
    how='inner',
)

Consumer fraud. First we need to build the feature the model will use: each consumer's daily spend, expressed as a normalized difference from that consumer's median.

In [98]:
# Collapse the transaction log to one row per consumer per day, summing the
# dollar value of everything that consumer bought on that day.
consumer_transactions_day = (
    transactions
    .groupby(['user_id', 'order_datetime'])
    .agg(total_dollar=('dollar_value', 'sum'))
    .reset_index()
)
consumer_transactions_day

Unnamed: 0,user_id,order_datetime,total_dollar
0,1,2021-02-28,163.668243
1,1,2021-03-02,301.677498
2,1,2021-03-04,5.904951
3,1,2021-03-06,18.135592
4,1,2021-03-09,64.670772
...,...,...,...
8582343,24081,2022-10-19,575.305082
8582344,24081,2022-10-20,705.340737
8582345,24081,2022-10-21,5.876394
8582346,24081,2022-10-24,68.147276


Now that we have grouped the transactions by day for each customer, we will scale every customer's daily totals by their median and IQR, so that each value records its (normalized) distance from the median. We will then pick out the days that appear in the fraud data and use the scaled distance to identify outliers.

In [99]:
# Median and quantile (IQR) scaling of each consumer's daily totals:
#     (daily total - consumer median) / (consumer Q3 - consumer Q1)
# `rs` is not used by this cell; the scaling is done per consumer by hand below.
rs = RobustScaler()


def _median_iqr_scale(daily_totals):
    """Centre one consumer's daily totals on their median and divide by their IQR."""
    iqr = daily_totals.quantile(0.75) - daily_totals.quantile(0.25)
    # NOTE: a consumer whose IQR is 0 yields inf/NaN here, as in the original formula.
    return (daily_totals - daily_totals.median()) / iqr


# `transform` (rather than `apply`) always returns a result indexed like the
# input frame, so the column assignment aligns row-for-row on every pandas
# version; `apply` prepends the group key to the index on pandas >= 2.0.
consumer_transactions_day['normal_total_dollar'] = (
    consumer_transactions_day
    .groupby('user_id')['total_dollar']
    .transform(_median_iqr_scale)
)


In [100]:
# Inspect the daily totals together with the new `normal_total_dollar` column.
consumer_transactions_day

Unnamed: 0,user_id,order_datetime,total_dollar,normal_total_dollar
0,1,2021-02-28,163.668243,0.349362
1,1,2021-03-02,301.677498,1.048156
2,1,2021-03-04,5.904951,-0.449453
3,1,2021-03-06,18.135592,-0.387525
4,1,2021-03-09,64.670772,-0.151899
...,...,...,...,...
8582343,24081,2022-10-19,575.305082,2.176581
8582344,24081,2022-10-20,705.340737,2.792478
8582345,24081,2022-10-21,5.876394,-0.520443
8582346,24081,2022-10-24,68.147276,-0.225505


After median and quantile scaling, we treat any day whose scaled total dollar value is greater than 2 (i.e. more than two IQRs above that consumer's median) as an outlier. Now we will build a linear model that relates the given fraud probabilities to the scaled distance of the transactions.

In [101]:
# Inspect the labelled consumer fraud probabilities (one row per user_id / order_datetime).
consumer_fraud_prob

Unnamed: 0,user_id,order_datetime,fraud_probability
0,6228,2021-12-19,97.629808
1,21419,2021-12-10,99.247380
2,5606,2021-10-17,84.058250
3,3101,2021-04-17,91.421921
4,22239,2021-10-19,94.703425
...,...,...,...
34859,18466,2022-02-22,8.679873
34860,17552,2021-12-26,8.349463
34861,17875,2021-06-27,8.288847
34862,10401,2021-09-18,8.842216


In [102]:
# Attach the labelled fraud probability to the matching consumer-day rows;
# the inner join keeps only days that appear in both tables.
fraud_consumer_norm_prob = pd.merge(
    consumer_transactions_day,
    consumer_fraud_prob,
    on=['user_id', 'order_datetime'],
    how='inner',
)
fraud_consumer_norm_prob

Unnamed: 0,user_id,order_datetime,total_dollar,normal_total_dollar,fraud_probability
0,1,2022-02-20,2479.076338,12.073154,9.805431
1,2,2021-08-30,2132.002253,10.716341,9.599514
2,2,2021-09-25,2236.461666,11.265957,10.069851
3,4,2021-10-09,2260.675824,10.692605,9.633302
4,5,2021-10-04,3478.498116,19.014681,10.868365
...,...,...,...,...,...
31032,24079,2021-10-12,4708.648444,22.228621,14.948165
31033,24079,2021-11-08,2816.251072,13.108516,8.940524
31034,24079,2021-11-26,2784.152043,12.953820,8.838622
31035,24081,2021-10-08,4320.715449,19.916225,14.343772


In [103]:
# Single-feature regression: scaled daily spend -> labelled fraud probability.
y = fraud_consumer_norm_prob.loc[:, 'fraud_probability']
X = fraud_consumer_norm_prob.loc[:, 'normal_total_dollar']
lm = linear_model.LinearRegression()

In [104]:
# Shape the single feature as an (n_samples, 1) matrix, as scikit-learn expects.
# `np.asarray` accepts both the original Series and an already-reshaped array,
# so this cell can be re-run safely (the previous `X.values` failed on a re-run).
X = np.asarray(X).reshape(-1, 1)

# Hold out 10% of the labelled consumer-days for evaluation; the split is
# seeded so it is reproducible. `train_test_split` is imported in the import
# cell at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=seed)

In [105]:
# Fit the regression on the training split.
lm.fit(X=X_train, y=y_train)

In [106]:
# Fitted slope: change in predicted fraud probability per unit of scaled daily spend.
lm.coef_

array([0.28019294])

In [107]:
# Predict fraud probabilities for the held-out split.
y_pred = lm.predict(X=X_test)

In [108]:
# Share of the variance in the held-out fraud probabilities explained by the model.
metrics.r2_score(y_true=y_test, y_pred=y_pred)

0.3841652676484786

In [109]:
# Mean squared error on the held-out split.
metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)

56.94583454553427

In [110]:
# Mean absolute error on the held-out split, in the same units as `fraud_probability`.
metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

4.6626583188910065

We now have a fraud model that predicts the fraud probability with a mean absolute error of about 4.66 percentage points on the held-out data (R² ≈ 0.38). Next we will find all the outliers in the transactions, give each one a fraud probability, and remove it with a chance equal to that probability.

In [111]:
# Give every consumer-day a fraud probability on the model's output scale:
#   * outlier days (scaled spend > 2) get the regression's prediction;
#   * every other day gets a tiny baseline value (0.00001).
# The model is called once on all outlier rows instead of once per row.
scaled_spend = consumer_transactions_day['normal_total_dollar']
is_outlier = scaled_spend > 2  # NaN compares False, so such rows keep the baseline

predicted_fraud_prob = pd.Series(0.00001, index=consumer_transactions_day.index, dtype=float)
if is_outlier.any():  # scikit-learn rejects an empty feature matrix
    predicted_fraud_prob.loc[is_outlier] = lm.predict(
        scaled_spend.loc[is_outlier].to_numpy().reshape(-1, 1)
    )
consumer_transactions_day['fraud_prob'] = predicted_fraud_prob

In [112]:
# Rescale `fraud_prob` by 1/100 so it can be compared with a uniform [0, 1) draw.
# The column is overwritten in place, so run this cell exactly once after the
# scoring cell: re-running it divides by 100 again.
consumer_transactions_day['fraud_prob'] = consumer_transactions_day['fraud_prob'].div(100)

In [113]:
# Inspect the frame with the rescaled `fraud_prob` column.
consumer_transactions_day

Unnamed: 0,user_id,order_datetime,total_dollar,normal_total_dollar,fraud_prob
0,1,2021-02-28,163.668243,0.349362,1.000000e-07
1,1,2021-03-02,301.677498,1.048156,1.000000e-07
2,1,2021-03-04,5.904951,-0.449453,1.000000e-07
3,1,2021-03-06,18.135592,-0.387525,1.000000e-07
4,1,2021-03-09,64.670772,-0.151899,1.000000e-07
...,...,...,...,...,...
8582343,24081,2022-10-19,575.305082,2.176581,1.086968e-01
8582344,24081,2022-10-20,705.340737,2.792478,1.104225e-01
8582345,24081,2022-10-21,5.876394,-0.520443,1.000000e-07
8582346,24081,2022-10-24,68.147276,-0.225505,1.000000e-07


In [146]:
# Draw one uniform [0, 1) number per consumer-day from a generator seeded in
# this cell. This yields the same values as calling `np.random.random`
# immediately after `np.random.seed(seed)`, but no longer depends on when the
# seeding cell was last executed, so re-running this cell is reproducible.
consumer_transactions_day['generated_prob'] = (
    np.random.RandomState(seed).random_sample(size=len(consumer_transactions_day))
)

In [147]:
# Inspect the frame with the random draw added.
# NOTE(review): the saved output already shows a `remove` column, which is only
# created by a later cell — the cells were run out of order; re-run from a fresh
# kernel to refresh the stored output.
consumer_transactions_day

Unnamed: 0,user_id,order_datetime,total_dollar,normal_total_dollar,fraud_prob,generated_prob,remove
0,1,2021-02-28,163.668243,0.349362,1.000000e-07,0.374540,False
1,1,2021-03-02,301.677498,1.048156,1.000000e-07,0.950714,False
2,1,2021-03-04,5.904951,-0.449453,1.000000e-07,0.731994,False
3,1,2021-03-06,18.135592,-0.387525,1.000000e-07,0.598658,False
4,1,2021-03-09,64.670772,-0.151899,1.000000e-07,0.156019,False
...,...,...,...,...,...,...,...
8582343,24081,2022-10-19,575.305082,2.176581,1.086968e-01,0.935735,False
8582344,24081,2022-10-20,705.340737,2.792478,1.104225e-01,0.186605,False
8582345,24081,2022-10-21,5.876394,-0.520443,1.000000e-07,0.286440,False
8582346,24081,2022-10-24,68.147276,-0.225505,1.000000e-07,0.922499,False


In [148]:
# Flag a consumer-day for removal when its random draw falls below its fraud
# probability, i.e. each row is removed with probability `fraud_prob`.
consumer_transactions_day['remove'] = (
    consumer_transactions_day['generated_prob']
    .lt(consumer_transactions_day['fraud_prob'])
)

In [149]:
# Inspect the frame with the `remove` flag.
consumer_transactions_day

Unnamed: 0,user_id,order_datetime,total_dollar,normal_total_dollar,fraud_prob,generated_prob,remove
0,1,2021-02-28,163.668243,0.349362,1.000000e-07,0.374540,False
1,1,2021-03-02,301.677498,1.048156,1.000000e-07,0.950714,False
2,1,2021-03-04,5.904951,-0.449453,1.000000e-07,0.731994,False
3,1,2021-03-06,18.135592,-0.387525,1.000000e-07,0.598658,False
4,1,2021-03-09,64.670772,-0.151899,1.000000e-07,0.156019,False
...,...,...,...,...,...,...,...
8582343,24081,2022-10-19,575.305082,2.176581,1.086968e-01,0.935735,False
8582344,24081,2022-10-20,705.340737,2.792478,1.104225e-01,0.186605,False
8582345,24081,2022-10-21,5.876394,-0.520443,1.000000e-07,0.286440,False
8582346,24081,2022-10-24,68.147276,-0.225505,1.000000e-07,0.922499,False


In [150]:
# Keep only the consumer-days that were not flagged for removal.
flagged_for_removal = consumer_transactions_day['remove']
consumer_transactions_removed_fraud = consumer_transactions_day[~flagged_for_removal]

In [151]:
# Inspect the consumer-days that remain after the probabilistic fraud removal.
consumer_transactions_removed_fraud

Unnamed: 0,user_id,order_datetime,total_dollar,normal_total_dollar,fraud_prob,generated_prob,remove
0,1,2021-02-28,163.668243,0.349362,1.000000e-07,0.374540,False
1,1,2021-03-02,301.677498,1.048156,1.000000e-07,0.950714,False
2,1,2021-03-04,5.904951,-0.449453,1.000000e-07,0.731994,False
3,1,2021-03-06,18.135592,-0.387525,1.000000e-07,0.598658,False
4,1,2021-03-09,64.670772,-0.151899,1.000000e-07,0.156019,False
...,...,...,...,...,...,...,...
8582343,24081,2022-10-19,575.305082,2.176581,1.086968e-01,0.935735,False
8582344,24081,2022-10-20,705.340737,2.792478,1.104225e-01,0.186605,False
8582345,24081,2022-10-21,5.876394,-0.520443,1.000000e-07,0.286440,False
8582346,24081,2022-10-24,68.147276,-0.225505,1.000000e-07,0.922499,False


We removed only 8580245 - 8475177 = 105,068 rows, which is 1.22% of the data. This additional removal should improve our model: by removing likely-fraudulent transactions as far as we can, we can generate a more accurate ranking system for the merchants.