In [1]:
from pyspark.sql import SparkSession, functions as F

spark = (
    SparkSession.builder.appName("Regression")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

22/09/26 16:22:16 WARN Utils: Your hostname, RPro.local resolves to a loopback address: 127.0.0.1; using 10.32.178.129 instead (on interface en0)
22/09/26 16:22:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/26 16:22:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample

In [72]:
# read in datasets
merchant_fraud = spark.read.parquet('../data/curated/merchant_fraud.parquet')
consumer_fraud = spark.read.parquet('../data/curated/consumer_fraud.parquet')

In [73]:
merchant_fraud.limit(10)

merchant_abn,order_datetime,transaction_count,avg_transaction_amt,is_fraud,fraud_probability
38700038932,2021-08-20,14,1524.9292206627556,False,0.01
89502033586,2021-08-20,40,65.44754598876243,False,0.01
70610974780,2021-08-20,27,43.48907978704201,False,0.01
29550468444,2021-08-20,4,157.0,False,0.01
92075595936,2021-08-20,12,95.50018824235848,False,0.01
62422038748,2021-08-20,1,2442.1698743679544,False,0.01
32894483068,2021-08-20,1,228.6561829721828,False,0.01
55958301764,2021-08-20,6,281.354343839723,False,0.01
12543580354,2021-08-20,12,262.53675367314503,False,0.01
26008308191,2021-08-20,6,241.0460609065592,False,0.01


In [74]:
merchant_fraud.count()

1344515

In [75]:
merchant_fraud.where(F.col('fraud_probability') != 0.01).count()

114

About 0.008% of the merchant daily transactions is fraudulent.

In [76]:
consumer_fraud.limit(10)

user_id,order_datetime,transaction_count,avg_transaction_amt,is_fraud,fraud_probability
18487,2021-08-20,2,73.0925761125627,False,0.01
18552,2021-08-20,2,174.42985218811984,False,0.01
215,2021-08-20,2,48.880342827706,False,0.01
18770,2021-08-20,1,273.2147324712209,False,0.01
18837,2021-08-20,1,23.80026317055564,False,0.01
18849,2021-08-20,3,56.89966757870607,False,0.01
435,2021-08-20,2,21.31016695448368,False,0.01
788,2021-08-20,2,38.272827709859286,False,0.01
19550,2021-08-20,2,17.85711450212551,False,0.01
1117,2021-08-20,3,57.4104700047829,False,0.01


In [77]:
consumer_fraud.count()

8977056

In [78]:
consumer_fraud.where(F.col('fraud_probability') != 0.01).count()

34864

About 0.4% of the consumer daily transactions is fraudulent.

Since the proportion of non-fraud to fraud data in both merchant and consumer fraud datasets is highly imbalanced, resampling is performed on both datasets before fitting a model.

For both training datasets, the proportion used is 550 samples of non-fraud data and 450 samples of fraud data. The proportion of non-fraud data is slightly higher since it represents the majority of the data.

In [79]:
def resampling(majority, minority):
    """
    Performs undersampling on the majority class (spark dataframe)
    and oversampling on the minority class (pandas dataframe) from an imbalanced dataset. 
    Outputs resampled dataset as a pandas dataframe.
    """
    
    # define number of samples needed from each class
    N_MAJOR = 550
    N_MINOR = 450
    FRAC_MAJOR = N_MAJOR / majority.count()
    
    # repeated sampling from the minority class
    n_more_samples = N_MINOR - minority.shape[0] # needs to be less than no. of samples in minority
    if n_more_samples > 0:
        sampled_minority = pd.concat([minority, resample(minority, replace=True, n_samples=n_more_samples, random_state=0)], axis=0)
    else:
        sampled_minority = minority
    
    # undersampling from the majority class
    sampled_majority = majority.sample(FRAC_MAJOR, seed=0).toPandas()
    
    # join the sampled datasets
    resampled = pd.concat([sampled_majority, sampled_minority], axis=0)
    
    return resampled

## Merchant fraud data

### Resampling data

In [80]:
# split merchant fraud data by class
merchant_majority = merchant_fraud.where(F.col('fraud_probability') == 0.01)
merchant_minority = merchant_fraud.where(F.col('fraud_probability') != 0.01).toPandas()

In [81]:
# obtain resampled dataset for modelling
merchant_resampled = resampling(merchant_majority, merchant_minority)

In [82]:
merchant_resampled.head()

Unnamed: 0,merchant_abn,order_datetime,transaction_count,avg_transaction_amt,is_fraud,fraud_probability
0,45663905997,2021-07-30,6,262.729897,False,0.01
1,15269431008,2021-06-25,16,94.062315,False,0.01
2,26603390734,2021-08-13,2,654.763018,False,0.01
3,66917621463,2021-05-16,4,331.084177,False,0.01
4,12034469787,2021-07-25,4,61.694078,False,0.01


### Feature engineering 

In [83]:
# parse order datetime column to pandas' datetime
merchant_resampled['order_datetime'] = pd.to_datetime(merchant_resampled['order_datetime'])

In [84]:
# create order day of week column - Monday = 0, Sunday = 6
merchant_resampled['order_dayofweek'] = merchant_resampled['order_datetime'].dt.day_of_week

In [85]:
# one-hot encoding on order day of week column
merchant_resampled = pd.get_dummies(data=merchant_resampled, prefix='dow', columns=['order_dayofweek'])

merchant_resampled.head()

Unnamed: 0,merchant_abn,order_datetime,transaction_count,avg_transaction_amt,is_fraud,fraud_probability,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6
0,45663905997,2021-07-30,6,262.729897,False,0.01,0,0,0,0,1,0,0
1,15269431008,2021-06-25,16,94.062315,False,0.01,0,0,0,0,1,0,0
2,26603390734,2021-08-13,2,654.763018,False,0.01,0,0,0,0,1,0,0
3,66917621463,2021-05-16,4,331.084177,False,0.01,0,0,0,0,0,0,1
4,12034469787,2021-07-25,4,61.694078,False,0.01,0,0,0,0,0,0,1


In [86]:
# split dataset into predictors and target variables
X = pd.concat([merchant_resampled.iloc[:,2:4], merchant_resampled.iloc[:,6:]], axis=1)
y = merchant_resampled.iloc[:,4]

In [87]:
X.head()

Unnamed: 0,transaction_count,avg_transaction_amt,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6
0,6,262.729897,0,0,0,0,1,0,0
1,16,94.062315,0,0,0,0,1,0,0
2,2,654.763018,0,0,0,0,1,0,0
3,4,331.084177,0,0,0,0,0,0,1
4,4,61.694078,0,0,0,0,0,0,1


In [88]:
# convert target variable from boolean to integers
y = y.replace({True: 1, False: 0})

### Model fitting

In [90]:
# cross validation
lgr_merchant = LogisticRegression(max_iter=250)
acc_scores = cross_val_score(lgr_merchant, X, y, scoring='accuracy', cv=3)
f1_scores = cross_val_score(lgr_merchant, X, y, scoring='f1', cv=3)
print(acc_scores)
print('Average cross-validation accuracy: ', np.mean(acc_scores))
print(f1_scores)
print('Average cross-validation F1 score: ', np.mean(f1_scores))


lgr_merchant_fit = lgr_merchant.fit(X, y)

[0.918429   0.9        0.91212121]
Average cross-validation accuracy:  0.9101834050474533
[0.90459364 0.88501742 0.89530686]
Average cross-validation F1 score:  0.8949726401281785


## Consumer fraud data

Apply the same pipeline to consumer fraud dataset.

### Resampling data

In [91]:
# split consumer fraud data by class
consumer_majority = consumer_fraud.where(F.col('fraud_probability') == 0.01)
consumer_minority = consumer_fraud.where(F.col('fraud_probability') != 0.01).toPandas()

                                                                                

In [92]:
# obtain resampled dataset for modelling
consumer_resampled = resampling(consumer_majority, consumer_minority)

In [93]:
consumer_resampled.head()

Unnamed: 0,user_id,order_datetime,transaction_count,avg_transaction_amt,is_fraud,fraud_probability
0,15788,2021-08-15,1,201.365642,False,0.01
1,16608,2021-05-20,2,263.009535,False,0.01
2,21853,2021-08-27,1,16.136864,False,0.01
3,11998,2021-08-18,1,399.775259,False,0.01
4,12029,2021-06-18,2,28.930171,False,0.01


### Feature engineering

In [94]:
# parse order datetime column to pandas' datetime
consumer_resampled['order_datetime'] = pd.to_datetime(consumer_resampled['order_datetime'])

In [95]:
# create order day of week column - Monday = 0, Sunday = 6
consumer_resampled['order_dayofweek'] = consumer_resampled['order_datetime'].dt.day_of_week

In [96]:
# one-hot encoding on order day of week column
consumer_resampled = pd.get_dummies(data=consumer_resampled, prefix='dow', columns=['order_dayofweek'])

consumer_resampled.head()

Unnamed: 0,user_id,order_datetime,transaction_count,avg_transaction_amt,is_fraud,fraud_probability,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6
0,15788,2021-08-15,1,201.365642,False,0.01,0,0,0,0,0,0,1
1,16608,2021-05-20,2,263.009535,False,0.01,0,0,0,1,0,0,0
2,21853,2021-08-27,1,16.136864,False,0.01,0,0,0,0,1,0,0
3,11998,2021-08-18,1,399.775259,False,0.01,0,0,1,0,0,0,0
4,12029,2021-06-18,2,28.930171,False,0.01,0,0,0,0,1,0,0


In [97]:
# split dataset into predictors and target variables
X = pd.concat([consumer_resampled.iloc[:,2:4], consumer_resampled.iloc[:,6:]], axis=1)
y = consumer_resampled.iloc[:,4]

In [98]:
X.head()

Unnamed: 0,transaction_count,avg_transaction_amt,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6
0,1,201.365642,0,0,0,0,0,0,1
1,2,263.009535,0,0,0,1,0,0,0
2,1,16.136864,0,0,0,0,1,0,0
3,1,399.775259,0,0,1,0,0,0,0
4,2,28.930171,0,0,0,0,1,0,0


In [99]:
# convert target variable from boolean to integers
y = y.replace({True: 1, False: 0})

In [100]:
lgr_consumer = LogisticRegression(max_iter=250)
acc_scores = cross_val_score(lgr_consumer, X, y, scoring='accuracy', cv=3)
f1_scores = cross_val_score(lgr_consumer, X, y, scoring='f1', cv=3)
print(acc_scores)
print('Average cross-validation accuracy: ', np.mean(acc_scores))
print(f1_scores)
print('Average cross-validation F1 score: ', np.mean(f1_scores))

lgr_consumer_fit = lgr_consumer.fit(X, y)

[0.99923722 0.99813544 0.99932192]
Average cross-validation accuracy:  0.9988981925564021
[0.99961292 0.99905441 0.99965591]
Average cross-validation F1 score:  0.9994410826731679


## Generating predictions for full datasets

In [101]:
df_merchant = pd.read_parquet('../data/curated/merchant_fraud.parquet')
df_consumer = pd.read_parquet('../data/curated/consumer_fraud.parquet')

In [None]:
# df_merchant = merchant_fraud.toPandas()
# df_consumer = consumer_fraud.toPandas()

Predictions for merchant.

In [102]:
# order day of week feature engineering
df_merchant['order_datetime'] = pd.to_datetime(df_merchant['order_datetime'])
df_merchant['order_dayofweek'] = df_merchant['order_datetime'].dt.day_of_week
df_merchant = pd.get_dummies(data=df_merchant, prefix='dow', columns=['order_dayofweek'])

df_merchant.head()

Unnamed: 0,merchant_abn,order_datetime,transaction_count,avg_transaction_amt,is_fraud,fraud_probability,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6
0,90173050473,2021-08-20,41,245.51039,False,0.01,0,0,0,0,1,0,0
1,91455531890,2021-08-20,3,495.614249,False,0.01,0,0,0,0,1,0,0
2,14480530534,2021-08-20,2,1636.591724,False,0.01,0,0,0,0,1,0,0
3,54062916822,2021-08-20,1,33.805382,False,0.01,0,0,0,0,1,0,0
4,73225085327,2021-08-20,1,342.058728,False,0.01,0,0,0,0,1,0,0


In [103]:
# get predictor and target variables
X_merchant = pd.concat([df_merchant.iloc[:,2:4], df_merchant.iloc[:,6:]], axis=1)
y_merchant = df_merchant.iloc[:,4]
y_merchant = y_merchant.replace({True: 1, False: 0})

In [40]:
from sklearn.model_selection import cross_val_predict

In [104]:
# get predictions
y_merchant_pred = lgr_merchant_fit.predict(X_merchant)

In [105]:
y_merchant_pred

array([0, 0, 0, ..., 0, 0, 0])

Predictions for consumer.

In [106]:
# order day of week feature engineering
df_consumer['order_datetime'] = pd.to_datetime(df_consumer['order_datetime'])
df_consumer['order_dayofweek'] = df_consumer['order_datetime'].dt.day_of_week
df_consumer = pd.get_dummies(data=df_consumer, prefix='dow', columns=['order_dayofweek'])

df_consumer.head()

Unnamed: 0,user_id,order_datetime,transaction_count,avg_transaction_amt,is_fraud,fraud_probability,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6
0,18488,2021-08-20,3,156.326574,False,0.01,0,0,0,0,1,0,0
1,686,2021-08-20,2,115.939422,False,0.01,0,0,0,0,1,0,0
2,19292,2021-08-20,1,961.793144,False,0.01,0,0,0,0,1,0,0
3,778,2021-08-20,2,32.2216,False,0.01,0,0,0,0,1,0,0
4,786,2021-08-20,1,80.293376,False,0.01,0,0,0,0,1,0,0


In [107]:
# get predictor and target variables
X_consumer = pd.concat([df_consumer.iloc[:,2:4], df_consumer.iloc[:,6:]], axis=1)
y_consumer = df_consumer.iloc[:,4]
y_consumer = y_consumer.replace({True: 1, False: 0})

In [108]:
# get predictions
y_consumer_pred = lgr_consumer_fit.predict(X_consumer)

In [109]:
y_consumer_pred

array([0, 0, 1, ..., 0, 0, 0])

<h3>Add Predictions to Table<h3>

Get predicted fraud rate for each consumer

In [110]:
df_consumer['predicted_fraud'] = y_consumer_pred

In [130]:
consumer_fraud_count = df_consumer.groupby(['user_id']).agg({'predicted_fraud': 'sum'})
consumer_regular_count = df_consumer.groupby(['user_id']).size()

In [145]:
consumer_fraud_rate = consumer_fraud_count.merge(consumer_regular_count.to_frame(), left_index=True, right_index=True)
consumer_fraud_rate['rate'] = consumer_fraud_rate['predicted_fraud']/consumer_fraud_rate[0]
consumer_fraud_rate = consumer_fraud_rate.drop(columns = ['predicted_fraud', 0])

Repeat for merchants

In [147]:
df_merchant['predicted_fraud'] = y_merchant_pred

In [148]:
merchant_fraud_count = df_merchant.groupby(['merchant_abn']).agg({'predicted_fraud': 'sum'})
merchant_regular_count = df_merchant.groupby(['merchant_abn']).size()

In [149]:
merchant_fraud_rate = merchant_fraud_count.merge(merchant_regular_count.to_frame(), left_index=True, right_index=True)
merchant_fraud_rate['rate'] = merchant_fraud_rate['predicted_fraud']/merchant_fraud_rate[0]
merchant_fraud_rate = merchant_fraud_rate.drop(columns = ['predicted_fraud', 0])