# Fraud Prediction Modelling

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd

from pyspark.mllib.classification import LogisticRegressionModel
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import Bucketizer
from pyspark.ml import Pipeline 
from pyspark.ml.feature import VectorAssembler 

In [2]:
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    # Render a DataFrame when it is the last expression of a cell
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    # Key fixed: it was spelled "spark.executer.memory", which Spark does not
    # recognise, so the 4g executor setting was never applied.
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

22/09/17 18:13:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/09/17 18:13:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/09/17 18:13:36 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## preprocessing

### load data

In [4]:
# Read the two transaction snapshots and stack them into a single DataFrame.
# union() matches columns by position, so both snapshots must share a layout.
first_snapshot = spark.read.parquet('../../data/tables/transactions_20210228_20210827_snapshot')
second_snapshot = spark.read.parquet('../../data/tables/transactions_20210828_20220227_snapshot')
transactions = first_snapshot.union(second_snapshot)

                                                                                

In [6]:
# Fraud-probability lookup tables (CSV with a header row; no schema inference,
# so every column is read as a string).
probs_merchant = spark.read.csv('../../data/tables/merchant_fraud_probability.csv', header=True)
probs_consumer = spark.read.csv('../../data/tables/consumer_fraud_probability.csv', header=True)

In [26]:
# Curated consumer (parquet) and merchant (CSV with header) reference tables.
consumers = spark.read.format("parquet").load("../../data/curated/consumer/")
merchants = spark.read.csv("../../data/curated/merchant.csv", header=True)

In [13]:
# Show the column names and types of the combined transactions DataFrame.
transactions.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: string (nullable = true)



In [14]:
# Show the column names and types of the consumer fraud-probability table.
probs_consumer.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- fraud_probability: float (nullable = true)



In [15]:
# Show the column names and types of the merchant fraud-probability table.
probs_merchant.printSchema()

root
 |-- merchant_abn: string (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- fraud_probability: string (nullable = true)



In [17]:
# Prepare for merging: cast the transaction join keys to strings and the
# consumer fraud probability to a float.
for key_col in ('user_id', 'order_datetime', 'merchant_abn'):
    transactions = transactions.withColumn(key_col, F.col(key_col).cast('string'))

probs_consumer = probs_consumer.withColumn(
    'fraud_probability',
    F.col('fraud_probability').cast('float'),
)

In [37]:
# Cast the merchant fraud-probability columns to typed values before merging.
# NOTE(review): the previous cell casts the transactions' merchant_abn and
# order_datetime to string, while here they become long/date, so the join
# relies on Spark's implicit type coercion -- confirm this is intended.
merchant_casts = {
    'merchant_abn': 'long',
    'order_datetime': 'date',
    'fraud_probability': 'float',
}
for col_name, target_type in merchant_casts.items():
    probs_merchant = probs_merchant.withColumn(col_name, F.col(col_name).cast(target_type))

In [49]:
# Left-join the merchant and consumer fraud probabilities onto the
# transactions, so every transaction is kept even without a matching
# probability. Merchant rows match on (merchant_abn, order_datetime),
# consumer rows on (user_id, order_datetime).
merchant_keys = ['merchant_abn', 'order_datetime']
consumer_keys = ['user_id', 'order_datetime']

with_merchant_prob = (
    transactions
    .join(probs_merchant, on=merchant_keys, how='left')
    .withColumnRenamed('fraud_probability', 'merchant_prob')
)
result = (
    with_merchant_prob
    .join(probs_consumer, on=consumer_keys, how='left')
    .withColumnRenamed('fraud_probability', 'consumer_prob')
)

In [50]:
# Transactions with no matching probability get 0.01 as the default fraud
# probability for both the merchant and the consumer side.
prob_columns = ['merchant_prob', 'consumer_prob']
result = result.fillna(0.01, subset=prob_columns)

In [51]:
# A 50% fraud probability is the benchmark for flagging a transaction:
# if either the merchant or the consumer probability is strictly above 50,
# is_fraud is 1, otherwise 0.
# This cut-off may need adjusting during model training because so few
# transactions exceed it.
# NOTE(review): the comparison against 50 assumes probabilities are on a
# 0-100 scale -- confirm against the source tables.
merchant_is_risky = F.col('merchant_prob') > 50
consumer_is_risky = F.col('consumer_prob') > 50
fraud_flag = F.when(merchant_is_risky | consumer_is_risky, 1).otherwise(0)
result = result.withColumn('is_fraud', fraud_flag)
result

user_id,order_datetime,merchant_abn,dollar_value,order_id,merchant_prob,consumer_prob,is_fraud
18478,2021-08-20,62191208634,63.255848959735246,949a63c8-29f7-4ab...,0.01,0.01,0
2,2021-08-20,15549624934,130.3505283105634,6a84c3cf-612a-457...,0.01,0.01,0
18479,2021-08-20,64403598239,120.15860593212784,b10dcc33-e53f-425...,0.01,0.01,0
3,2021-08-20,60956456424,136.6785200286976,0f09c5a5-784e-447...,0.01,0.01,0
18479,2021-08-20,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,0.01,0.01,0
3,2021-08-20,76819856970,448.529684285612,5ace6a24-cdf0-4aa...,0.01,0.01,0
18479,2021-08-20,67609108741,86.4040605836911,d0e180f0-cb06-42a...,0.01,0.01,0
3,2021-08-20,34096466752,301.5793450525113,6fb1ff48-24bb-4f9...,0.01,0.01,0
18482,2021-08-20,70501974849,68.75486276223054,8505fb33-b69a-412...,0.01,0.01,0
4,2021-08-20,49891706470,48.89796461900801,ed11e477-b09f-4ae...,0.01,0.01,0


In [41]:
# Report how many of the transactions were flagged as fraud.
total_transactions = result.count()
fraud_transactions = result.filter(F.col('is_fraud') == 1).count()
print('In {} transactions, {} are detected as fraud'.format(total_transactions, fraud_transactions))



In 8151584 transactions, 1572 are detected as fraud


                                                                                

In [42]:
# Drop the order_id column and mark the DataFrame for caching.
# NOTE: nothing is written to disk here -- cache() only keeps the data in
# Spark's storage once it is computed; no parquet file is saved by this cell.
# NOTE(review): later outputs in this notebook still show order_id, which
# suggests the cells were executed out of order -- confirm with a fresh run.
result = result.drop('order_id')
result.cache()

                                                                                

user_id,order_datetime,merchant_abn,dollar_value,merchant_prob,consumer_prob,is_fraud
18478,2021-08-20,62191208634,63.255848959735246,0.01,0.01,0
2,2021-08-20,15549624934,130.3505283105634,0.01,0.01,0
18479,2021-08-20,64403598239,120.15860593212784,0.01,0.01,0
3,2021-08-20,60956456424,136.6785200286976,0.01,0.01,0
18479,2021-08-20,94493496784,72.96316578355305,0.01,0.01,0
3,2021-08-20,76819856970,448.529684285612,0.01,0.01,0
18479,2021-08-20,67609108741,86.4040605836911,0.01,0.01,0
3,2021-08-20,34096466752,301.5793450525113,0.01,0.01,0
18482,2021-08-20,70501974849,68.75486276223054,0.01,0.01,0
4,2021-08-20,49891706470,48.89796461900801,0.01,0.01,0


In [43]:
# Show the column names and types of the curated consumers table.
consumers.printSchema()

root
 |-- consumer_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- gender: string (nullable = true)



In [44]:
# Show the column names and types of the curated merchants table.
merchants.printSchema()

root
 |-- merchant_abn: string (nullable = true)
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: string (nullable = true)



In [52]:
# Enrich every transaction with merchant attributes (matched on merchant_abn)
# and consumer attributes (matched on user_id). Left joins keep transactions
# that have no matching merchant or consumer record.
result = (
    result
    .join(merchants, on='merchant_abn', how='left')
    .join(consumers, on='user_id', how='left')
)
# Cache only the final enriched frame. The earlier version also cached the
# intermediate merchant-only join, which was immediately superseded and only
# consumed extra storage memory.
result.cache()

                                                                                

user_id,merchant_abn,order_datetime,dollar_value,order_id,merchant_prob,consumer_prob,is_fraud,name,tags,revenue_level,take_rate,consumer_id,state,postcode,gender
18478,62191208634,2021-08-20,63.255848959735246,949a63c8-29f7-4ab...,0.01,0.01,0,Cursus Non Egesta...,furniture,c,2.17,651338,TAS,7001,Male
2,15549624934,2021-08-20,130.3505283105634,6a84c3cf-612a-457...,0.01,0.01,0,Commodo Associates,opticians,c,2.76,179208,NSW,2782,Female
18479,64403598239,2021-08-20,120.15860593212784,b10dcc33-e53f-425...,0.01,0.01,0,Lobortis Ultrices...,music,a,6.31,467663,TAS,7010,Female
3,60956456424,2021-08-20,136.6785200286976,0f09c5a5-784e-447...,0.01,0.01,0,Ultricies Digniss...,gift,b,4.69,1194530,NT,862,Female
18479,94493496784,2021-08-20,72.96316578355305,f6c78c1a-4600-4c5...,0.01,0.01,0,Dictum Phasellus ...,gift,a,5.65,467663,TAS,7010,Female
3,76819856970,2021-08-20,448.529684285612,5ace6a24-cdf0-4aa...,0.01,0.01,0,Egestas Blandit Ltd,tent,b,3.19,1194530,NT,862,Female
18479,67609108741,2021-08-20,86.4040605836911,d0e180f0-cb06-42a...,0.01,0.01,0,Metus Sit Amet In...,cable,e,0.38,467663,TAS,7010,Female
3,34096466752,2021-08-20,301.5793450525113,6fb1ff48-24bb-4f9...,0.01,0.01,0,Nullam Enim Ltd,computer,b,3.22,1194530,NT,862,Female
18482,70501974849,2021-08-20,68.75486276223054,8505fb33-b69a-412...,0.01,0.01,0,Facilisis Lorem T...,computer,b,3.3,918448,NSW,1430,Male
4,49891706470,2021-08-20,48.89796461900801,ed11e477-b09f-4ae...,0.01,0.01,0,Non Vestibulum In...,tent,a,5.8,154128,NSW,2780,Female


In [53]:
# Keep transactions worth at least $1 and drop rows with no merchant name
# (i.e. the left join found no matching merchant record).
at_least_one_dollar = F.col('dollar_value') >= 1
result = result.where(at_least_one_dollar).dropna(subset=['name'])

### bin numeric features

In [59]:
# Largest dollar_value. An aggregate returns a single row to the driver;
# the previous orderBy(...).collect()[0][0] sorted the column and pulled
# every row into driver memory just to read the first one.
value_max = result.agg(F.max('dollar_value')).first()[0]

                                                                                

In [60]:
# Smallest dollar_value, computed as an aggregate so only one row reaches the
# driver (instead of sorting and collecting the entire column).
# Unlike an ascending sort, min() skips nulls rather than returning them first.
value_min = result.agg(F.min('dollar_value')).first()[0]

                                                                                

In [61]:
# Display the observed dollar_value range to guide the choice of bucket edges.
dollar_range_message = 'dollar_value range:[{}, {}]'.format(value_min, value_max)
print(dollar_range_message)

dollar_value range:[1.0000012988409446, 105193.88578925544]


In [62]:
# Finite bucket edges for dollar_value: 0, 200, 400, ..., 70000
# (fixed-width $200 buckets). Built with range() instead of a manual
# while-loop, which also avoids leaking the loop counter into the namespace.
bins = list(range(0, 70000 + 1, 200))

In [63]:
# Add an open-ended top bucket for values above the last finite edge.
# Guarded so that re-running this cell does not append a second infinity,
# which would make the split list invalid for Bucketizer (splits must be
# strictly increasing).
if not bins or bins[-1] != float('Inf'):
    bins.append(float('Inf'))

In [64]:
# Bucket dollar_value into dollar_value_buckets using the edges in `bins`;
# invalid values are kept (handleInvalid="keep") rather than raising an error.
bucketizer = Bucketizer(
    splits=bins,
    inputCol="dollar_value",
    outputCol="dollar_value_buckets",
    handleInvalid="keep",
)
result = bucketizer.transform(result)

In [67]:
# take_rate is a string in the merchants table; convert it to a double so it
# can be bucketed numerically.
take_rate_as_double = result['take_rate'].cast('double')
result = result.withColumn('take_rate', take_rate_as_double)

In [68]:
# Smallest and largest take_rate, computed in one aggregation job.
# The previous version sorted the column twice and collect()-ed every row to
# the driver just to read the first element of each sort.
take_rate_min, take_rate_max = result.agg(
    F.min('take_rate'),
    F.max('take_rate'),
).first()

                                                                                

In [69]:
# Display the observed take_rate range to guide the choice of bucket edges.
take_rate_range_message = 'take_rate range:[{}, {}]'.format(take_rate_min, take_rate_max)
print(take_rate_range_message)

take_rate range:[0.1, 7.0]


In [70]:
# Bucket take_rate into unit-wide buckets with edges 0, 1, ..., 8;
# invalid values are kept (handleInvalid="keep") rather than raising an error.
take_rate_splits = list(range(9))
bucketizer = Bucketizer(
    splits=take_rate_splits,
    inputCol="take_rate",
    outputCol="take_rate_buckets",
    handleInvalid="keep",
)
result = bucketizer.transform(result)

In [72]:
# Show the schema of the enriched frame, including the two new bucket columns.
result.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- merchant_abn: string (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- merchant_prob: float (nullable = false)
 |-- consumer_prob: float (nullable = false)
 |-- is_fraud: integer (nullable = false)
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: double (nullable = true)
 |-- consumer_id: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- dollar_value_buckets: double (nullable = true)
 |-- take_rate_buckets: double (nullable = true)



### vectorization

In [92]:
#indexed_features = ['user_id','merchant_abn', 'order_datetime', 'tags', 'revenue_level','state',	'postcode',	'gender', 'dollar_value_buckets',	'take_rate_buckets']

In [93]:
# Disabled experiment: the StringIndexer construction below is wrapped in a
# bare string literal, so none of it executes; the cell only evaluates to that
# string. It also depends on `indexed_features`, which is commented out above.
'''
indexers =[]
for col in indexed_features:
  indexers.append(StringIndexer(inputCol=col, outputCol = col+"_index"))

indexers
'''

[StringIndexer_30666376522d,
 StringIndexer_8c0a3bad0cf6,
 StringIndexer_824e0c5a38c7,
 StringIndexer_6f5397d53f92,
 StringIndexer_99ca01fded61,
 StringIndexer_7c0a5183fd6a,
 StringIndexer_e90a613486a4,
 StringIndexer_f8e056a9548e,
 StringIndexer_ef4ddef67ecc,
 StringIndexer_42ec6ad5ce46]

In [94]:
#indexers.append(StringIndexer(inputCol='is_fraud', outputCol = "label"))

In [95]:
#pipeline = Pipeline(stages=indexers)

In [96]:
#Fitting a model to the input dataset. 
#indexed_result = pipeline.fit(result).transform(result)

                                                                                

In [98]:
#vectorAssembler = VectorAssembler(inputCols = ['user_id_index',	'merchant_abn_index',	'order_datetime_index',	'tags_index',	'revenue_level_index',	'state_index',	'postcode_index',	'gender_index',	'dollar_value_buckets_index',	'take_rate_buckets_index'],outputCol = 'features') 
#vindexed_result_df = vectorAssembler.transform(indexed_result) 

+-------+------------+--------------+------------------+-------------+-------------+--------+-----------------------------+---------+-------------+---------+-----------+-----+--------+------+--------------------+-----------------+-------------+------------------+--------------------+----------+-------------------+-----------+--------------+------------+--------------------------+-----------------------+-----+---------------------------------------------------+
|user_id|merchant_abn|order_datetime|dollar_value      |merchant_prob|consumer_prob|is_fraud|name                         |tags     |revenue_level|take_rate|consumer_id|state|postcode|gender|dollar_value_buckets|take_rate_buckets|user_id_index|merchant_abn_index|order_datetime_index|tags_index|revenue_level_index|state_index|postcode_index|gender_index|dollar_value_buckets_index|take_rate_buckets_index|label|features                                           |
+-------+------------+--------------+------------------+-------------+

22/09/17 18:03:55 WARN DAGScheduler: Broadcasting large task binary with size 1253.8 KiB


In [99]:
#vindexed_result_df

22/09/17 18:04:10 WARN DAGScheduler: Broadcasting large task binary with size 1253.8 KiB
22/09/17 18:04:10 WARN DAGScheduler: Broadcasting large task binary with size 1253.8 KiB


user_id,merchant_abn,order_datetime,dollar_value,merchant_prob,consumer_prob,is_fraud,name,tags,revenue_level,take_rate,consumer_id,state,postcode,gender,dollar_value_buckets,take_rate_buckets,user_id_index,merchant_abn_index,order_datetime_index,tags_index,revenue_level_index,state_index,postcode_index,gender_index,dollar_value_buckets_index,take_rate_buckets_index,label,features
18478,62191208634,2021-08-20,63.255848959735246,0.01,0.01,0,Cursus Non Egesta...,furniture,c,2.17,651338,TAS,7001,Male,0.0,2.0,18015.0,165.0,49.0,8.0,2.0,5.0,1235.0,0.0,0.0,2.0,0.0,"[18015.0,165.0,49..."
2,15549624934,2021-08-20,130.3505283105634,0.01,0.01,0,Commodo Associates,opticians,c,2.76,179208,NSW,2782,Female,0.0,2.0,17454.0,984.0,49.0,7.0,2.0,0.0,151.0,1.0,0.0,2.0,0.0,"[17454.0,984.0,49..."
18479,64403598239,2021-08-20,120.15860593212784,0.01,0.01,0,Lobortis Ultrices...,music,a,6.31,467663,TAS,7010,Female,0.0,6.0,13961.0,12.0,49.0,13.0,0.0,5.0,2183.0,1.0,0.0,0.0,0.0,"[13961.0,12.0,49...."
3,60956456424,2021-08-20,136.6785200286976,0.01,0.01,0,Ultricies Digniss...,gift,b,4.69,1194530,NT,862,Female,0.0,4.0,13503.0,16.0,49.0,1.0,1.0,6.0,2762.0,1.0,0.0,3.0,0.0,"[13503.0,16.0,49...."
18479,94493496784,2021-08-20,72.96316578355305,0.01,0.01,0,Dictum Phasellus ...,gift,a,5.65,467663,TAS,7010,Female,0.0,5.0,13961.0,15.0,49.0,1.0,0.0,5.0,2183.0,1.0,0.0,1.0,0.0,"[13961.0,15.0,49...."
3,76819856970,2021-08-20,448.529684285612,0.01,0.01,0,Egestas Blandit Ltd,tent,b,3.19,1194530,NT,862,Female,2.0,3.0,13503.0,160.0,49.0,0.0,1.0,6.0,2762.0,1.0,2.0,4.0,0.0,"[13503.0,160.0,49..."
18479,67609108741,2021-08-20,86.4040605836911,0.01,0.01,0,Metus Sit Amet In...,cable,e,0.38,467663,TAS,7010,Female,0.0,0.0,13961.0,156.0,49.0,5.0,4.0,5.0,2183.0,1.0,0.0,6.0,0.0,"[13961.0,156.0,49..."
3,34096466752,2021-08-20,301.5793450525113,0.01,0.01,0,Nullam Enim Ltd,computer,b,3.22,1194530,NT,862,Female,1.0,3.0,13503.0,134.0,49.0,3.0,1.0,6.0,2762.0,1.0,1.0,4.0,0.0,"[13503.0,134.0,49..."
18482,70501974849,2021-08-20,68.75486276223054,0.01,0.01,0,Facilisis Lorem T...,computer,b,3.3,918448,NSW,1430,Male,0.0,3.0,16772.0,186.0,49.0,3.0,1.0,0.0,2533.0,0.0,0.0,4.0,0.0,"[16772.0,186.0,49..."
4,49891706470,2021-08-20,48.89796461900801,0.01,0.01,0,Non Vestibulum In...,tent,a,5.8,154128,NSW,2780,Female,0.0,5.0,4044.0,3.0,49.0,0.0,0.0,0.0,191.0,1.0,0.0,1.0,0.0,"[4044.0,3.0,49.0,..."


## Try logistic regression

## Prediction Models

Since prediction outputs should be discrete labels (True or False for fraud checking), classification models are selected:
1. Naive Bayes models: not suitable here because the class prior is highly unbalanced
2. Logistic Regression
3. Support Vector Machines (SVMs)
4. Clustering Analysis
5. Decision tree
6. Random Forest
7. Neural network

In [11]:
# import sklearn.naive_bayes as nb
# ##print(dir(nb))
# from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# gnb = GaussianNB()
# mnb = MultinomialNB()
# bnb = BernoulliNB()

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.22, random_state=30034)
# gnb.fit(X_train, y_train)
# acc = gnb.score(X_test, y_test)
# print("\nGNB score %f " %acc)

    
# mnb.fit(X_train, y_train)
# acc = mnb.score(X_test, y_test)
# print("MNB score %f " %acc)

    
# bnb.fit(X_train, y_train)
# acc = bnb.score(X_test, y_test)
# print("BNB score %f " %acc)


    

TypeError: Singleton array array(+-------+--------------+------------+------------------+-------------+-------------+
|user_id|order_datetime|merchant_abn|      dollar_value|merchant_prob|consumer_prob|
+-------+--------------+------------+------------------+-------------+-------------+
|  14935|    2021-11-26| 79417999332|136.06570809815838|         0.01|         0.01|
|      1|    2021-11-26| 46451548968| 72.61581642788431|         0.01|         0.01|
|  14936|    2021-11-26| 89518629617|3.0783487174439297|         0.01|         0.01|
|      1|    2021-11-26| 49167531725| 51.58228625503599|         0.01|         0.01|
|  14936|    2021-11-26| 31101120643|25.228114942417797|         0.01|         0.01|
|      2|    2021-11-26| 67978471888| 691.5028234458998|         0.01|         0.01|
|  14936|    2021-11-26| 60956456424|102.13952056640888|         0.01|         0.01|
|      2|    2021-11-26| 47644196714| 644.5220654863093|         0.01|         0.01|
|  14938|    2021-11-26| 39649557865|209.12780951421405|    28.690205|         0.01|
|      3|    2021-11-26| 88402174457| 141.0387993699113|         0.01|         0.01|
|  14938|    2021-11-26| 32234779638|102.04090324888061|         0.01|         0.01|
|      4|    2021-11-26| 24015173965|             157.0|         0.01|         0.01|
|  14939|    2021-11-26| 79830510987|246.13590152514766|         0.01|         0.01|
|      5|    2021-11-26| 76646764782|35.463129852951695|         0.01|         0.01|
|  14939|    2021-11-26| 17324645993|24.108258533478818|         0.01|         0.01|
|      5|    2021-11-26| 71041015148| 89.76800577775657|         0.01|         0.01|
|  14939|    2021-11-26| 75089928159|14.806519329659501|         0.01|         0.01|
|      5|    2021-11-26| 80324045558| 62.79195913018835|         0.01|         0.01|
|  14939|    2021-11-26| 50315283629| 575.3321071340728|    29.088158|         0.01|
|      6|    2021-11-26| 11566786699|15.856637357709419|         0.01|         0.01|
+-------+--------------+------------+------------------+-------------+-------------+
only showing top 20 rows
, dtype=object) cannot be considered a valid collection.