# Fraud Prediction Modelling

## import library

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd

from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline 
from pyspark.ml.feature import VectorAssembler 
from pyspark.ml.feature import OneHotEncoder
from pyspark.sql.functions import rand
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executer.memory", "4g")
    .getOrCreate()
)

22/09/18 15:57:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## preprocessing

### load data

In [125]:
transactions = spark.read.parquet('../../data/tables/transactions_20210228_20210827_snapshot')\
    .union(spark.read.parquet('../../data/tables/transactions_20210828_20220227_snapshot'))

In [54]:
probs_merchant = spark.read.option('header', True).csv('../../data/tables/merchant_fraud_probability.csv')
probs_consumer= spark.read.option('header', True).csv('../../data/tables/consumer_fraud_probability.csv')

In [55]:
consumers = spark.read.parquet("../../data/curated/consumer/")
merchants = spark.read.option('header',True).csv("../../data/curated/merchant.csv")

In [105]:
mean_total_income = spark.read.option('header', True).csv('../../data/curated/clean_consumer_post_total_income.csv')

### Merge tables

In [107]:
mean_total_income = mean_total_income.select('user_id','Mean_Total_Income')

In [87]:
mean_total_income.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- Mean_Total_Income: double (nullable = true)



In [108]:
mean_total_income = mean_total_income.withColumn('Mean_Total_Income', F.col('Mean_Total_Income').cast('double'))

In [60]:
transactions.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)



In [61]:
probs_consumer.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- fraud_probability: string (nullable = true)



In [62]:
probs_merchant.printSchema()

root
 |-- merchant_abn: string (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- fraud_probability: string (nullable = true)



In [88]:
# match the data types from transaction file for merging preparation
transactions = transactions.withColumn('user_id', F.col('user_id').cast('string'))\
                .withColumn('order_datetime', F.col('order_datetime').cast('string'))\
                .withColumn('merchant_abn', F.col('merchant_abn').cast('string'))
probs_consumer =  probs_consumer.withColumn('fraud_probability', F.col('fraud_probability').cast('float'))

In [89]:
# match the data types from transaction file for merging preparation
probs_merchant = probs_merchant.withColumn('merchant_abn', F.col('merchant_abn').cast('long'))\
        .withColumn('order_datetime', F.col('order_datetime').cast('date'))\
        .withColumn('fraud_probability', F.col('fraud_probability').cast('float'))

In [126]:
# merge transaction file with merchants'/consumers' fraud probability based on merchant abn or user id respectively by left join
result = transactions.join(probs_merchant, on = ['merchant_abn', 'order_datetime'], how = 'left').withColumnRenamed('fraud_probability', 'merchant_prob')
result = result.join(probs_consumer, on = ['user_id', 'order_datetime'], how = 'left').withColumnRenamed('fraud_probability', 'consumer_prob')

In [128]:
# replace all the missing value with 0.01 as default fraud prob
result = result.na.fill(value=0.01, subset=['merchant_prob', 'consumer_prob'])

In [129]:
# We set benchmark as 5%, because we focus on False Positive instead of False Negative
result = result.withColumn('is_fraud', F.when((F.col('merchant_prob') > 5) | (F.col('consumer_prob') > 5), 1).otherwise(0))
result

user_id,order_datetime,merchant_abn,dollar_value,order_id,merchant_prob,consumer_prob,is_fraud
18478,2021-08-20,62191208634,63.255848959735246,949a63c8-29f7-4ab...,0.01,0.01,0
2,2021-08-20,15549624934,130.3505283105634,6a84c3cf-612a-457...,0.01,0.01,0
18479,2021-08-20,64403598239,120.15860593212784,b10dcc33-e53f-425...,0.01,0.01,0
3,2021-08-20,60956456424,136.6785200286976,0f09c5a5-784e-447...,0.01,0.01,0
18479,2021-08-20,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,0.01,0.01,0
3,2021-08-20,76819856970,448.529684285612,5ace6a24-cdf0-4aa...,0.01,0.01,0
18479,2021-08-20,67609108741,86.4040605836911,d0e180f0-cb06-42a...,0.01,0.01,0
3,2021-08-20,34096466752,301.5793450525113,6fb1ff48-24bb-4f9...,0.01,0.01,0
18482,2021-08-20,70501974849,68.75486276223054,8505fb33-b69a-412...,0.01,0.01,0
4,2021-08-20,49891706470,48.89796461900801,ed11e477-b09f-4ae...,0.01,0.01,0


In [112]:
print('In {} transactions, {} are detected as fraud'.format(result.count(), result.filter(F.col('is_fraud') == 1).count()))



In 8151584 transactions, 84214 are detected as fraud


                                                                                

In [130]:
# save the training dataset as parquet
result = result.drop('order_id')
result.cache()

                                                                                

user_id,order_datetime,merchant_abn,dollar_value,merchant_prob,consumer_prob,is_fraud
18478,2021-08-20,62191208634,63.255848959735246,0.01,0.01,0
2,2021-08-20,15549624934,130.3505283105634,0.01,0.01,0
18479,2021-08-20,64403598239,120.15860593212784,0.01,0.01,0
3,2021-08-20,60956456424,136.6785200286976,0.01,0.01,0
18479,2021-08-20,94493496784,72.96316578355305,0.01,0.01,0
3,2021-08-20,76819856970,448.529684285612,0.01,0.01,0
18479,2021-08-20,67609108741,86.4040605836911,0.01,0.01,0
3,2021-08-20,34096466752,301.5793450525113,0.01,0.01,0
18482,2021-08-20,70501974849,68.75486276223054,0.01,0.01,0
4,2021-08-20,49891706470,48.89796461900801,0.01,0.01,0


In [114]:
consumers.printSchema()

root
 |-- consumer_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- gender: string (nullable = true)



In [71]:
merchants.printSchema()

root
 |-- merchant_abn: string (nullable = true)
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: string (nullable = true)



In [131]:
result  = result.join(merchants, on='merchant_abn', how = 'left')
result = result.join(consumers, on='user_id', how = 'left')
result.cache()

                                                                                

user_id,merchant_abn,order_datetime,dollar_value,merchant_prob,consumer_prob,is_fraud,name,tags,revenue_level,take_rate,consumer_id,state,postcode,gender
18478,62191208634,2021-08-20,63.255848959735246,0.01,0.01,0,Cursus Non Egesta...,furniture,c,2.17,651338,TAS,7001,Male
2,15549624934,2021-08-20,130.3505283105634,0.01,0.01,0,Commodo Associates,opticians,c,2.76,179208,NSW,2782,Female
18479,64403598239,2021-08-20,120.15860593212784,0.01,0.01,0,Lobortis Ultrices...,music,a,6.31,467663,TAS,7010,Female
3,60956456424,2021-08-20,136.6785200286976,0.01,0.01,0,Ultricies Digniss...,gift,b,4.69,1194530,NT,862,Female
18479,94493496784,2021-08-20,72.96316578355305,0.01,0.01,0,Dictum Phasellus ...,gift,a,5.65,467663,TAS,7010,Female
3,76819856970,2021-08-20,448.529684285612,0.01,0.01,0,Egestas Blandit Ltd,tent,b,3.19,1194530,NT,862,Female
18479,67609108741,2021-08-20,86.4040605836911,0.01,0.01,0,Metus Sit Amet In...,cable,e,0.38,467663,TAS,7010,Female
3,34096466752,2021-08-20,301.5793450525113,0.01,0.01,0,Nullam Enim Ltd,computer,b,3.22,1194530,NT,862,Female
18482,70501974849,2021-08-20,68.75486276223054,0.01,0.01,0,Facilisis Lorem T...,computer,b,3.3,918448,NSW,1430,Male
4,49891706470,2021-08-20,48.89796461900801,0.01,0.01,0,Non Vestibulum In...,tent,a,5.8,154128,NSW,2780,Female


In [132]:
result = result.filter(F.col('dollar_value') >= 1).na.drop(subset = 'name')

In [171]:
result.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- merchant_prob: float (nullable = false)
 |-- consumer_prob: float (nullable = false)
 |-- is_fraud: integer (nullable = false)
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: float (nullable = true)
 |-- consumer_id: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- Mean_Total_Income: double (nullable = true)



In [170]:
result = result.withColumn('user_id', F.col('user_id').cast('string'))\
          .withColumn('order_datetime', F.col('order_datetime').cast('string'))\
          .withColumn('take_rate', F.col('take_rate').cast('float'))

In [138]:
result = result.join(mean_total_income, on = 'user_id', how = "left")
result.cache()

                                                                                

user_id,merchant_abn,order_datetime,dollar_value,merchant_prob,consumer_prob,is_fraud,name,tags,revenue_level,take_rate,consumer_id,state,postcode,gender,Mean_Total_Income
18478,62191208634,2021-08-20,63.255848959735246,0.01,0.01,0,Cursus Non Egesta...,furniture,c,2.17,651338,TAS,7001,Male,64714.0
2,15549624934,2021-08-20,130.3505283105634,0.01,0.01,0,Commodo Associates,opticians,c,2.76,179208,NSW,2782,Female,61938.0
18479,64403598239,2021-08-20,120.15860593212784,0.01,0.01,0,Lobortis Ultrices...,music,a,6.31,467663,TAS,7010,Female,49154.0
3,60956456424,2021-08-20,136.6785200286976,0.01,0.01,0,Ultricies Digniss...,gift,b,4.69,1194530,NT,862,Female,44246.0
18479,94493496784,2021-08-20,72.96316578355305,0.01,0.01,0,Dictum Phasellus ...,gift,a,5.65,467663,TAS,7010,Female,49154.0
3,76819856970,2021-08-20,448.529684285612,0.01,0.01,0,Egestas Blandit Ltd,tent,b,3.19,1194530,NT,862,Female,44246.0
18479,67609108741,2021-08-20,86.4040605836911,0.01,0.01,0,Metus Sit Amet In...,cable,e,0.38,467663,TAS,7010,Female,49154.0
3,34096466752,2021-08-20,301.5793450525113,0.01,0.01,0,Nullam Enim Ltd,computer,b,3.22,1194530,NT,862,Female,44246.0
18482,70501974849,2021-08-20,68.75486276223054,0.01,0.01,0,Facilisis Lorem T...,computer,b,3.3,918448,NSW,1430,Male,78871.0
4,49891706470,2021-08-20,48.89796461900801,0.01,0.01,0,Non Vestibulum In...,tent,a,5.8,154128,NSW,2780,Female,54212.0


In [139]:
result.filter(F.col('Mean_Total_Income').isNull()).count()

                                                                                

0

In [146]:
result.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- merchant_prob: float (nullable = false)
 |-- consumer_prob: float (nullable = false)
 |-- is_fraud: integer (nullable = false)
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: string (nullable = true)
 |-- consumer_id: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- Mean_Total_Income: double (nullable = true)



## Feature Engineering

### bin numeric features

In [59]:
#value_max = result.select('dollar_value').orderBy(F.col('dollar_value'),  ascending= False).collect()[0][0]

                                                                                

In [60]:
#value_min = result.select('dollar_value').orderBy(F.col('dollar_value')).collect()[0][0]

                                                                                

In [61]:
#print('dollar_value range:[{}, {}]'.format(value_min, value_max))

dollar_value range:[1.0000012988409446, 105193.88578925544]


In [62]:
# bins = []
# i  = 0
# while i <= 70000:
#   bins.append(i)
#   i += 200

In [63]:
#bins.append(float('Inf'))

In [64]:
#bucketizer = Bucketizer(splits=bins, inputCol="dollar_value", outputCol="dollar_value_buckets")
#result = bucketizer.setHandleInvalid("keep").transform(result)

In [67]:
#result = result.withColumn('take_rate', F.col('take_rate').cast('double'))

In [68]:
# take_rate_max = result.select('take_rate').orderBy(F.col('take_rate'),  ascending= False).collect()[0][0]
# take_rate_min = result.select('take_rate').orderBy(F.col('take_rate')).collect()[0][0]

                                                                                

In [69]:
# print('take_rate range:[{}, {}]'.format(take_rate_min, take_rate_max))

take_rate range:[0.1, 7.0]


In [70]:
# bucketizer = Bucketizer(splits=[0,1,2,3,4,5,6,7,8], inputCol="take_rate", outputCol="take_rate_buckets")
# result = bucketizer.setHandleInvalid("keep").transform(result)

In [72]:
# result.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- merchant_abn: string (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- merchant_prob: float (nullable = false)
 |-- consumer_prob: float (nullable = false)
 |-- is_fraud: integer (nullable = false)
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: double (nullable = true)
 |-- consumer_id: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- dollar_value_buckets: double (nullable = true)
 |-- take_rate_buckets: double (nullable = true)



### Index ordinal features

In [152]:
indexed_features = ['order_datetime', 'revenue_level','tags', 'gender']

In [173]:
# We give all values in non-numeric features an index in order to make it ordinal or one-hot encoded
indexers =[]
for col in indexed_features:
  indexers.append(StringIndexer(inputCol=col, outputCol = col+"_index"))

indexers

[StringIndexer_512a0ca8904c,
 StringIndexer_2ad9266926e1,
 StringIndexer_10b1aef82d20,
 StringIndexer_7eae7ef488c3]

In [174]:
pipeline = Pipeline(stages=indexers)

In [175]:
indexed_result = pipeline.fit(result).transform(result)

                                                                                

In [156]:
indexed_result

user_id,merchant_abn,order_datetime,dollar_value,merchant_prob,consumer_prob,is_fraud,name,tags,revenue_level,take_rate,consumer_id,state,postcode,gender,Mean_Total_Income,order_datetime_index,revenue_level_index,tags_index,gender_index
18478,62191208634,2021-08-20,63.255848959735246,0.01,0.01,0,Cursus Non Egesta...,furniture,c,2.17,651338,TAS,7001,Male,64714.0,49.0,2.0,8.0,0.0
2,15549624934,2021-08-20,130.3505283105634,0.01,0.01,0,Commodo Associates,opticians,c,2.76,179208,NSW,2782,Female,61938.0,49.0,2.0,7.0,1.0
18479,64403598239,2021-08-20,120.15860593212784,0.01,0.01,0,Lobortis Ultrices...,music,a,6.31,467663,TAS,7010,Female,49154.0,49.0,0.0,13.0,1.0
3,60956456424,2021-08-20,136.6785200286976,0.01,0.01,0,Ultricies Digniss...,gift,b,4.69,1194530,NT,862,Female,44246.0,49.0,1.0,1.0,1.0
18479,94493496784,2021-08-20,72.96316578355305,0.01,0.01,0,Dictum Phasellus ...,gift,a,5.65,467663,TAS,7010,Female,49154.0,49.0,0.0,1.0,1.0
3,76819856970,2021-08-20,448.529684285612,0.01,0.01,0,Egestas Blandit Ltd,tent,b,3.19,1194530,NT,862,Female,44246.0,49.0,1.0,0.0,1.0
18479,67609108741,2021-08-20,86.4040605836911,0.01,0.01,0,Metus Sit Amet In...,cable,e,0.38,467663,TAS,7010,Female,49154.0,49.0,4.0,5.0,1.0
3,34096466752,2021-08-20,301.5793450525113,0.01,0.01,0,Nullam Enim Ltd,computer,b,3.22,1194530,NT,862,Female,44246.0,49.0,1.0,3.0,1.0
18482,70501974849,2021-08-20,68.75486276223054,0.01,0.01,0,Facilisis Lorem T...,computer,b,3.3,918448,NSW,1430,Male,78871.0,49.0,1.0,3.0,0.0
4,49891706470,2021-08-20,48.89796461900801,0.01,0.01,0,Non Vestibulum In...,tent,a,5.8,154128,NSW,2780,Female,54212.0,49.0,0.0,0.0,1.0


### One hot encoding

In [176]:
# one-hot-encoding the numeric indices
ohe = []
ohe.append(OneHotEncoder(inputCol="tags_index", outputCol="tagsOHE"))
ohe.append(OneHotEncoder(inputCol="gender_index", outputCol="genderOHE"))

In [177]:
pipeline = Pipeline(stages=ohe)

In [178]:
indexed_result = pipeline.fit(indexed_result).transform(indexed_result)

In [166]:
indexed_result

user_id,merchant_abn,order_datetime,dollar_value,merchant_prob,consumer_prob,is_fraud,name,tags,revenue_level,take_rate,consumer_id,state,postcode,gender,Mean_Total_Income,order_datetime_index,revenue_level_index,tags_index,gender_index,tagsOHE,genderOHE
18478,62191208634,2021-08-20,63.255848959735246,0.01,0.01,0,Cursus Non Egesta...,furniture,c,2.17,651338,TAS,7001,Male,64714.0,49.0,2.0,8.0,0.0,"(23,[8],[1.0])","(2,[0],[1.0])"
2,15549624934,2021-08-20,130.3505283105634,0.01,0.01,0,Commodo Associates,opticians,c,2.76,179208,NSW,2782,Female,61938.0,49.0,2.0,7.0,1.0,"(23,[7],[1.0])","(2,[1],[1.0])"
18479,64403598239,2021-08-20,120.15860593212784,0.01,0.01,0,Lobortis Ultrices...,music,a,6.31,467663,TAS,7010,Female,49154.0,49.0,0.0,13.0,1.0,"(23,[13],[1.0])","(2,[1],[1.0])"
3,60956456424,2021-08-20,136.6785200286976,0.01,0.01,0,Ultricies Digniss...,gift,b,4.69,1194530,NT,862,Female,44246.0,49.0,1.0,1.0,1.0,"(23,[1],[1.0])","(2,[1],[1.0])"
18479,94493496784,2021-08-20,72.96316578355305,0.01,0.01,0,Dictum Phasellus ...,gift,a,5.65,467663,TAS,7010,Female,49154.0,49.0,0.0,1.0,1.0,"(23,[1],[1.0])","(2,[1],[1.0])"
3,76819856970,2021-08-20,448.529684285612,0.01,0.01,0,Egestas Blandit Ltd,tent,b,3.19,1194530,NT,862,Female,44246.0,49.0,1.0,0.0,1.0,"(23,[0],[1.0])","(2,[1],[1.0])"
18479,67609108741,2021-08-20,86.4040605836911,0.01,0.01,0,Metus Sit Amet In...,cable,e,0.38,467663,TAS,7010,Female,49154.0,49.0,4.0,5.0,1.0,"(23,[5],[1.0])","(2,[1],[1.0])"
3,34096466752,2021-08-20,301.5793450525113,0.01,0.01,0,Nullam Enim Ltd,computer,b,3.22,1194530,NT,862,Female,44246.0,49.0,1.0,3.0,1.0,"(23,[3],[1.0])","(2,[1],[1.0])"
18482,70501974849,2021-08-20,68.75486276223054,0.01,0.01,0,Facilisis Lorem T...,computer,b,3.3,918448,NSW,1430,Male,78871.0,49.0,1.0,3.0,0.0,"(23,[3],[1.0])","(2,[0],[1.0])"
4,49891706470,2021-08-20,48.89796461900801,0.01,0.01,0,Non Vestibulum In...,tent,a,5.8,154128,NSW,2780,Female,54212.0,49.0,0.0,0.0,1.0,"(23,[0],[1.0])","(2,[1],[1.0])"


### Feature Selection and Vectorization

In [167]:
feature_selected = ['dollar_value','take_rate','Mean_Total_Income','order_datetime_index','revenue_level_index','tagsOHE','genderOHE']

In [179]:
assembler = VectorAssembler(inputCols=feature_selected ,outputCol='features')

In [180]:
output = assembler.transform(indexed_result)

In [185]:
final_data = output.select('features','is_fraud')

In [189]:
(final_data.filter(F.col('is_fraud') == 1).count())/(final_data.filter(F.col('is_fraud') == 0).count())

0.009780748354606692

In [265]:
final_data.filter(F.col('is_fraud') == 1).count(), final_data.filter(F.col('is_fraud') == 0).count()

(75108, 7679167)

In [279]:
# imbalanced distribution of two classes. We decide to split the data according to their class and make the distribution balanced
fraud_data = final_data.filter(F.col('is_fraud') == 1)
normal_data = final_data.filter(F.col('is_fraud') == 0).randomSplit([0.01,0.99])[0]

## Split data

In [280]:
train_fraud,test_fraud = fraud_data.randomSplit([0.7,0.3])
train_normal,test_normal = normal_data.randomSplit([0.7,0.3])

In [281]:
train_data = train_fraud.union(train_normal).orderBy(rand())
test_data = test_fraud.union(test_normal).orderBy(rand())

In [213]:
final_data.count()

7754275

In [282]:
train_data.count(),test_data.count()

                                                                                

(106326, 45619)

## Logistic Regression

In [283]:
lr = LogisticRegression(labelCol='is_fraud')

In [284]:
fitted_model = lr.fit(train_data)

                                                                                

### Evaluation

In [285]:
pred_and_labels = fitted_model.evaluate(test_data)

                                                                                

In [286]:
score_and_label = pred_and_labels.predictions.select('prediction', 'is_fraud').withColumnRenamed('is_fraud', 'label')

In [287]:
evaluator = MulticlassClassificationEvaluator()

In [288]:
evaluator.setPredictionCol("prediction")

MulticlassClassificationEvaluator_4323a28ff527

In [294]:
print("Accuracy: " + str(evaluator.evaluate(score_and_label)))



Accuracy: 0.7381659159352582


                                                                                

In [290]:
tp = score_and_label.filter((F.col('prediction') == 1) & (F.col('label') == 1)).count()

                                                                                

In [291]:
fn = score_and_label.filter((F.col('prediction') == 0) & (F.col('label') == 1)).count()

                                                                                

In [293]:
recall = tp/(tp+fn)
print('recall: ' + str(recall))

recall: 0.6422205471555891


## 

Prediction Models

Since prediction outputs should be discrete labels (True or False for fraud checking), classification models are selected:
1. Naive Bayes Models:  unbalanced prior, not suitable
2. Logistic Regression
3. Support Vector Machines (SVMs)
4. Clustering Analysis
5. Dicision tree
6. Random Forest
7. Neural network