# Fraud Prediction Modelling

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd

In [2]:
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executer.memory", "4g")
    .getOrCreate()
)

22/09/17 02:14:34 WARN Utils: Your hostname, DESKTOP-80AOBLL resolves to a loopback address: 127.0.1.1; using 172.24.54.25 instead (on interface eth0)
22/09/17 02:14:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/17 02:14:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Training Dataset

In [4]:
transactions = spark.read.parquet('../data/tables/transactions_20210228_20210827_snapshot')\
    .union(spark.read.parquet('../data/tables/transactions_20210828_20220227_snapshot'))

                                                                                

In [5]:
probs_merchant = spark.read.option('header', True).csv('../data/tables/merchant_fraud_probability.csv')
probs_consumer= spark.read.option('header', True).csv('../data/tables/consumer_fraud_probability.csv')

In [5]:
transactions.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)



In [6]:
probs_consumer.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- fraud_probability: string (nullable = true)



In [6]:
# match the data types from transaction file for merging preparation
probs_consumer = probs_consumer.withColumn('user_id', F.col('user_id').cast('long'))\
        .withColumn('order_datetime', F.col('order_datetime').cast('date'))\
        .withColumn('fraud_probability', F.col('fraud_probability').cast('float'))

In [8]:
probs_consumer.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- fraud_probability: float (nullable = true)



In [9]:
probs_merchant.printSchema()

root
 |-- merchant_abn: string (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- fraud_probability: string (nullable = true)



In [7]:
# match the data types from transaction file for merging preparation
probs_merchant = probs_merchant.withColumn('merchant_abn', F.col('merchant_abn').cast('long'))\
        .withColumn('order_datetime', F.col('order_datetime').cast('date'))\
        .withColumn('fraud_probability', F.col('fraud_probability').cast('float'))

In [10]:
probs_merchant.printSchema()

root
 |-- merchant_abn: string (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- fraud_probability: string (nullable = true)



In [11]:
probs_merchant

merchant_abn,order_datetime,fraud_probability
19492220327,2021-11-28,44.40365864749536
31334588839,2021-10-02,42.75530083865367
19492220327,2021-12-22,38.867790051131095
82999039227,2021-12-19,94.1347004808891
90918180829,2021-09-02,43.32551731714902
31334588839,2021-12-26,38.36165958070444
23686790459,2021-12-10,79.4543441508535
14827550074,2021-11-26,46.45775596795885
31334588839,2021-11-26,36.20971272078342
19492220327,2021-12-18,33.819672154331755


In [8]:
# merge transaction file with merchants'/consumers' fraud probability based on merchant abn or user id respectively by left join
result = transactions.join(probs_merchant, on = ['merchant_abn', 'order_datetime'], how = 'left').withColumnRenamed('fraud_probability', 'merchant_prob')
result = result.join(probs_consumer, on = ['user_id', 'order_datetime'], how = 'left').withColumnRenamed('fraud_probability', 'consumer_prob')

In [9]:
# replace all the missing value with 0.01 as default fraud prob
result = result.na.fill(value=0.01, subset=['merchant_prob', 'consumer_prob'])
result

user_id,order_datetime,merchant_abn,dollar_value,order_id,merchant_prob,consumer_prob
18478,2021-08-20,62191208634,63.255848959735246,949a63c8-29f7-4ab...,0.01,0.01
2,2021-08-20,15549624934,130.3505283105634,6a84c3cf-612a-457...,0.01,0.01
18479,2021-08-20,64403598239,120.15860593212784,b10dcc33-e53f-425...,0.01,0.01
3,2021-08-20,60956456424,136.6785200286976,0f09c5a5-784e-447...,0.01,0.01
18479,2021-08-20,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,0.01,0.01
3,2021-08-20,76819856970,448.529684285612,5ace6a24-cdf0-4aa...,0.01,0.01
18479,2021-08-20,67609108741,86.4040605836911,d0e180f0-cb06-42a...,0.01,0.01
3,2021-08-20,34096466752,301.5793450525113,6fb1ff48-24bb-4f9...,0.01,0.01
18482,2021-08-20,70501974849,68.75486276223054,8505fb33-b69a-412...,0.01,0.01
4,2021-08-20,49891706470,48.89796461900801,ed11e477-b09f-4ae...,0.01,0.01


In [10]:
# 50% fraud probility is used as the benchmark for checking fraud transaction
# transactions with either consumer/merchant fraud probability strictly higher than 50% will be considered as fraud data
result = result.withColumn('is_fraud', F.when((F.col('merchant_prob') > 50) | (F.col('consumer_prob') > 50), True).otherwise(False))
result

user_id,order_datetime,merchant_abn,dollar_value,order_id,merchant_prob,consumer_prob,is_fraud
18478,2021-08-20,62191208634,63.255848959735246,949a63c8-29f7-4ab...,0.01,0.01,False
2,2021-08-20,15549624934,130.3505283105634,6a84c3cf-612a-457...,0.01,0.01,False
18479,2021-08-20,64403598239,120.15860593212784,b10dcc33-e53f-425...,0.01,0.01,False
3,2021-08-20,60956456424,136.6785200286976,0f09c5a5-784e-447...,0.01,0.01,False
18479,2021-08-20,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,0.01,0.01,False
3,2021-08-20,76819856970,448.529684285612,5ace6a24-cdf0-4aa...,0.01,0.01,False
18479,2021-08-20,67609108741,86.4040605836911,d0e180f0-cb06-42a...,0.01,0.01,False
3,2021-08-20,34096466752,301.5793450525113,6fb1ff48-24bb-4f9...,0.01,0.01,False
18482,2021-08-20,70501974849,68.75486276223054,8505fb33-b69a-412...,0.01,0.01,False
4,2021-08-20,49891706470,48.89796461900801,ed11e477-b09f-4ae...,0.01,0.01,False


In [11]:
print('In {} transactions, {} are detected as fraud'.format(result.count(), result.filter(F.col('is_fraud') == True).count()))



In 8151584 transactions, 1572 are detected as fraud


                                                                                

In [14]:
result.filter((F.col('merchant_prob') == 50) | (F.col('consumer_prob') == 50)).count()

                                                                                

0

In [13]:
# save the training dataset as parquet
(result.drop('order_id')).write.parquet('../data/curated/transaction_fraud_prob_training.parquet')

                                                                                

## Prediction Models

Since prediction outputs should be discrete labels (True or False for fraud checking), classification models are selected:
1. Naive Bayes Models
2. Logistic Regression
3. Support Vector Machines (SVMs)
4. Clustering Analysis

### Split Dataset

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split, KFold

In [4]:
result = spark.read.parquet('../data/curated/transaction_fraud_prob_training.parquet')

                                                                                

## Lois电脑爆炸了 不知道16gb的内存够不够 可以晚点你(如果你电脑是16gb)试试
先跑y 如果y跑不了就更别提x了

In [5]:
# create attributes and labels
#X = np.array(result.select(result.columns[:-1]).collect())
y = np.array(result.select(result.columns[-1]).collect())
y

AttributeError: 'DataFrame' object has no attribute 'values'

In [11]:
import sklearn.naive_bayes as nb
##print(dir(nb))
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.22, random_state=30034)
gnb.fit(X_train, y_train)
acc = gnb.score(X_test, y_test)
print("\nGNB score %f " %acc)

    
mnb.fit(X_train, y_train)
acc = mnb.score(X_test, y_test)
print("MNB score %f " %acc)

    
bnb.fit(X_train, y_train)
acc = bnb.score(X_test, y_test)
print("BNB score %f " %acc)


    

TypeError: Singleton array array(+-------+--------------+------------+------------------+-------------+-------------+
|user_id|order_datetime|merchant_abn|      dollar_value|merchant_prob|consumer_prob|
+-------+--------------+------------+------------------+-------------+-------------+
|  14935|    2021-11-26| 79417999332|136.06570809815838|         0.01|         0.01|
|      1|    2021-11-26| 46451548968| 72.61581642788431|         0.01|         0.01|
|  14936|    2021-11-26| 89518629617|3.0783487174439297|         0.01|         0.01|
|      1|    2021-11-26| 49167531725| 51.58228625503599|         0.01|         0.01|
|  14936|    2021-11-26| 31101120643|25.228114942417797|         0.01|         0.01|
|      2|    2021-11-26| 67978471888| 691.5028234458998|         0.01|         0.01|
|  14936|    2021-11-26| 60956456424|102.13952056640888|         0.01|         0.01|
|      2|    2021-11-26| 47644196714| 644.5220654863093|         0.01|         0.01|
|  14938|    2021-11-26| 39649557865|209.12780951421405|    28.690205|         0.01|
|      3|    2021-11-26| 88402174457| 141.0387993699113|         0.01|         0.01|
|  14938|    2021-11-26| 32234779638|102.04090324888061|         0.01|         0.01|
|      4|    2021-11-26| 24015173965|             157.0|         0.01|         0.01|
|  14939|    2021-11-26| 79830510987|246.13590152514766|         0.01|         0.01|
|      5|    2021-11-26| 76646764782|35.463129852951695|         0.01|         0.01|
|  14939|    2021-11-26| 17324645993|24.108258533478818|         0.01|         0.01|
|      5|    2021-11-26| 71041015148| 89.76800577775657|         0.01|         0.01|
|  14939|    2021-11-26| 75089928159|14.806519329659501|         0.01|         0.01|
|      5|    2021-11-26| 80324045558| 62.79195913018835|         0.01|         0.01|
|  14939|    2021-11-26| 50315283629| 575.3321071340728|    29.088158|         0.01|
|      6|    2021-11-26| 11566786699|15.856637357709419|         0.01|         0.01|
+-------+--------------+------------+------------------+-------------+-------------+
only showing top 20 rows
, dtype=object) cannot be considered a valid collection.