In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from pyspark.sql.functions import *
from pyspark.sql.types import DateType
from pyspark.sql import SparkSession, DataFrame

In [2]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

from pyspark.ml import Pipeline
from pyspark.sql.types import FloatType
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator

In [3]:
# Build (or reuse, via getOrCreate) the SparkSession used by every cell below.
sp = (
    SparkSession.builder.appName("Model")
    # Session time zone is pinned to the fixed offset "+11".
    .config("spark.sql.session.timeZone", "+11")
    # Driver and executors are each given 10 GB.
    .config("spark.driver.memory", "10g")
    .config("spark.executor.memory", "10g")
    .config('spark.sql.parquet.cacheMetadata', 'True')
    .getOrCreate()
)
# Bare expression: renders the session summary as the cell output.
sp

22/10/08 12:22:10 WARN Utils: Your hostname, J-L resolves to a loopback address: 127.0.1.1; using 172.28.113.244 instead (on interface eth0)
22/10/08 12:22:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/08 12:22:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the three pre-processed tables from parquet.
# NOTE(review): parquet files embed their own schema, so the "inferSchema"
# option (a CSV/JSON reader option) presumably has no effect here — confirm
# and drop it if so.
transactions = sp.read.option("inferSchema", True).parquet("../data/processed/transactions")
merchants = sp.read.option("inferSchema", True).parquet("../data/processed/merchants")
customers = sp.read.option("inferSchema", True).parquet("../data/processed/customers")

                                                                                

In [5]:
# Peek at one row of each table. The two show() calls print their tables to
# stdout; only the final head(1) is rendered as the cell's result value.
transactions.show(1)
merchants.show(1)
customers.head(1)

+--------+-------+------------+------------+--------------+-----------+-----------------+-------+----------+-----+---------+
|order_id|user_id|merchant_abn|dollar_value|order_datetime|Natural_var|Potential_Outlier|holiday|dayofmonth|month|dayofweek|
+--------+-------+------------+------------+--------------+-----------+-----------------+-------+----------+-----+---------+
|       3|      3| 60956456424|      136.68|    2021-08-20|          0|                0|      0|        20|    8|        6|
+--------+-------+------------+------------+--------------+-----------+-----------------+-------+----------+-----+---------+
only showing top 1 row

+------------+-------------+--------------+--------+----+---------------+---------------+----------------+-----------------+
|merchant_abn|         name|Earnings_Class|BNPL_Fee|tags|avg_monthly_inc|monthly_entropy|postcode_entropy|          revenue|
+------------+-------------+--------------+--------+----+---------------+---------------+------------

[Row(state='ACT', postcode=200, gender='Female', user_id=71674, Number of individuals lodging an income tax return=5524, Average taxable income or loss=66722, Median taxable income or loss=52958, Proportion with salary or wages=1, Count salary or wages=5009, Average salary or wages=64930, Median salary or wages=55579, Proportion with net rent=1, Count net rent=762, Average net rent=-4289, Median net rent=-2448, Average total income or loss=68991, Median total income or loss=54988, Average total deductions=2244, Median total deductions=872, Proportion with total business income=1, Count total business income=382, Average total business income=56170, Median total business income=18742, Proportion with total business expenses=1, Count total business expenses=343, Average total business expenses=42645, Median total business expenses=8664, Proportion with net tax=1, Count net tax=4586, Average net tax=18805, Median net tax=11482, Count super total accounts balance=7620, Average super total 

### PROCESSING CUSTOMER FRAUD DATA

In [6]:
# Load the customer fraud table and make sure order_datetime is a DateType
# column before it is used elsewhere in the notebook.
c_fraud = (
    sp.read.option("inferSchema", True)
    .parquet("../data/curated/customer_fraud")
    .withColumn("order_datetime", col("order_datetime").cast(DateType()))
)
c_fraud.show(2)

+-------+--------------+-----------------+
|user_id|order_datetime|fraud_probability|
+-------+--------------+-----------------+
|   6228|    2021-12-19|         97.62981|
|  21419|    2021-12-10|         99.24738|
+-------+--------------+-----------------+
only showing top 2 rows



In [7]:
# Attach the fraud probability recorded for a (user_id, order_datetime) pair to
# every transaction with the same pair. join() without `how` is an inner join,
# so transactions without a matching fraud record are discarded.
c_fraud_full = transactions.join(c_fraud, on=["user_id", "order_datetime"])
c_fraud_full.show(2)

+-------+--------------+--------+------------+------------+-----------+-----------------+-------+----------+-----+---------+-----------------+
|user_id|order_datetime|order_id|merchant_abn|dollar_value|Natural_var|Potential_Outlier|holiday|dayofmonth|month|dayofweek|fraud_probability|
+-------+--------------+--------+------------+------------+-----------+-----------------+-------+----------+-----+---------+-----------------+
|    448|    2021-08-20|    1005| 94380689142|     6263.03|          0|                0|      0|        20|    8|        6|        14.681704|
|   3116|    2021-08-20|    6989| 22248828825|     3958.86|          0|                0|      0|        20|    8|        6|         8.809071|
+-------+--------------+--------+------------+------------+-----------+-----------------+-------+----------+-----+---------+-----------------+
only showing top 2 rows



In [8]:
# Number of transactions that have a matching fraud-probability record.
c_fraud_full.count()

                                                                                

80560

In [9]:
# Enrich each fraud-labelled transaction with merchant attributes (joined on
# merchant_abn) and customer attributes (joined on user_id). Both are inner
# joins, so rows without a match on either side are dropped.
X = c_fraud_full.join(merchants, on="merchant_abn").join(customers, on="user_id")
X.head(1)

[Row(user_id=448, merchant_abn=94380689142, order_datetime=datetime.date(2021, 8, 20), order_id=1005, dollar_value=6263.02978515625, Natural_var=0, Potential_Outlier=0, holiday=0, dayofmonth=20, month=8, dayofweek=6, fraud_probability=14.681703567504883, name='Aliquet Ltd', Earnings_Class='b', BNPL_Fee=3.77, tags=12, avg_monthly_inc=0.0, monthly_entropy=2.710181474685669, postcode_entropy=4.060055732727051, revenue=241562.580078125, state='WA', postcode=6170, gender='Female', Number of individuals lodging an income tax return=4994, Average taxable income or loss=56564, Median taxable income or loss=44772, Proportion with salary or wages=1, Count salary or wages=3916, Average salary or wages=57393, Median salary or wages=49510, Proportion with net rent=1, Count net rent=690, Average net rent=863, Median net rent=255, Average total income or loss=59730, Median total income or loss=47123, Average total deductions=2865, Median total deductions=598, Proportion with total business income=1, 

### Dropping Columns

In [10]:
# Remove the identifier/name columns, the raw order date, and the "postcode"
# and "holiday" columns so they are not fed to the model.
X = X.drop("user_id", "merchant_abn", "order_datetime", "order_id", "name", "postcode", "holiday")
X.printSchema()

root
 |-- dollar_value: float (nullable = true)
 |-- Natural_var: integer (nullable = true)
 |-- Potential_Outlier: integer (nullable = true)
 |-- dayofmonth: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- dayofweek: integer (nullable = true)
 |-- fraud_probability: float (nullable = true)
 |-- Earnings_Class: string (nullable = true)
 |-- BNPL_Fee: double (nullable = true)
 |-- tags: integer (nullable = true)
 |-- avg_monthly_inc: float (nullable = true)
 |-- monthly_entropy: float (nullable = true)
 |-- postcode_entropy: float (nullable = true)
 |-- revenue: double (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- Number of individuals lodging an income tax return: long (nullable = true)
 |-- Average taxable income or loss: long (nullable = true)
 |-- Median taxable income or loss: long (nullable = true)
 |-- Proportion with salary or wages: long (nullable = true)
 |-- Count salary or wages: long (nullable = true)


Categorical
- holiday (done)
- dayofmonth ?
- dayofweek
- month (done)
- tags
- state
- gender
- postcode

In [11]:
def category_processing(data: DataFrame, outcome: str):
    """
    One-hot encode the categorical columns of `data`.

    Every column named in `categorical_cols` is string-indexed and then one-hot
    encoded into a new `<column>_encoded` vector column. The raw column and its
    intermediate `<column>_index` column are removed from the returned frame.

    The indexers and encoders are fitted on `data` itself.
    `outcome` is accepted but not used by this function.
    """
    categorical_cols = (
        "dayofmonth",
        "dayofweek",
        "month",
        "tags",
        "state",
        "gender",
        "Earnings_Class",
    )

    # All indexers come first, followed by all encoders.
    stages = [
        StringIndexer(inputCol=name, outputCol=name + "_index")
        for name in categorical_cols
    ]
    stages += [
        OneHotEncoder(inputCol=name + "_index", outputCol=name + "_encoded")
        for name in categorical_cols
    ]

    encoded = Pipeline(stages=stages).fit(data).transform(data)

    # Only the encoded vectors are kept; raw and indexed columns are discarded.
    leftovers = [
        column
        for name in categorical_cols
        for column in (name, name + "_index")
    ]
    return encoded.drop(*leftovers)

In [12]:
# One-hot encode the categorical columns of X. The second argument is not used
# inside category_processing, so the literal "outcome" has no effect.
category_processed = category_processing(X, "outcome")
category_processed.head(1)

                                                                                

[Row(dollar_value=6263.02978515625, Natural_var=0, Potential_Outlier=0, fraud_probability=14.681703567504883, BNPL_Fee=3.77, avg_monthly_inc=0.0, monthly_entropy=2.710181474685669, postcode_entropy=4.060055732727051, revenue=241562.580078125, Number of individuals lodging an income tax return=4994, Average taxable income or loss=56564, Median taxable income or loss=44772, Proportion with salary or wages=1, Count salary or wages=3916, Average salary or wages=57393, Median salary or wages=49510, Proportion with net rent=1, Count net rent=690, Average net rent=863, Median net rent=255, Average total income or loss=59730, Median total income or loss=47123, Average total deductions=2865, Median total deductions=598, Proportion with total business income=1, Count total business income=457, Average total business income=93034, Median total business income=32873, Proportion with total business expenses=1, Count total business expenses=436, Average total business expenses=76035, Median total bu

## TRAIN TEST SPLIT

In [13]:
from pyspark.ml.feature import Bucketizer

# Discretise fraud_probability into ten classes (0.0 ... 9.0) using edges at
# every multiple of 10 from 0 to 100, then discard the continuous column so
# that only the class label remains.
buckets = Bucketizer(
    inputCol="fraud_probability",
    outputCol="fraud_buckets",
    splits=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
)
X_bucks = buckets.transform(category_processed).drop("fraud_probability")

X_bucks.head(1)

                                                                                

[Row(dollar_value=6263.02978515625, Natural_var=0, Potential_Outlier=0, BNPL_Fee=3.77, avg_monthly_inc=0.0, monthly_entropy=2.710181474685669, postcode_entropy=4.060055732727051, revenue=241562.580078125, Number of individuals lodging an income tax return=4994, Average taxable income or loss=56564, Median taxable income or loss=44772, Proportion with salary or wages=1, Count salary or wages=3916, Average salary or wages=57393, Median salary or wages=49510, Proportion with net rent=1, Count net rent=690, Average net rent=863, Median net rent=255, Average total income or loss=59730, Median total income or loss=47123, Average total deductions=2865, Median total deductions=598, Proportion with total business income=1, Count total business income=457, Average total business income=93034, Median total business income=32873, Proportion with total business expenses=1, Count total business expenses=436, Average total business expenses=76035, Median total business expenses=20422, Proportion with

In [13]:
# Class balance: number of rows in each fraud bucket, in bucket order.
X_bucks.groupBy("fraud_buckets").count().orderBy("fraud_buckets").show()



+-------------+-----+
|fraud_buckets|count|
+-------------+-----+
|          0.0|22923|
|          1.0|38611|
|          2.0| 6113|
|          3.0| 1934|
|          4.0|  990|
|          5.0|  576|
|          6.0|  360|
|          7.0|  193|
|          8.0|  102|
|          9.0|   11|
+-------------+-----+



                                                                                

In [14]:
from functools import reduce

In [15]:

# Oversampling factor per fraud bucket (list index == bucket number).
# Buckets 0-2 are kept as they are, so their entries are never read.
fractions = [0, 0, 0, 2, 4, 7, 15, 20, 35, 250]

# Buckets 3-9: sample with replacement at the configured factor.
oversampled = [
    X_bucks.filter(X_bucks.fraud_buckets == float(bucket)).sample(
        withReplacement=True, fraction=float(fractions[bucket]), seed=69
    )
    for bucket in range(3, 10)
]
# Buckets 0-2: taken unchanged.
untouched = [
    X_bucks.filter(X_bucks.fraud_buckets == float(bucket))
    for bucket in range(0, 3)
]

# NOTE(review): this resampling runs before the train/val/test split, so copies
# of the same source row can end up in more than one split.
X_adjusted = reduce(DataFrame.unionAll, oversampled + untouched)

X_adjusted.count()

                                                                                

95171

In [16]:
# Class balance after oversampling buckets 3-9.
X_adjusted.groupBy("fraud_buckets").count().orderBy("fraud_buckets").show()



+-------------+-----+
|fraud_buckets|count|
+-------------+-----+
|          0.0|22923|
|          1.0|38611|
|          2.0| 6113|
|          3.0| 3910|
|          4.0| 3931|
|          5.0| 4025|
|          6.0| 5450|
|          7.0| 3876|
|          8.0| 3583|
|          9.0| 2749|
+-------------+-----+



                                                                                

In [19]:
# Random 70 / 20 / 10 split with a fixed seed.
# NOTE(review): X_adjusted was built by sampling with replacement, so duplicates
# of one original row may be spread across train, val and test.
train, val, test = X_adjusted.randomSplit([0.7, 0.2, 0.1], seed=69)

# Each count() is a separate Spark action over the split.
print(train.count())
print(val.count())
test.count()

22/10/08 12:16:11 WARN DAGScheduler: Broadcasting large task binary with size 1318.4 KiB


                                                                                

66654
22/10/08 12:16:58 WARN DAGScheduler: Broadcasting large task binary with size 1318.4 KiB


                                                                                

18834
22/10/08 12:17:35 WARN DAGScheduler: Broadcasting large task binary with size 1318.4 KiB


                                                                                

9683

In [20]:
def process_numerical(data: DataFrame, fit_data: "DataFrame | None" = None):
    """
    Standardise the numerical columns of `data` into a single `scaled` vector column.

    Parameters
    ----------
    data : DataFrame
        Frame to transform. It must contain every column listed in `columns`.
    fit_data : DataFrame, optional
        Frame from which the StandardScaler statistics are estimated. When
        omitted, the scaler is fitted on `data` itself (the original
        behaviour). Pass the training split when transforming validation or
        test data so that every split is scaled with the same statistics and
        nothing is learned from held-out rows.

    Returns
    -------
    DataFrame
        `data` with the raw numerical columns and the intermediate `to_scale`
        vector removed, and a new `scaled` vector column added.
    """
    # Columns that are assembled into one vector and scaled together.
    columns = [
        'dollar_value', 'avg_monthly_inc', 'BNPL_Fee',
        'monthly_entropy', 'postcode_entropy', 'revenue',
        'Number of individuals lodging an income tax return',
        'Average taxable income or loss', 'Median taxable income or loss',
        'Proportion with salary or wages', 'Count salary or wages',
        'Average salary or wages', 'Median salary or wages',
        'Proportion with net rent', 'Count net rent', 'Average net rent',
        'Median net rent', 'Average total income or loss',
        'Median total income or loss', 'Average total deductions',
        'Median total deductions', 'Proportion with total business income',
        'Count total business income', 'Average total business income',
        'Median total business income', 'Proportion with total business expenses',
        'Count total business expenses', 'Average total business expenses',
        'Median total business expenses', 'Proportion with net tax',
        'Count net tax', 'Average net tax', 'Median net tax',
        'Count super total accounts balance',
        'Average super total accounts balance',
        'Median super total accounts balance',
    ]

    va = VectorAssembler(inputCols=columns, outputCol="to_scale")
    sc = StandardScaler(inputCol="to_scale", outputCol="scaled")

    va_data = va.transform(data)
    # Fit on the reference frame when one is supplied, otherwise on the data
    # being transformed (keeps existing callers' results unchanged).
    reference = va_data if fit_data is None else va.transform(fit_data)
    scaled = sc.fit(reference).transform(va_data)

    # Keep only the scaled vector; drop the raw inputs and the assembled vector.
    return scaled.drop(*columns, "to_scale")

In [21]:
# Scale the numerical columns of each split.
# NOTE(review): process_numerical fits its StandardScaler on the frame it is
# handed, so train, val and test are each standardised with their own
# statistics rather than with the training split's.
train_processed = process_numerical(train)
val_processed = process_numerical(val)
test_processed = process_numerical(test)

# Only the last head(1) is rendered as the cell result.
train_processed.head(1)
val_processed.head(1)
test_processed.head(1)

22/10/08 12:18:31 WARN DAGScheduler: Broadcasting large task binary with size 1365.0 KiB


                                                                                

22/10/08 12:19:19 WARN DAGScheduler: Broadcasting large task binary with size 1365.2 KiB


                                                                                

22/10/08 12:19:36 WARN DAGScheduler: Broadcasting large task binary with size 1365.2 KiB


                                                                                

22/10/08 12:19:54 WARN DAGScheduler: Broadcasting large task binary with size 1428.4 KiB
22/10/08 12:19:56 WARN DAGScheduler: Broadcasting large task binary with size 1428.4 KiB
22/10/08 12:19:59 WARN DAGScheduler: Broadcasting large task binary with size 1428.4 KiB


[Row(Natural_var=0, Potential_Outlier=0, dayofmonth_encoded=SparseVector(30, {13: 1.0}), dayofweek_encoded=SparseVector(6, {5: 1.0}), month_encoded=SparseVector(11, {0: 1.0}), tags_encoded=SparseVector(24, {11: 1.0}), state_encoded=SparseVector(7, {2: 1.0}), gender_encoded=SparseVector(2, {1: 1.0}), Earnings_Class_encoded=SparseVector(4, {1: 1.0}), fraud_buckets=3.0, scaled=DenseVector([0.0006, -1.4757, 2.6213, 11.8446, 5.6498, 2.6769, 0.4524, 3.4492, 5.3173, 0.0, 0.4319, 5.017, 5.9294, 44.0159, 0.3734, -0.0793, -0.2302, 3.3795, 5.4852, 1.2359, 2.6072, 37.2041, 0.4346, 1.7786, 2.4723, 31.132, 0.4411, 1.2678, 1.3093, 0.0, 0.446, 2.0525, 3.3059, 0.4536, 2.5189, 3.4209]))]

In [4]:
# Reload the processed splits from disk.
# NOTE(review): no cell in the part of the notebook shown here writes
# ../models/*_processed, and the execution counter restarts at this point —
# a fresh "Restart & Run All" may fail unless these directories already exist.
# Confirm where they are produced.
train_processed = sp.read.option("inferSchema", True).parquet("../models/train_processed")
val_processed = sp.read.option("inferSchema", True).parquet("../models/val_processed")
test_processed = sp.read.option("inferSchema", True).parquet("../models/test_processed")

                                                                                

In [5]:
def vectorize(data: DataFrame, outcome: str):
    """
    Assemble every non-label column of `data` into one `features` vector.

    The `outcome` column is renamed to `label`. All remaining columns are then
    combined, in their existing order, into a new `features` column; the input
    columns are kept alongside it.
    """
    labelled = data.withColumnRenamed(outcome, "label")
    feature_cols = list(labelled.drop("label").columns)
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    return assembler.transform(labelled)

In [6]:
# Build the `features` / `label` columns for each split, using the bucketed
# fraud probability as the label.
train_vector = vectorize(train_processed, "fraud_buckets")
val_vector = vectorize(val_processed, "fraud_buckets")
test_vector = vectorize(test_processed, "fraud_buckets")

# Only the last head(1) is rendered as the cell result.
train_vector.head(1)
val_vector.head(1)
test_vector.head(1)

                                                                                

[Row(Natural_var=0, Potential_Outlier=0, dayofmonth_encoded=SparseVector(30, {6: 1.0}), dayofweek_encoded=SparseVector(6, {5: 1.0}), month_encoded=SparseVector(11, {5: 1.0}), tags_encoded=SparseVector(24, {17: 1.0}), state_encoded=SparseVector(7, {0: 1.0}), gender_encoded=SparseVector(2, {1: 1.0}), Earnings_Class_encoded=SparseVector(4, {2: 1.0}), label=1.0, scaled=DenseVector([0.0001, 0.01, 1.1552, 11.8533, 5.5901, 0.3913, 1.0881, 4.0182, 6.3873, 0.0, 1.0838, 6.1235, 7.2708, 44.0159, 1.1971, 0.061, -0.0291, 3.9179, 6.5207, 1.2957, 3.8719, 37.2041, 0.9587, 1.9516, 1.9602, 31.132, 0.9613, 1.4767, 0.9168, 0.0, 1.0796, 2.6103, 4.6455, 1.1213, 2.7123, 4.0691]), features=SparseVector(122, {8: 1.0, 37: 1.0, 43: 1.0, 66: 1.0, 73: 1.0, 81: 1.0, 84: 1.0, 86: 0.0001, 87: 0.01, 88: 1.1552, 89: 11.8533, 90: 5.5901, 91: 0.3913, 92: 1.0881, 93: 4.0182, 94: 6.3873, 96: 1.0838, 97: 6.1235, 98: 7.2708, 99: 44.0159, 100: 1.1971, 101: 0.061, 102: -0.0291, 103: 3.9179, 104: 6.5207, 105: 1.2957, 106: 3.871

In [7]:
# Persist only the model inputs (features + label) for each split; any existing
# output at the target path is overwritten.
train_vector.select("features", "label").write.parquet("../models/train_vector", mode="overwrite")
val_vector.select("features", "label").write.parquet("../models/val_vector", mode="overwrite")
test_vector.select("features", "label").write.parquet("../models/test_vector", mode="overwrite")

                                                                                

### MODEL

In [4]:
# Reload the vectorised splits from parquet, so the modelling section can start
# from these files rather than from the in-memory frames.
train_vector = sp.read.option("inferSchema", True).parquet("../models/train_vector/")
val_vector = sp.read.option("inferSchema", True).parquet("../models/val_vector/")
test_vector = sp.read.option("inferSchema", True).parquet("../models/test_vector/")

                                                                                

In [32]:
# Width of the assembled feature vector (read off the SparseVector(122, ...)
# shown when inspecting the vectorised rows).
inputCount = 122

# Network shape: input layer, two hidden layers (256 and 64 units), and ten
# output units — one per fraud bucket 0..9.
layers = [inputCount, 256, 64, 10]

model = MultilayerPerceptronClassifier(
    featuresCol='features',
    labelCol='label',
    layers=layers,
    solver='gd',
    maxIter=200,
    blockSize=64,
    seed=69,
)

In [33]:
# Train on the feature/label columns only, discarding rows that contain nulls.
model_fit = model.fit(train_vector.select("features", "label").dropna())

                                                                                

In [34]:
# train_output = model_fit.transform(train_vector)
# Predict on the held-out splits; rows containing nulls are dropped first.
val_output = model_fit.transform(val_vector.dropna())
test_output = model_fit.transform(test_vector.dropna())

In [35]:
# Score the validation predictions.
# Other metricName values that can be added here include 'weightedPrecision'
# and 'weightedRecall'.
metrics = ["accuracy"]
for metric in metrics:
    evaluator = MulticlassClassificationEvaluator(metricName=metric)
    score = evaluator.evaluate(val_output.select("prediction", "label"))
    # These are validation-set figures, so label them as such (the message
    # previously said "Train").
    print('Validation ' + metric + ' = ' + str(score))



Train accuracy = 0.4044281618349793


                                                                                

In [51]:
# Score the test predictions.
# Other metricName values that can be added here include 'weightedPrecision'
# and 'weightedRecall'.
metrics = ["accuracy", "weightedFalsePositiveRate"]
for metric in metrics:
    evaluator = MulticlassClassificationEvaluator(metricName=metric)
    score = evaluator.evaluate(test_output.select("prediction", "label"))
    # These are test-set figures, so label them as such (the message
    # previously said "Train").
    print('Test ' + metric + ' = ' + str(score))

Train accuracy = 0.3971909532169782
Train weightedFalsePositiveRate = 0.3971909532169782


                                                                                

In [42]:
# Show the true label, the raw network output and the predicted bucket for the
# first rows of the validation predictions.
val_output.select("label", "rawPrediction", "prediction").show()

+-----+--------------------+----------+
|label|       rawPrediction|prediction|
+-----+--------------------+----------+
|  1.0|[1.21844183743563...|       1.0|
|  1.0|[1.27374537812617...|       1.0|
|  1.0|[1.25855328047493...|       1.0|
|  1.0|[1.26524142233720...|       1.0|
|  1.0|[1.25961303094817...|       1.0|
|  1.0|[1.25153014281734...|       1.0|
|  1.0|[1.26243904864357...|       1.0|
|  1.0|[1.26507362918823...|       1.0|
|  1.0|[1.26690124398601...|       1.0|
|  1.0|[1.26907109442438...|       1.0|
|  1.0|[1.26396004739482...|       1.0|
|  1.0|[1.24820426333042...|       1.0|
|  1.0|[1.27094470413560...|       1.0|
|  1.0|[1.26930180670012...|       1.0|
|  1.0|[1.26141111182877...|       1.0|
|  1.0|[1.25830855450558...|       1.0|
|  1.0|[1.24511597187381...|       1.0|
|  1.0|[1.26306202361971...|       1.0|
|  1.0|[1.27913547137144...|       1.0|
|  1.0|[1.26010923848763...|       1.0|
+-----+--------------------+----------+
only showing top 20 rows



In [None]:
from pyspark.sql.types import IntegerType

@udf(returnType=IntegerType())
def upper_prediction(pred):
    """
    Shift a predicted bucket up by one, leaving buckets 8 and above unchanged.

    The value is converted to a Python int before it is returned: the UDF is
    declared as IntegerType, and Spark yields null when a UDF returns a value
    whose Python type does not match the declared return type (e.g. the float
    produced by adding 1 to a double-typed prediction).

    A null input yields null instead of raising a TypeError on comparison.
    """
    if pred is None:
        return None
    bucket = int(pred)
    if bucket < 8:
        return bucket + 1
    return bucket

In [52]:
# Per-row signed error (label - prediction) on the validation predictions.
# NOTE(review): despite the "MSE" column/variable name, the value is neither
# squared nor averaged; a negative value means the predicted bucket is higher
# than the true one.
mean_square_error_val_score = val_output.select("label", "prediction").withColumn("MSE" , (col("label") - col("prediction")))
mean_square_error_val_score.show(5)

+-----+----------+---+
|label|prediction|MSE|
+-----+----------+---+
|  1.0|       1.0|0.0|
|  1.0|       1.0|0.0|
|  1.0|       1.0|0.0|
|  1.0|       1.0|0.0|
|  1.0|       1.0|0.0|
+-----+----------+---+
only showing top 5 rows



In [53]:
# Number of validation rows for each distinct value of the "MSE" column.
mean_square_error_val_score.groupBy("MSE").count().show()



+----+-----+
| MSE|count|
+----+-----+
| 0.0| 7617|
|-1.0| 4518|
| 1.0| 1176|
| 3.0|  793|
| 2.0|  791|
| 4.0|  851|
| 5.0| 1104|
| 7.0|  705|
| 6.0|  742|
| 8.0|  537|
+----+-----+



                                                                                

In [48]:
# Per-row absolute error on the test predictions, computed as
# sqrt((label - prediction)^2).
# NOTE(review): the column is named "MSE" but is not averaged, and unlike the
# validation frame (which keeps the sign of the error) this one is unsigned.
mean_square_error_test_score = test_output.select("label", "prediction").withColumn("MSE" , ((col("label") - col("prediction")) ** 2) ** 0.5)
mean_square_error_test_score.show(5)

+-----+----------+---+
|label|prediction|MSE|
+-----+----------+---+
|  1.0|       1.0|0.0|
|  1.0|       1.0|0.0|
|  1.0|       1.0|0.0|
|  1.0|       1.0|0.0|
|  1.0|       1.0|0.0|
+-----+----------+---+
only showing top 5 rows



In [49]:
# Number of test rows for each distinct value of the "MSE" column.
mean_square_error_test_score.groupBy("MSE").count().show()

+---+-----+
|MSE|count|
+---+-----+
|0.0| 3846|
|1.0| 3013|
|4.0|  418|
|3.0|  448|
|2.0|  369|
|5.0|  563|
|7.0|  389|
|6.0|  391|
|8.0|  246|
+---+-----+



                                                                                