In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import re
import datetime

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import DateType

In [2]:
sp = (
    SparkSession.builder.appName("Model")
    .config("spark.sql.session.timeZone", "+11")
    .getOrCreate()
)

22/10/05 18:48:03 WARN Utils: Your hostname, J-L resolves to a loopback address: 127.0.1.1; using 172.18.71.108 instead (on interface eth0)
22/10/05 18:48:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/05 18:48:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
transactions = sp.read.option("inferSchema", True).parquet("../data/processed/transactions")
merchants = sp.read.option("inferSchema", True).parquet("../data/processed/merchants")
customers = sp.read.option("inferSchema", True).parquet("../data/processed/customers")

                                                                                

In [9]:
transactions.show(1)
merchants.show(1)
customers.show(1)

+--------+-------+------------+------------+--------------+-----------+-----------------+-------+----------+-----+---------+
|order_id|user_id|merchant_abn|dollar_value|order_datetime|Natural_var|Potential_Outlier|holiday|dayofmonth|month|dayofweek|
+--------+-------+------------+------------+--------------+-----------+-----------------+-------+----------+-----+---------+
|       3|      3| 60956456424|      136.68|    2021-08-20|          0|                0|      0|        20|    8|        6|
+--------+-------+------------+------------+--------------+-----------+-----------------+-------+----------+-----+---------+
only showing top 1 row

+------------+-----------------+--------------------+---------------+---------------+----------------+-----------------+
|merchant_abn|             name|                tags|avg_monthly_inc|monthly_entropy|postcode_entropy|          revenue|
+------------+-----------------+--------------------+---------------+---------------+----------------+-------

In [10]:
final = transactions.join(merchants, on="merchant_abn").join(customers, on="user_id")
final.show(2)

+-------+------------+--------+------------+--------------+-----------+-----------------+-------+----------+-----+---------+--------------------+--------------------+---------------+---------------+----------------+------------------+-----+--------+------+--------------------------------------------------+------------------------------+-----------------------------+-------------------------------+---------------------+-----------------------+----------------------+------------------------+--------------+----------------+---------------+----------------------------+---------------------------+------------------------+-----------------------+-------------------------------------+---------------------------+-----------------------------+----------------------------+---------------------------------------+-----------------------------+-------------------------------+------------------------------+-----------------------+-------------+---------------+--------------+-------------------------

In [11]:
final.count()

13614648

### Dropping Columns

In [12]:
final.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- order_id: long (nullable = true)
 |-- dollar_value: float (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- Natural_var: integer (nullable = true)
 |-- Potential_Outlier: integer (nullable = true)
 |-- holiday: long (nullable = true)
 |-- dayofmonth: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- dayofweek: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- avg_monthly_inc: float (nullable = true)
 |-- monthly_entropy: float (nullable = true)
 |-- postcode_entropy: float (nullable = true)
 |-- revenue: double (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- Number of individuals lodging an income tax return: long (nullable = true)
 |-- Average taxable income or loss: long (nullable = true)
 |-- Median taxable income or loss

In [13]:
final = final.drop("user_id", "merchant_abn", "order_id", "order_datetime", "name")
final.printSchema()

root
 |-- dollar_value: float (nullable = true)
 |-- Natural_var: integer (nullable = true)
 |-- Potential_Outlier: integer (nullable = true)
 |-- holiday: long (nullable = true)
 |-- dayofmonth: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- dayofweek: integer (nullable = true)
 |-- tags: string (nullable = true)
 |-- avg_monthly_inc: float (nullable = true)
 |-- monthly_entropy: float (nullable = true)
 |-- postcode_entropy: float (nullable = true)
 |-- revenue: double (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- Number of individuals lodging an income tax return: long (nullable = true)
 |-- Average taxable income or loss: long (nullable = true)
 |-- Median taxable income or loss: long (nullable = true)
 |-- Proportion with salary or wages: long (nullable = true)
 |-- Count salary or wages: long (nullable = true)
 |-- Average salary or wages: long (nullable = true)
 |-- Me

In [14]:
final.show(1)

                                                                                

+------------+-----------+-----------------+-------+----------+-----+---------+--------------------+---------------+---------------+----------------+-----------------+-----+--------+------+--------------------------------------------------+------------------------------+-----------------------------+-------------------------------+---------------------+-----------------------+----------------------+------------------------+--------------+----------------+---------------+----------------------------+---------------------------+------------------------+-----------------------+-------------------------------------+---------------------------+-----------------------------+----------------------------+---------------------------------------+-----------------------------+-------------------------------+------------------------------+-----------------------+-------------+---------------+--------------+----------------------------------+------------------------------------+--------------------

### PROCESSING CUSTOMER FRAUD DATA

In [16]:
c_fraud = sp.read.option("inferSchema", True).parquet("../data/curated/customer_fraud")
c_fraud = c_fraud.withColumn("order_datetime", col("order_datetime").cast(DateType()))
c_fraud.show(2)

+-------+--------------+-----------------+
|user_id|order_datetime|fraud_probability|
+-------+--------------+-----------------+
|   6228|    2021-12-19|         97.62981|
|  21419|    2021-12-10|         99.24738|
+-------+--------------+-----------------+
only showing top 2 rows



In [17]:
transactions.show(2)

+--------+-------+------------+------------+--------------+-----------+-----------------+-------+----------+-----+---------+
|order_id|user_id|merchant_abn|dollar_value|order_datetime|Natural_var|Potential_Outlier|holiday|dayofmonth|month|dayofweek|
+--------+-------+------------+------------+--------------+-----------+-----------------+-------+----------+-----+---------+
|       3|      3| 60956456424|      136.68|    2021-08-20|          0|                0|      0|        20|    8|        6|
|       8|  18482| 70501974849|       68.75|    2021-08-20|          0|                0|      0|        20|    8|        6|
+--------+-------+------------+------------+--------------+-----------+-----------------+-------+----------+-----+---------+
only showing top 2 rows



In [18]:
c_fraud_full = transactions.join(c_fraud, on=["user_id", "order_datetime"])
c_fraud_full.show(2)

+-------+--------------+--------+------------+------------+-----------+-----------------+-------+----------+-----+---------+-----------------+
|user_id|order_datetime|order_id|merchant_abn|dollar_value|Natural_var|Potential_Outlier|holiday|dayofmonth|month|dayofweek|fraud_probability|
+-------+--------------+--------+------------+------------+-----------+-----------------+-------+----------+-----+---------+-----------------+
|    448|    2021-08-20|    1005| 94380689142|     6263.03|          0|                0|      0|        20|    8|        6|        14.681704|
|   3116|    2021-08-20|    6989| 22248828825|     3958.86|          0|                0|      0|        20|    8|        6|         8.809071|
+-------+--------------+--------+------------+------------+-----------+-----------------+-------+----------+-----+---------+-----------------+
only showing top 2 rows



In [19]:
c_fraud_full.count()

                                                                                

80560

In [22]:
X = c_fraud_full.join(merchants, on="merchant_abn").join(customers, on="user_id")
X.show(1)

                                                                                

+-------+------------+--------------+--------+------------+-----------+-----------------+-------+----------+-----+---------+-----------------+-----------+--------------------+---------------+---------------+----------------+----------------+-----+--------+------+--------------------------------------------------+------------------------------+-----------------------------+-------------------------------+---------------------+-----------------------+----------------------+------------------------+--------------+----------------+---------------+----------------------------+---------------------------+------------------------+-----------------------+-------------------------------------+---------------------------+-----------------------------+----------------------------+---------------------------------------+-----------------------------+-------------------------------+------------------------------+-----------------------+-------------+---------------+--------------+------------------

In [26]:
X = X.drop("user_id", "merchant_abn", "order_datetime", "order_id", "name")
X.printSchema()

root
 |-- dollar_value: float (nullable = true)
 |-- Natural_var: integer (nullable = true)
 |-- Potential_Outlier: integer (nullable = true)
 |-- holiday: long (nullable = true)
 |-- dayofmonth: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- dayofweek: integer (nullable = true)
 |-- fraud_probability: float (nullable = true)
 |-- tags: string (nullable = true)
 |-- avg_monthly_inc: float (nullable = true)
 |-- monthly_entropy: float (nullable = true)
 |-- postcode_entropy: float (nullable = true)
 |-- revenue: double (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- Number of individuals lodging an income tax return: long (nullable = true)
 |-- Average taxable income or loss: long (nullable = true)
 |-- Median taxable income or loss: long (nullable = true)
 |-- Proportion with salary or wages: long (nullable = true)
 |-- Count salary or wages: long (nullable = true)
 |-- Average

In [25]:
X.show(1)

+------------+-----------+-----------------+-------+----------+-----+---------+-----------------+--------------------+---------------+---------------+----------------+----------------+-----+--------+------+--------------------------------------------------+------------------------------+-----------------------------+-------------------------------+---------------------+-----------------------+----------------------+------------------------+--------------+----------------+---------------+----------------------------+---------------------------+------------------------+-----------------------+-------------------------------------+---------------------------+-----------------------------+----------------------------+---------------------------------------+-----------------------------+-------------------------------+------------------------------+-----------------------+-------------+---------------+--------------+----------------------------------+------------------------------------+---

Categorical
- holiday (done)
- dayofmonth ?
- dayofweek
- month (done)
- tags
- state
- gender
- postcode

In [27]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

from pyspark.ml import Pipeline
from pyspark.sql.types import FloatType
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.regression import GeneralizedLinearRegression, GBTRegressor
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator

def category_processing(data: DataFrame, outcome: str):
    categories = [
        "dayofmonth"
        "dayofweek",
        "month",
        "tags",
        "state",
        "gender",
        "postcode"
    ]

    # Pipeline
    indexers = [StringIndexer(inputCol=c, outputCol=c+"_index") for c in categories]
    encoders = [OneHotEncoder(inputCol=c+"_index", outputCol=c+"_encoded") for c in categories]
    transformed = Pipeline(stages=indexers + encoders).fit(data).transform(data)

    for c in categories:
        transformed = transformed.drop(c).drop(c+"_index")
    return transformed

In [29]:
category_processing(X, "outcome").show(2)

Py4JJavaError: An error occurred while calling o116.fit.
: org.apache.spark.SparkException: Input column dayofmonthdayofweek does not exist.
	at org.apache.spark.ml.feature.StringIndexerBase.$anonfun$validateAndTransformSchema$2(StringIndexer.scala:128)
	at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:293)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at scala.collection.TraversableLike.flatMap(TraversableLike.scala:293)
	at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:290)
	at scala.collection.mutable.ArrayOps$ofRef.flatMap(ArrayOps.scala:198)
	at org.apache.spark.ml.feature.StringIndexerBase.validateAndTransformSchema(StringIndexer.scala:123)
	at org.apache.spark.ml.feature.StringIndexerBase.validateAndTransformSchema$(StringIndexer.scala:115)
	at org.apache.spark.ml.feature.StringIndexer.validateAndTransformSchema(StringIndexer.scala:145)
	at org.apache.spark.ml.feature.StringIndexer.transformSchema(StringIndexer.scala:252)
	at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:71)
	at org.apache.spark.ml.feature.StringIndexer.fit(StringIndexer.scala:237)
	at org.apache.spark.ml.feature.StringIndexer.fit(StringIndexer.scala:145)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
