# --- TO DELETE ---

In [1]:
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col

In [2]:
# Get (or create) the Spark session, registered under the app name "Legit".
# It is passed to read_tables() for every table load in this notebook.
sp = SparkSession.builder.appName("Legit").getOrCreate()
# Bare expression: display the session object as the cell output.
sp

22/10/04 22:55:29 WARN Utils: Your hostname, J-L resolves to a loopback address: 127.0.1.1; using 172.18.71.108 instead (on interface eth0)
22/10/04 22:55:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/04 22:55:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/04 22:55:32 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
from functools import reduce
from pyspark.sql import DataFrame

def read_tables(sp: "SparkSession", file: str, ftype = "p", sample=False):
    """
    Helper function to read data from the designated folder

    sp : Current SparkSession
    file : Type of data/Name of file name to be read
            ("transactions" and "tbl_consumer" are special cases for
            which ftype is ignored)
    ftype : File type: Parquet ("p" or "parquet") or CSV ("c" or "csv")
    sample : If True, return a random sample (fraction 0.01) instead of
            the full data. Only applied when file is "transactions".

    returns DataFrame read
    raises ValueError if ftype is not a recognised file type
    """
    # Root directory (relative to the notebook's working directory)
    root = "../data/tables/"

    # Transaction folders: the data is split over three snapshot folders
    # that are read separately and stacked into a single DataFrame.
    if file == "transactions":
        groups = [
            "transactions_20210228_20210827_snapshot/",
            "transactions_20210828_20220227_snapshot/",
            "transactions_20220228_20220828_snapshot/"
            ]

        # Read the different transaction folders
        frames = [
            sp.read.option("inferSchema", True).parquet(root + g)
            for g in groups
        ]

        # union() stacks rows by column position, so the snapshots are
        # expected to share one schema.
        combined = reduce(lambda left, right: left.union(right), frames)
        return combined.sample(0.01) if sample else combined

    # Special file: pipe-delimited CSV with a header row
    if file == "tbl_consumer":
        return (
            sp.read
            .option("inferSchema", True)
            .option("header", True)
            .option("delimiter", "|")
            .csv(root + "tbl_consumer.csv")
        )

    # Parquet files
    if ftype in ("p", "parquet"):
        return sp.read.option("inferSchema", True).parquet(root + file + ".parquet")

    # CSV files (comma-delimited, with a header row)
    if ftype in ("c", "csv"):
        return sp.read.option("inferSchema", True).option("header", True).csv(root + file + ".csv")

    # Fail loudly instead of silently returning None for a mistyped ftype.
    raise ValueError(
        f"Unknown ftype {ftype!r}: expected 'p'/'parquet' or 'c'/'csv'"
    )

In [4]:
# Load the three transaction snapshot folders as one DataFrame and preview two rows.
trans = read_tables(sp, "transactions")
trans.show(2)

                                                                                

+-------+------------+------------------+--------------------+--------------+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|
+-------+------------+------------------+--------------------+--------------+
|  18478| 62191208634|63.255848959735246|949a63c8-29f7-4ab...|    2021-08-20|
|      2| 15549624934| 130.3505283105634|6a84c3cf-612a-457...|    2021-08-20|
+-------+------------+------------------+--------------------+--------------+
only showing top 2 rows



In [5]:
# Total number of transaction rows across all snapshots.
trans.count()

                                                                                

14195505

In [6]:
# Number of distinct user_id values appearing in the transactions.
trans.select("user_id").distinct().count()

                                                                                

24081

There are 24,081 unique users (about 24k) in the transaction data.

In [7]:
# The consumer fraud probabilities are read as CSV, hence ftype "c".
c_fraud = read_tables(sp, "consumer_fraud_probability", "c")
c_fraud.show(2)

+-------+-------------------+-----------------+
|user_id|     order_datetime|fraud_probability|
+-------+-------------------+-----------------+
|   6228|2021-12-19 00:00:00| 97.6298077657765|
|  21419|2021-12-10 00:00:00|99.24738020302328|
+-------+-------------------+-----------------+
only showing top 2 rows



In [8]:
# Parquet lookup table mapping user_id to consumer_id.
lookup = read_tables(sp, "consumer_user_details", "p")
lookup.show(2)

+-------+-----------+
|user_id|consumer_id|
+-------+-----------+
|      1|    1195503|
|      2|     179208|
+-------+-----------+
only showing top 2 rows



In [9]:
# Preview fraud rows with their consumer_id attached via the lookup (join on user_id).
c_fraud.join(lookup, on="user_id").show(2)

+-------+-------------------+-----------------+-----------+
|user_id|     order_datetime|fraud_probability|consumer_id|
+-------+-------------------+-----------------+-----------+
|      1|2022-02-20 00:00:00|9.805431136520959|    1195503|
|      2|2021-08-30 00:00:00|9.599513915425788|     179208|
+-------+-------------------+-----------------+-----------+
only showing top 2 rows



In [17]:
# Baseline row count of the fraud table, for comparison with the joined count.
c_fraud.count()

34864

In [18]:
# Join against the distinct transaction users: because the right side has one row per
# user_id, a count equal to c_fraud.count() means every fraud row's user also appears
# in the transactions.
c_fraud.join(trans.select("user_id").distinct(), on="user_id").count()

                                                                                

34864

In [20]:
# "tbl_consumer" is handled by a dedicated branch in read_tables (pipe-delimited CSV).
# Pass the documented CSV code "c" rather than "csv", which is not a value the
# helper's docstring lists and only worked because that branch ignores ftype.
consumer_details = read_tables(sp, "tbl_consumer", "c")
consumer_details.show(2)

+----------------+--------------------+-----+--------+------+-----------+
|            name|             address|state|postcode|gender|consumer_id|
+----------------+--------------------+-----+--------+------+-----------+
|Yolanda Williams|413 Haney Gardens...|   WA|    6935|Female|    1195503|
|      Mary Smith|     3764 Amber Oval|  NSW|    2782|Female|     179208|
+----------------+--------------------+-----+--------+------+-----------+
only showing top 2 rows



In [21]:
# Number of distinct consumer_id values in the consumer table.
consumer_details.select("consumer_id").distinct().count()

499999

In [24]:
# Number of distinct consumer_id values in the lookup, to compare with the consumer table.
lookup.select("consumer_id").distinct().count()

499999

In [25]:
# Attach user_id to each consumer via the lookup so the result can be joined to the
# transactions, which are keyed by user_id.
# NOTE(review): this rebinds consumer_details, so the cell is not idempotent --
# re-running it joins the already-joined frame with lookup again. Consider binding the
# result to a new name and updating the later cells that use it -- TODO confirm.
consumer_details = consumer_details.join(lookup, on="consumer_id")
consumer_details.show(2)

+-----------+----------------+--------------------+-----+--------+------+-------+
|consumer_id|            name|             address|state|postcode|gender|user_id|
+-----------+----------------+--------------------+-----+--------+------+-------+
|    1195503|Yolanda Williams|413 Haney Gardens...|   WA|    6935|Female|      1|
|     179208|      Mary Smith|     3764 Amber Oval|  NSW|    2782|Female|      2|
+-----------+----------------+--------------------+-----+--------+------+-------+
only showing top 2 rows



In [27]:
# Keep only user_id and dollar_value from the transactions, then attach the consumer
# attributes by joining on user_id.
joined_trans = trans.select("user_id", "dollar_value").join(consumer_details, on="user_id")
joined_trans.show(2)



+-------+-----------------+-----------+------------+-----------------+-----+--------+------+
|user_id|     dollar_value|consumer_id|        name|          address|state|postcode|gender|
+-------+-----------------+-----------+------------+-----------------+-----+--------+------+
|      7|373.0873675184212|     511685|Andrea Jones|122 Brandon Cliff|  QLD|    4606|Female|
|      7|232.5364986739752|     511685|Andrea Jones|122 Brandon Cliff|  QLD|    4606|Female|
+-------+-----------------+-----------+------------+-----------------+-----+--------+------+
only showing top 2 rows



                                                                                

In [30]:
# Distinct users remaining after dropping rows that contain any null value.
joined_trans.dropna().select("user_id").distinct().count()

                                                                                

24081

In [29]:
# Row count after the join, to compare with trans.count().
joined_trans.count()

                                                                                

14195505