In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import os
import re
import glob
import math
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType
from datetime import datetime, timedelta
from collections import defaultdict
from multiprocessing import Manager

# Create (or reuse) the notebook-wide Spark session, running in local mode
# with Kryo serialization and explicit driver/executor memory settings.
spark = (
    SparkSession.builder
    .appName('exploration')
    .master('local[*]')
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.driver.memory", "7g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)
# spark.conf.set("spark.sql.session.timeZone", "America/New_York")


24/08/28 19:29:00 WARN Utils: Your hostname, Alans-MacBook-Air-4.local resolves to a loopback address: 127.0.0.1; using 192.168.0.52 instead (on interface en0)
24/08/28 19:29:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/28 19:29:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load the three transaction snapshots; each directory covers a
# consecutive date range (see the dates embedded in the directory names).
transactions1 = spark.read.parquet("data/transactions/transactions_20210228_20210827_snapshot")
transactions2 = spark.read.parquet("data/transactions/transactions_20210828_20220227_snapshot")
transactions3 = spark.read.parquet("data/transactions/transactions_20220228_20220828_snapshot")

# Stack the snapshots into one DataFrame. unionByName aligns columns by name,
# whereas union aligns them by position and would silently mix up columns if
# any snapshot were written with a different column order.
transactions = transactions1.unionByName(transactions2).unionByName(transactions3)
transactions.show()

                                                                                

+-------+------------+------------------+--------------------+--------------+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|
+-------+------------+------------------+--------------------+--------------+
|  18478| 62191208634|63.255848959735246|949a63c8-29f7-4ab...|    2021-08-20|
|      2| 15549624934| 130.3505283105634|6a84c3cf-612a-457...|    2021-08-20|
|  18479| 64403598239|120.15860593212783|b10dcc33-e53f-425...|    2021-08-20|
|      3| 60956456424| 136.6785200286976|0f09c5a5-784e-447...|    2021-08-20|
|  18479| 94493496784| 72.96316578355305|f6c78c1a-4600-4c5...|    2021-08-20|
|      3| 76819856970|  448.529684285612|5ace6a24-cdf0-4aa...|    2021-08-20|
|  18479| 67609108741|  86.4040605836911|d0e180f0-cb06-42a...|    2021-08-20|
|      3| 34096466752| 301.5793450525113|6fb1ff48-24bb-4f9...|    2021-08-20|
|  18482| 70501974849| 68.75486276223054|8505fb33-b69a-412...|    2021-08-20|
|      4| 49891706470| 48.89796461900801|ed11e477-b09f-4ae...|  

In [3]:
# Count transactions per order date, sorted by date, and hand the small
# aggregated result to the pandas-on-Spark API for plotting.
# pandas_api() is the supported replacement for the deprecated
# to_pandas_on_spark().
transactions_df = (
    transactions
    .groupby("order_datetime")
    .count()
    .sort("order_datetime")
    .pandas_api()
)
# Line chart of daily transaction volume (x = date, y = number of rows).
transactions_df.plot.line(x="order_datetime", y="count")

                                                                                

In [5]:
# Load the per-consumer fraud probabilities.
# Without a schema every CSV column is read as a string, which makes
# comparisons and sorts on fraud_probability lexicographic. An explicit
# DDL schema gives typed columns without the extra pass that inferSchema needs.
consumer_fraud = (
    spark.read
    .option("header", True)
    .schema("user_id LONG, order_datetime DATE, fraud_probability DOUBLE")
    .csv("data/tables/consumer_fraud_probability.csv")
)
consumer_fraud.show()

+-------+--------------+-----------------+
|user_id|order_datetime|fraud_probability|
+-------+--------------+-----------------+
|   6228|    2021-12-19| 97.6298077657765|
|  21419|    2021-12-10|99.24738020302328|
|   5606|    2021-10-17|84.05825045251777|
|   3101|    2021-04-17|91.42192091901347|
|  22239|    2021-10-19|94.70342477508035|
|  16556|    2022-02-20|89.65663294494827|
|  10278|    2021-09-28|83.59136689427714|
|  15790|    2021-12-30|71.77065889280253|
|   5233|    2021-08-29|85.87123303878818|
|    230|    2021-08-28|86.28328808934151|
|  13601|    2021-12-26|83.13696487489679|
|   6383|    2021-09-15| 66.2676451623754|
|   3513|    2022-02-27|75.16981192247916|
|  18658|    2021-10-19|82.98609082999361|
|   5965|    2021-11-14|69.37164467869053|
|  18714|    2021-11-14|83.78813794627237|
|  22957|    2022-02-12|82.79065699075498|
|  20118|    2021-09-05|80.34030486265003|
|   6436|    2021-12-24|84.81618344606828|
|  17900|    2022-02-25|92.73262811161372|
+-------+--

In [6]:
# Load the table that pairs each user_id with a consumer_id and preview it.
consumer_details = spark.read.format("parquet").load(
    "data/tables/consumer_user_details.parquet"
)
consumer_details.show(n=20)

+-------+-----------+
|user_id|consumer_id|
+-------+-----------+
|      1|    1195503|
|      2|     179208|
|      3|    1194530|
|      4|     154128|
|      5|     712975|
|      6|     407340|
|      7|     511685|
|      8|     448088|
|      9|     650435|
|     10|    1058499|
|     11|     428325|
|     12|    1494640|
|     13|    1146717|
|     14|    1343547|
|     15|    1463076|
|     16|    1356405|
|     17|    1331093|
|     18|      80965|
|     19|    1226530|
|     20|    1390367|
+-------+-----------+
only showing top 20 rows



In [7]:
# Load the per-merchant fraud probabilities.
# Without a schema every CSV column is read as a string, which makes
# comparisons and sorts on fraud_probability lexicographic. merchant_abn is
# an 11-digit number, so it needs LONG rather than INT.
merchant_fraud = (
    spark.read
    .option("header", True)
    .schema("merchant_abn LONG, order_datetime DATE, fraud_probability DOUBLE")
    .csv("data/tables/merchant_fraud_probability.csv")
)
merchant_fraud.show()

+------------+--------------+------------------+
|merchant_abn|order_datetime| fraud_probability|
+------------+--------------+------------------+
| 19492220327|    2021-11-28|44.403658647495355|
| 31334588839|    2021-10-02| 42.75530083865367|
| 19492220327|    2021-12-22|38.867790051131095|
| 82999039227|    2021-12-19|  94.1347004808891|
| 90918180829|    2021-09-02| 43.32551731714902|
| 31334588839|    2021-12-26| 38.36165958070444|
| 23686790459|    2021-12-10|  79.4543441508535|
| 14827550074|    2021-11-26| 46.45775596795885|
| 31334588839|    2021-11-26| 36.20971272078342|
| 19492220327|    2021-12-18|33.819672154331755|
| 31334588839|    2021-11-29|35.386213297375505|
| 14827550074|    2021-12-05| 43.85519494291279|
| 19492220327|    2021-11-18|32.193139919494016|
| 93260930990|    2021-11-30| 37.87197154172081|
| 90918180829|    2021-09-16| 36.62001350882694|
| 83199298021|    2022-02-27|26.025158824861773|
| 83199298021|    2022-02-17| 25.77998392496447|
| 94311056026|    20

In [10]:
# The consumer table is pipe-delimited text with a header row.
tbl_consumer = (
    spark.read
    .options(**{"header": True, "delimiter": "|"})
    .csv("data/tables/tbl_consumer.csv")
)
# Preview the rows without truncating long cell values.
tbl_consumer.show(truncate=False)

+-----------------+--------------------+-----+--------+-----------+-----------+
|             name|             address|state|postcode|     gender|consumer_id|
+-----------------+--------------------+-----+--------+-----------+-----------+
| Yolanda Williams|413 Haney Gardens...|   WA|    6935|     Female|    1195503|
|       Mary Smith|     3764 Amber Oval|  NSW|    2782|     Female|     179208|
|    Jill Jones MD|  40693 Henry Greens|   NT|     862|     Female|    1194530|
|  Lindsay Jimenez|00653 Davenport C...|  NSW|    2780|     Female|     154128|
|Rebecca Blanchard|9271 Michael Mano...|   WA|    6355|     Female|     712975|
|    Karen Chapman|2706 Stewart Oval...|  NSW|    2033|     Female|     407340|
|     Andrea Jones|   122 Brandon Cliff|  QLD|    4606|     Female|     511685|
| Stephen Williams|6804 Wright Crest...|   WA|    6056|       Male|     448088|
|  Stephanie Reyes|5813 Denise Land ...|  NSW|    2482|     Female|     650435|
| Jillian Gonzales|461 Ryan Common S...|

In [11]:
# Load the merchant table (name, tags, merchant_abn) and preview it.
tbl_merchants = spark.read.load(
    "data/tables/tbl_merchants.parquet",
    format="parquet",
)
tbl_merchants.show(20)

+--------------------+--------------------+------------+
|                name|                tags|merchant_abn|
+--------------------+--------------------+------------+
|       Felis Limited|((furniture, home...| 10023283211|
|Arcu Ac Orci Corp...|([cable, satellit...| 10142254217|
|    Nunc Sed Company|([jewelry, watch,...| 10165489824|
|Ultricies Digniss...|([wAtch, clock, a...| 10187291046|
| Enim Condimentum PC|([music shops - m...| 10192359162|
|       Fusce Company|[(gift, card, nov...| 10206519221|
|Aliquam Enim Inco...|[(computers, comP...| 10255988167|
|    Ipsum Primis Ltd|[[watch, clock, a...| 10264435225|
|Pede Ultrices Ind...|([computer progra...| 10279061213|
|           Nunc Inc.|[(furniture, home...| 10323485998|
|Facilisis Facilis...|([computers, comp...| 10342410215|
|      Odio Institute|((equipment, tool...| 10346855916|
|    Rutrum Justo Ltd|([music shops - m...| 10364012396|
|   Tellus Foundation|[[artist supply a...| 10385011947|
|      Sed Et Company|([florist