In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
import os
import re
import glob
import math
import matplotlib.pyplot as plt
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType, DateType, DoubleType
from datetime import datetime, timedelta

In [3]:
# Session-level Spark settings used for this preprocessing run.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    'spark.driver.memory': '4g',
    'spark.executor.memory': '2g',
}

# Apply every option to the builder, then create (or reuse) the session.
builder = SparkSession.builder.appName('Merchant Data Preprocess')
for option_key, option_value in spark_options.items():
    builder = builder.config(option_key, option_value)
spark = builder.getOrCreate()

24/10/16 16:30:36 WARN Utils: Your hostname, Alans-MacBook-Air-4.local resolves to a loopback address: 127.0.0.1; using 192.168.0.52 instead (on interface en0)
24/10/16 16:30:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/16 16:30:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/16 16:30:39 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
# Read the raw merchant table from the landing area and preview it without
# truncating the long `tags` strings.
tbl_merchants = (
    spark.read
    .format("parquet")
    .load(".././data/landing/tables/tbl_merchants.parquet")
)
tbl_merchants.show(truncate=False)

                                                                                

+------------------------------------+-----------------------------------------------------------------------------------------------------------------+------------+
|name                                |tags                                                                                                             |merchant_abn|
+------------------------------------+-----------------------------------------------------------------------------------------------------------------+------------+
|Felis Limited                       |((furniture, home furnishings and equipment shops, and manufacturers, except appliances), (e), (take rate: 0.18))|10023283211 |
|Arcu Ac Orci Corporation            |([cable, satellite, and otHer pay television and radio services], [b], [take rate: 4.22])                        |10142254217 |
|Nunc Sed Company                    |([jewelry, watch, clock, and silverware shops], [b], [take rate: 4.40])                                          |10165489824 |
|Ult

In [6]:
# Read the merchant fraud-probability CSV (first line is the header). No schema
# is supplied, so every column is loaded as a string.
merchant_fraud = spark.read.csv(
    ".././data/landing/tables/merchant_fraud_probability.csv",
    header=True,
)
merchant_fraud.show()

+------------+--------------+------------------+
|merchant_abn|order_datetime| fraud_probability|
+------------+--------------+------------------+
| 19492220327|    2021-11-28|44.403658647495355|
| 31334588839|    2021-10-02| 42.75530083865367|
| 19492220327|    2021-12-22|38.867790051131095|
| 82999039227|    2021-12-19|  94.1347004808891|
| 90918180829|    2021-09-02| 43.32551731714902|
| 31334588839|    2021-12-26| 38.36165958070444|
| 23686790459|    2021-12-10|  79.4543441508535|
| 14827550074|    2021-11-26| 46.45775596795885|
| 31334588839|    2021-11-26| 36.20971272078342|
| 19492220327|    2021-12-18|33.819672154331755|
| 31334588839|    2021-11-29|35.386213297375505|
| 14827550074|    2021-12-05| 43.85519494291279|
| 19492220327|    2021-11-18|32.193139919494016|
| 93260930990|    2021-11-30| 37.87197154172081|
| 90918180829|    2021-09-16| 36.62001350882694|
| 83199298021|    2022-02-27|26.025158824861773|
| 83199298021|    2022-02-17| 25.77998392496447|
| 94311056026|    20

In [7]:
# Inspect the column types of the merchant table as loaded from parquet.
tbl_merchants.printSchema()

root
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- merchant_abn: long (nullable = true)



In [8]:
# Store merchant_abn as a string so it can be compared with the string ABNs
# read from the fraud CSV.
tbl_merchants = tbl_merchants.withColumn(
    'merchant_abn',
    tbl_merchants['merchant_abn'].cast('string'),
)

In [9]:
# Distinct merchant ABNs that occur in the fraud-probability data, as a Python list.
# The distinct projection is what gets converted to pandas: previously the whole
# `merchant_fraud` frame was converted, discarding the distinct select and pulling
# every column and duplicate row to the driver.
fraud_abn_sdf = merchant_fraud.select(F.col('merchant_abn')).distinct()
abn = fraud_abn_sdf.toPandas()['merchant_abn'].tolist()

In [10]:
# Distinct merchant ABNs present in the merchant table, as a Python list.
merchants_abn = (
    tbl_merchants
    .select('merchant_abn')
    .distinct()
    .toPandas()['merchant_abn']
    .tolist()
)

In [11]:
# Number of rows in the merchant table.
tbl_merchants.count()

4026

In [12]:
# Inspect the column types of the fraud table (all strings, since the CSV was
# read without a schema).
merchant_fraud.printSchema()

root
 |-- merchant_abn: string (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- fraud_probability: string (nullable = true)



In [13]:
# How many ABNs appear in the fraud data but not in the merchant table.
print(len(set(abn) - set(merchants_abn)))

13


In [14]:
# List the ABNs that appear in the fraud data but have no row in the merchant table.
print(set(abn).difference(merchants_abn))

{'99989036621', '83220249221', '94311056026', '14827550074', '23686790459', '29674997261', '19010030815', '81146325646', '73052515151', '59258669983', '82999039227', '57564805948', '75892370170'}


In [15]:
# ABNs present in the fraud data but absent from the merchant table.
lst = list(set(abn) - set(merchants_abn))

In [16]:
# Drop fraud records whose merchant ABN is not in the merchant table.
merchant_fraud = merchant_fraud.where(~F.col('merchant_abn').isin(lst))

In [17]:
# Spot check: one of the unmatched ABNs should no longer appear in the fraud data.
merchant_fraud.where(merchant_fraud['merchant_abn'] == '82999039227').show()


+------------+--------------+-----------------+
|merchant_abn|order_datetime|fraud_probability|
+------------+--------------+-----------------+
+------------+--------------+-----------------+



In [18]:
# Spot check: the same ABN has no row in the merchant table either.
tbl_merchants.where(tbl_merchants['merchant_abn'] == '82999039227').show()

+----+----+------------+
|name|tags|merchant_abn|
+----+----+------------+
+----+----+------------+



In [19]:
# Pattern for the bracket-normalised, lower-cased tag string
#   [[<category text>], [<type>], [take rate: <rate>]]
# Capture groups: 1 = category text, 2 = type, 3 = take rate.
TAG_PATTERN = r'\[\[([^\]]*)\][^\[]*\[([^\]]*)\][^\[]*\[take rate: ([^\]]*)\]\]'

# Ordered (regex, replacement) clean-up rules applied to the category text.
# Order matters: later rules tidy up artefacts left behind by earlier ones
# (for example " ," and ",," after words have been removed).
TAG_CLEANUPS = [
    (r"\s+", " "),                  # collapse runs of whitespace
    (", except appliances", ""),
    ("rent al", "rental"),          # repair a split word seen in the raw data
    (":", ","),
    (" -", ","),
    (" shops", ""),
    (" services", ""),
    (" service", ""),
    (" and", ","),
    (", sales", ""),
    (" ,", ","),
    (",,", ","),
    (",]", "]"),
    (r"\[", ""),
    (r"\]", ""),
]

# Normalise both bracket styles to square brackets and lower-case the text so
# that a single pattern matches every row.
tbl_merchants = (
    tbl_merchants
    .withColumn('tags', F.regexp_replace('tags', r'\(', r'\['))
    .withColumn('tags', F.lower(F.regexp_replace('tags', r'\)', r'\]')))
)

# Build the cleaned category expression once by nesting the replacements in order.
category_text = F.regexp_extract('tags', TAG_PATTERN, 1)
for cleanup_pattern, cleanup_replacement in TAG_CLEANUPS:
    category_text = F.regexp_replace(category_text, cleanup_pattern, cleanup_replacement)

# Add the three parsed fields, then swap the raw `tags` column for the cleaned one.
# Resulting columns: merchant_name, merchant_abn, tags, type, take_rate.
tbl_merchants = (
    tbl_merchants
    .withColumn('tags1', category_text)
    .withColumn('type', F.regexp_extract('tags', TAG_PATTERN, 2))
    .withColumn('take_rate', F.regexp_extract('tags', TAG_PATTERN, 3).cast(DoubleType()))
    .withColumnRenamed("name", "merchant_name")
    .drop('tags')
    .withColumnRenamed('tags1', 'tags')
)

tbl_merchants.show(truncate=False)

+------------------------------------+------------+----------------------------------------------------------------+----+---------+
|merchant_name                       |merchant_abn|tags                                                            |type|take_rate|
+------------------------------------+------------+----------------------------------------------------------------+----+---------+
|Felis Limited                       |10023283211 |furniture, home furnishings, equipment, manufacturers           |e   |0.18     |
|Arcu Ac Orci Corporation            |10142254217 |cable, satellite, other pay television, radio                   |b   |4.22     |
|Nunc Sed Company                    |10165489824 |jewelry, watch, clock, silverware                               |b   |4.4      |
|Ultricies Dignissim Lacus Foundation|10187291046 |watch, clock, jewelry repair                                    |b   |3.29     |
|Enim Condimentum PC                 |10192359162 |music, musical instrument

Check the value range of the merchant fraud probabilities

In [20]:
# Count rows with a negative fraud probability. The column was read from CSV as
# a string, so cast it to double explicitly rather than relying on Spark's
# implicit coercion when a string column is compared with an integer literal.
merchant_fraud.filter(F.col('fraud_probability').cast('double') < 0).count()

0

In [21]:
# Count rows whose fraud probability is 100 or more. The column was read from
# CSV as a string, so cast it to double explicitly rather than relying on
# Spark's implicit coercion when a string column is compared with an integer literal.
merchant_fraud.filter(F.col('fraud_probability').cast('double') >= 100).count()

0

Check for null values

In [22]:
# Number of fraud records with a missing merchant ABN.
merchant_fraud.where(F.col('merchant_abn').isNull()).count()


0

In [23]:
# Number of fraud records with a missing fraud probability.
merchant_fraud.where(F.col('fraud_probability').isNull()).count()

0

In [24]:
# Number of fraud records with a missing order date.
merchant_fraud.where(F.col('order_datetime').isNull()).count()

0

In [25]:
# Number of merchants with a missing ABN.
tbl_merchants.where(F.col('merchant_abn').isNull()).count()

0

In [26]:
# Number of merchants whose cleaned `tags` value is missing. `tags` is derived
# with regexp_extract, which yields an empty string (not null) when the pattern
# does not match, so blank values are counted as missing alongside nulls.
tbl_merchants.filter(F.isnull('tags') | (F.trim(F.col('tags')) == '')).count()

0

In [27]:
# Number of merchants with a missing name. The column was renamed from `name`
# to `merchant_name` during tag parsing, so the check refers to the current
# column name instead of the one that is no longer in the frame's schema.
tbl_merchants.filter(F.isnull('merchant_name')).count()

0

In [28]:
# Convert order_datetime from the string read out of the CSV to a proper date.
merchant_fraud = merchant_fraud.withColumn(
    'order_datetime',
    merchant_fraud['order_datetime'].cast(DateType()),
)

None of the checked columns in either table contains null values.

In [29]:
# Persist the cleaned fraud data to the curated layer, replacing any previous output.
merchant_fraud.write.parquet('.././data/curated/merchant_fraud', mode='overwrite')

In [30]:
# Persist the cleaned merchant table to the curated layer, replacing any previous output.
tbl_merchants.write.parquet('.././data/curated/tbl_merchants', mode='overwrite')

                                                                                