## Descriptives

In [1]:
# Open Spark session 
from pyspark.sql import SparkSession, functions as F


# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("Data_Explorer")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

22/10/03 23:17:25 WARN Utils: Your hostname, Chaitanyas-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.12.175.233 instead (on interface en0)
22/10/03 23:17:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/03 23:17:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Read the files 
transactions_sample = spark.read.parquet('../data/tables/transactions_20210228_20210827_snapshot')
consumer_details = spark.read.parquet('../data/tables/consumer_user_details.parquet')
merchants_tbl = spark.read.parquet('../data/tables/tbl_merchants.parquet')
customer_tbl = spark.read.option("delimiter", "|").option("header",True).csv('../data/tables/tbl_consumer.csv')

                                                                                

In [3]:
merchants = merchants_tbl.toPandas()

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('omw-1.4')
import string
import re

# This is used to for NLP of the description
def text_process(text):
    stemmer = WordNetLemmatizer()
    # Remove all punctuation and numbers 
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join([i for i in nopunc if not i.isdigit()])
    # Remove all stopwords
    nopunc =  [word.lower() for word in nopunc.split() if word not in stopwords.words('english')]
    # lemmatize and output
    return ' '.join([stemmer.lemmatize(word) for word in nopunc])

In [5]:
# this function standardises the tags attribute, creating a list with the 'description', 'revenue band' and 'BNPL service charge'
nltk.download('stopwords')
nltk.download('wordnet')
def tag_extract(tag_string): 
    # first need to preprocess
    string =  re.sub('\[','(', tag_string.lower())
    string = re.sub('\]',')', string)
    # break the string into sections
    string_cut = string.split('),')
    new_string = []
    # first extract the description and pre process
    descr = str(string_cut[0].strip('(('))
    #descr = re.sub(r'[-|,|_|:]', ' ', descr)
    #descr = re.sub(r' +', ' ', descr)
    #descr = descr.split()
    #descr = ' '.join([word for word in descr if word not in stop_words])
    new_string.append(text_process(descr))
    # second extract the band
    new_string.append(str(re.search(r'[a-z]',string_cut[1]).group()))
    # finally the take rate
    new_string.append(float(re.search(r'[0-9]+\.[0-9]+',string_cut[2]).group()))
    return(new_string)
################
# now we can run the algorithm
tags = merchants['tags']
processed_tags = []
for i in tags:
    processed_tags.append(tag_extract(i))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chaitanyaraghuvanshi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chaitanyaraghuvanshi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
import pandas as pd
merchant_tbl = pd.DataFrame(processed_tags, columns=('Description', 'Earnings_Class', 'BNPL_Fee'))
merchant_tbl = pd.concat([merchants, merchant_tbl], axis=1)
# drop the tags column 
merchant_tbl.drop(columns='tags', inplace=True)

In [7]:
set(merchant_tbl['Description'])

{'antique shop sale repair restoration service',
 'art dealer gallery',
 'artist supply craft shop',
 'bicycle shop sale service',
 'book periodical newspaper',
 'cable satellite pay television radio service',
 'computer computer peripheral equipment software',
 'computer programming data processing integrated system design service',
 'digital good book movie music',
 'equipment tool furniture appliance rent al leasing',
 'florist supply nursery stock flower',
 'furniture home furnishing equipment shop manufacturer except appliance',
 'gift card novelty souvenir shop',
 'health beauty spa',
 'hobby toy game shop',
 'jewelry watch clock silverware shop',
 'lawn garden supply outlet including nursery',
 'motor vehicle supply new part',
 'music shop musical instrument piano sheet music',
 'optician optical good eyeglass',
 'shoe shop',
 'stationery office supply printing writing paper',
 'telecom',
 'tent awning shop',
 'watch clock jewelry repair shop'}

In [8]:
merchant_tbl.head(5)

Unnamed: 0,name,merchant_abn,Description,Earnings_Class,BNPL_Fee
0,Felis Limited,10023283211,furniture home furnishing equipment shop manuf...,e,0.18
1,Arcu Ac Orci Corporation,10142254217,cable satellite pay television radio service,b,4.22
2,Nunc Sed Company,10165489824,jewelry watch clock silverware shop,b,4.4
3,Ultricies Dignissim Lacus Foundation,10187291046,watch clock jewelry repair shop,b,3.29
4,Enim Condimentum PC,10192359162,music shop musical instrument piano sheet music,a,6.33


In [9]:
merchants_tbl = spark.createDataFrame(merchant_tbl)
customer_tbl = customer_tbl.join(consumer_details, ['consumer_id'])

full_dataset = transactions_sample.join(customer_tbl, ['user_id'])
merchants_tbl = merchants_tbl.withColumnRenamed('name','company_name')
full_dataset = full_dataset.join(merchants_tbl, ['merchant_abn'])

In [10]:
full_dataset.limit(5)

                                                                                

merchant_abn,user_id,dollar_value,order_id,order_datetime,consumer_id,name,address,state,postcode,gender,company_name,Description,Earnings_Class,BNPL_Fee
33064796871,7,373.0873675184212,fe188788-b89f-4dd...,2021-08-20,511685,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,Curabitur Massa C...,computer programm...,b,3.75
68435002949,7,232.5364986739752,b4a89891-a113-45e...,2021-08-20,511685,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,Aliquam Eu Inc.,artist supply cra...,a,6.65
41944909975,7,30.91075523023432,302ae628-8eba-4a5...,2021-08-20,511685,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,Et Nunc Consulting,book periodical n...,e,0.16
21439773999,7,91.18655746114226,4524fdc9-73f0-477...,2021-08-21,511685,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,Mauris Non Institute,cable satellite p...,a,6.1
86662713230,7,38.8137172956379,28f9e0f3-858d-445...,2021-08-19,511685,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,Vestibulum Accums...,watch clock jewel...,a,6.41


In [11]:
fraud_tbl = spark.read.option("header", True).csv("../data/tables/consumer_fraud_probability.csv")
fraud_tbl.limit(5)

user_id,order_datetime,fraud_probability
6228,2021-12-19,97.6298077657765
21419,2021-12-10,99.24738020302328
5606,2021-10-17,84.05825045251777
3101,2021-04-17,91.42192091901347
22239,2021-10-19,94.70342477508036


In [12]:
full_dataset.createOrReplaceTempView('data')
fraud_tbl.createOrReplaceTempView('fraud')
dataset = spark.sql("""
select Description, avg(fraud_probability) as description_avg from data, fraud where (fraud.user_id == data.user_id) AND (fraud.order_datetime == data.order_datetime)
group by Description""")

In [13]:
dataset.limit(5)

                                                                                

Description,description_avg
lawn garden suppl...,13.634435258725222
bicycle shop sale...,12.218922129625165
antique shop sale...,21.534525172534373
book periodical n...,13.672269357461053
artist supply cra...,13.693647162978351


In [14]:
from pyspark.sql.functions import *
full_dataset = full_dataset.withColumn('day_of_week', dayofweek('order_datetime'))

In [15]:
full_dataset.createOrReplaceTempView('data')
dataset.createOrReplaceTempView('fraud')
dataset = spark.sql("""
select * from data, fraud where fraud.Description == data.Description""")

In [16]:
dataset.limit(5)

                                                                                

merchant_abn,user_id,dollar_value,order_id,order_datetime,consumer_id,name,address,state,postcode,gender,company_name,Description,Earnings_Class,BNPL_Fee,day_of_week,Description.1,description_avg
82298341051,7,54.8519479237856,248066a0-5a60-4ee...,2021-04-30,511685,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,Eros Institute,lawn garden suppl...,b,4.73,6,lawn garden suppl...,13.634435258725222
70009327857,7,720.8860425094242,e4d0d7c4-9fcb-4e6...,2021-05-06,511685,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,Torquent Per Inc.,lawn garden suppl...,c,2.45,5,lawn garden suppl...,13.634435258725222
43127814599,7,1477.9174337803072,265eace6-10d3-42e...,2021-03-24,511685,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,Nam Ligula Elit F...,lawn garden suppl...,b,3.58,4,lawn garden suppl...,13.634435258725222
42355028515,19,302.3144253397087,a2965af7-cb14-426...,2021-08-18,1226530,Victoria Gonzalez,68657 Johnson Gle...,TAS,7276,Female,Eu Inc.,lawn garden suppl...,a,5.97,4,lawn garden suppl...,13.634435258725222
43127814599,19,987.99321041603,b815e072-7d31-4d9...,2021-07-18,1226530,Victoria Gonzalez,68657 Johnson Gle...,TAS,7276,Female,Nam Ligula Elit F...,lawn garden suppl...,b,3.58,1,lawn garden suppl...,13.634435258725222
