# Data Cleaning
In this session, we will do same basic cleaning steps on the main datasets

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import shapefile as shp
import pandas as pd
import numpy as np
import os

In [2]:
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("Data Cleaning")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.driver.memory", "9g") 
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("OFF")

24/10/05 04:31:07 WARN Utils: Your hostname, codespaces-c6855a resolves to a loopback address: 127.0.0.1; using 10.0.0.128 instead (on interface eth0)
24/10/05 04:31:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/05 04:31:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Load dataset

We have 3 kinds of dataset: transactions, consumer, merchant. We aim to find top 100 merchants we should accept, so, let's look at merchant dataset.

## 1. Merchants

In [3]:
# Information on merchants
merchant = spark.read.parquet("../data/tables/part_1/tbl_merchants.parquet")

# Information on merchant's fraud probability
merchant_fraud_prob = pd.read_csv("../data/tables/part_1/merchant_fraud_probability.csv")

In [4]:
print(f"Number of rows: {merchant.count()}")
merchant.limit(5)

Number of rows: 4026


name,tags,merchant_abn
Felis Limited,"((furniture, home...",10023283211
Arcu Ac Orci Corp...,"([cable, satellit...",10142254217
Nunc Sed Company,"([jewelry, watch,...",10165489824
Ultricies Digniss...,"([wAtch, clock, a...",10187291046
Enim Condimentum PC,([music shops - m...,10192359162


In [5]:
merchant_fraud_prob.count()

merchant_abn         114
order_datetime       114
fraud_probability    114
dtype: int64

We should look at the data type of each column.

In [6]:
merchant.printSchema()

root
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- merchant_abn: long (nullable = true)



In [7]:
merchant_fraud_prob.dtypes

merchant_abn           int64
order_datetime        object
fraud_probability    float64
dtype: object

### Comment
- The merchant dataset contains their names, tags (seems like stuff they sell), and the abn.
- For the probability dataset, they record the datetime of an order and the probability it is broken?.
- Two tables share the `merchant_abn` column.

In [8]:
# Look at the tags column
merchant.select("tags").limit(5).collect()

[Row(tags='((furniture, home furnishings and equipment shops, and manufacturers, except appliances), (e), (take rate: 0.18))'),
 Row(tags='([cable, satellite, and otHer pay television and radio services], [b], [take rate: 4.22])'),
 Row(tags='([jewelry, watch, clock, and silverware shops], [b], [take rate: 4.40])'),
 Row(tags='([wAtch, clock, and jewelry repair shops], [b], [take rate: 3.29])'),
 Row(tags='([music shops - musical instruments, pianos, and sheet music], [a], [take rate: 6.33])')]

In [9]:
# Check duplicate
print(f"Number of duplicates: {merchant.select('merchant_abn').count() - merchant.select('merchant_abn').distinct().count()}")

Number of duplicates: 0


In [10]:
# Check null value
print(f"NaN in merchant detail: {merchant.filter(F.col('merchant_abn').isNull() | F.col('tags').isNull() | F.col('name').isNull()).count()}")
print(f"Nan in merchant fraud rate:\n{merchant_fraud_prob.isna().sum()}")

NaN in merchant detail: 0
Nan in merchant fraud rate:
merchant_abn         0
order_datetime       0
fraud_probability    0
dtype: int64


In [11]:
# Number of merchant with a fraud rate
print(f"Number of merchant with a fraud rate: {len(merchant_fraud_prob.merchant_abn.unique())}")

Number of merchant with a fraud rate: 61


In [12]:
# The abn with fraud but not in  merchant info
abn_not_in_merchant = spark.createDataFrame(pd.DataFrame(merchant_fraud_prob.merchant_abn)).subtract(merchant.select('merchant_abn'))
abn_not_in_merchant.show()

[Stage 20:>                                                         (0 + 4) / 4]

+------------+
|merchant_abn|
+------------+
| 99989036621|
| 82999039227|
| 94311056026|
| 14827550074|
| 23686790459|
| 19010030815|
| 57564805948|
| 59258669983|
| 29674997261|
| 75892370170|
| 83220249221|
| 73052515151|
| 81146325646|
+------------+



                                                                                

In [13]:
print(f"Number of abn not in merchant dataset: {abn_not_in_merchant.count()}")
merchant.filter(F.col("merchant_abn") == 82999039227).show()
merchant_fraud_prob[merchant_fraud_prob.merchant_abn == 82999039227]

Number of abn not in merchant dataset: 13
+----+----+------------+
|name|tags|merchant_abn|
+----+----+------------+
+----+----+------------+



Unnamed: 0,merchant_abn,order_datetime,fraud_probability
3,82999039227,2021-12-19,94.1347


In [14]:
# Time range for fraud probability
merchant_fraud_prob.loc[:,"order_datetime"] = pd.to_datetime(merchant_fraud_prob.order_datetime)
merchant_fraud_prob.order_datetime.sort_values()

49    2021-03-25
33    2021-04-17
47    2021-08-28
46    2021-08-29
109   2021-09-01
         ...    
16    2022-02-17
45    2022-02-19
41    2022-02-20
83    2022-02-25
15    2022-02-27
Name: order_datetime, Length: 114, dtype: datetime64[ns]

The time range recoded in the merchant fraud dataset is from 25/03/2021 to 27/02/2022.

In [15]:
# Check if fraud prob is in a valid range
merchant_fraud_prob.fraud_probability.describe()

count    114.000000
mean      40.419335
std       17.187745
min       18.210891
25%       28.992765
50%       32.692032
75%       48.395260
max       94.134700
Name: fraud_probability, dtype: float64

The fraud probability is in good range (from 0 to 1), with the highest probability at 94%, which was found to belong to an abn not existing in the merchant dataset. 

### Consideration
- We need to separate each tupe into 3 values: stuff_type, ?, take_rate
- How to deal with the text containing goods: remove stop words, dec2vec, count2vec, etc.
- Convert `order_datetime` to datetime data type.

## 2. Consumer

In [16]:
# Information on consumer
consumer_user_detail = spark.read.parquet("../data/tables/part_1/consumer_user_details.parquet")
consumer = pd.read_csv("../data/tables/part_1/tbl_consumer.csv", delimiter="|")

# Information on consumer's fraud probability
consumer_fraud_prob = pd.read_csv("../data/tables/part_1/consumer_fraud_probability.csv")

In [17]:
print(f"Shapes: {consumer.shape}")
consumer.head(5)

Shapes: (499999, 6)


Unnamed: 0,name,address,state,postcode,gender,consumer_id
0,Yolanda Williams,413 Haney Gardens Apt. 742,WA,6935,Female,1195503
1,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
2,Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530
3,Lindsay Jimenez,00653 Davenport Crossroad,NSW,2780,Female,154128
4,Rebecca Blanchard,9271 Michael Manors Suite 651,WA,6355,Female,712975


In [18]:
print(f"Shape: {consumer_fraud_prob.shape}")
consumer_fraud_prob.head(5)

Shape: (34864, 3)


Unnamed: 0,user_id,order_datetime,fraud_probability
0,6228,2021-12-19,97.629808
1,21419,2021-12-10,99.24738
2,5606,2021-10-17,84.05825
3,3101,2021-04-17,91.421921
4,22239,2021-10-19,94.703425


In [19]:
print(f"Number of rows: {consumer_user_detail.count()}")
consumer_user_detail.limit(5)

Number of rows: 499999


user_id,consumer_id
1,1195503
2,179208
3,1194530
4,154128
5,712975


### Comment
- The consumer dataset contains their name, address, state, postcode, gender and consumer_id. While the consumer user details dataset contain corresponding consumer_id for each user_id.
- Consumer fraud probability and consumer dataset can be merged through consumer user detail dataset.

In [20]:
# Check duplicate in user_id and consumer_id
print(f"Number of duplicates in user_id: {consumer_user_detail.select('user_id').count() - consumer_user_detail.select('user_id').distinct().count()}")

print(f"Number of duplicates in consumer_id: {consumer_user_detail.select('consumer_id').count() - consumer_user_detail.select('consumer_id').distinct().count()}")

                                                                                

Number of duplicates in user_id: 0
Number of duplicates in consumer_id: 0


**Question**: Why do we need both consumer_id and user_id (related to database)?

An user id has to be in a range of 1 and 499,999, while consumer_id can be random?

In [21]:
# Check null values
print(f"Nan in consumer fraud rate:\n{consumer_fraud_prob.isna().sum()}")
print(f"Nan in consumer fraud rate:\n{consumer.isna().sum()}")

Nan in consumer fraud rate:
user_id              0
order_datetime       0
fraud_probability    0
dtype: int64
Nan in consumer fraud rate:
name           0
address        0
state          0
postcode       0
gender         0
consumer_id    0
dtype: int64


In [22]:
# Check if user id range in other dataframes is valid.
print(f"Max user id in the fraud record: {max(consumer_fraud_prob.user_id)}")
print(f"Min user id in the fraud record: {min(consumer_fraud_prob.user_id)}")

Max user id in the fraud record: 24081
Min user id in the fraud record: 1


In [23]:
# Check for the consumer id without a valid corresponding user id
invalid_consumer_id = spark.createDataFrame(pd.DataFrame(consumer.consumer_id)).subtract(consumer_user_detail.select('consumer_id'))
invalid_consumer_id.limit(1)

consumer_id


There are no problems in consumer_id and user_id.

In [24]:
# Time range for fraud probability
consumer_fraud_prob.loc[:,"order_datetime"] = pd.to_datetime(consumer_fraud_prob.order_datetime)
consumer_fraud_prob.order_datetime.sort_values()

15812   2021-02-28
18284   2021-02-28
3674    2021-02-28
14061   2021-02-28
4787    2021-02-28
           ...    
11970   2022-02-27
5119    2022-02-27
22952   2022-02-27
26151   2022-02-27
14025   2022-02-27
Name: order_datetime, Length: 34864, dtype: datetime64[ns]

In [25]:
# Check if fraud prob is in a valid range
consumer_fraud_prob.fraud_probability.describe()

count    34864.000000
mean        15.120091
std          9.946085
min          8.287144
25%          9.634437
50%         11.735624
75%         16.216158
max         99.247380
Name: fraud_probability, dtype: float64

It has a suitable time range and probability. Overall, we just need to convert order_datetime to datetime data type.

## 3. Transaction

This dataset contains details for each transaction between a merchant and a user.

In [26]:
# Read transaction dataset
transaction1 = spark.read.parquet("../data/tables/part_2/*")
transaction2 = spark.read.parquet("../data/tables/part_3/*")
transaction3 = spark.read.parquet("../data/tables/part_4/*")

                                                                                

In [27]:
# Combine datasets into a single DataFrame
transaction = transaction1.union(transaction2).union(transaction3)
transaction.count() # Number of rows

                                                                                

14195505

In [28]:
transaction.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)



In [29]:
transaction.limit(5)

user_id,merchant_abn,dollar_value,order_id,order_datetime
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20


#### 1. Check for missing values

In [30]:
# List of columns to check
cols = transaction.columns

for col_name in cols:
    # Filter rows where values are null and count them
    null_count = transaction.filter(F.col(col_name).isNull()).count()

    if null_count > 0:
        print(f"Number of rows with null {col_name}: {null_count}")

                                                                                

So there are no missing values in any columns.

In [31]:
for col in ['user_id', 'dollar_value', 'order_datetime']:
    print(f"Max of {col}: {transaction.agg({col: 'max'})}")
    print(f"Min of {col}: {transaction.agg({col: 'min'})}")

                                                                                

Max of user_id: +------------+
|max(user_id)|
+------------+
|       24081|
+------------+



                                                                                

Min of user_id: +------------+
|min(user_id)|
+------------+
|           1|
+------------+



                                                                                

Max of dollar_value: +------------------+
| max(dollar_value)|
+------------------+
|105193.88578925544|
+------------------+



                                                                                

Min of dollar_value: +--------------------+
|   min(dollar_value)|
+--------------------+
|9.756658099412162E-8|
+--------------------+

Max of order_datetime: +-------------------+
|max(order_datetime)|
+-------------------+
|         2022-10-26|
+-------------------+

Min of order_datetime: +-------------------+
|min(order_datetime)|
+-------------------+
|         2021-02-28|
+-------------------+



### Comment
- Although we have 499,999 user ids, but only at most 24081 of them made a transaction within the provided time range.
- the `dollar_value` range seems strange as the minimum value is almost 0. We may need to do some outlier analysis for it.
- The time range of the transaction dataset is wider than the time range of other datasets, which can lead to missing data when we joined every table together.

# Create Cleaning Function

At this step, only the column `tags` of merchant dataset needs to be preprocessed. We decide to convert the tbl_merchants.parquet to csv because of its small size.

In [32]:
import sys
sys.path.append('../scripts')
from etl import clean_merchant_df

In [33]:
# Apply clean function
merchant = spark.read.parquet("../data/tables/part_1/tbl_merchants.parquet")
merchant.show()

+--------------------+--------------------+------------+
|                name|                tags|merchant_abn|
+--------------------+--------------------+------------+
|       Felis Limited|((furniture, home...| 10023283211|
|Arcu Ac Orci Corp...|([cable, satellit...| 10142254217|
|    Nunc Sed Company|([jewelry, watch,...| 10165489824|
|Ultricies Digniss...|([wAtch, clock, a...| 10187291046|
| Enim Condimentum PC|([music shops - m...| 10192359162|
|       Fusce Company|[(gift, card, nov...| 10206519221|
|Aliquam Enim Inco...|[(computers, comP...| 10255988167|
|    Ipsum Primis Ltd|[[watch, clock, a...| 10264435225|
|Pede Ultrices Ind...|([computer progra...| 10279061213|
|           Nunc Inc.|[(furniture, home...| 10323485998|
|Facilisis Facilis...|([computers, comp...| 10342410215|
|      Odio Institute|((equipment, tool...| 10346855916|
|    Rutrum Justo Ltd|([music shops - m...| 10364012396|
|   Tellus Foundation|[[artist supply a...| 10385011947|
|      Sed Et Company|([florist

In [34]:
clean_merchant_df(merchant)

In [35]:
cleaned_merchant_df = spark.read.parquet("../data/curated/part_1/tbl_merchants.parquet")

In [36]:
cleaned_merchant_df.show()

+--------------------+------------+--------------------+-------------+---------+
|                name|merchant_abn|               goods|revenue_level|take_rate|
+--------------------+------------+--------------------+-------------+---------+
|       Felis Limited| 10023283211|furniture, home f...|            e|     0.18|
|Arcu Ac Orci Corp...| 10142254217|cable, satellite,...|            b|     4.22|
|    Nunc Sed Company| 10165489824|jewelry, watch, c...|            b|      4.4|
|Ultricies Digniss...| 10187291046|watch, clock, and...|            b|     3.29|
| Enim Condimentum PC| 10192359162|music shops - mus...|            a|     6.33|
|       Fusce Company| 10206519221|gift, card, novel...|            a|     6.34|
|Aliquam Enim Inco...| 10255988167|computers, comput...|            b|     4.32|
|    Ipsum Primis Ltd| 10264435225|watch, clock, and...|            c|     2.39|
|Pede Ultrices Ind...| 10279061213|computer programm...|            a|     5.71|
|           Nunc Inc.| 10323

## Feature Engineering

In [37]:
aggregrated_trans_df = transaction.groupBy("merchant_abn").agg(F.sum("dollar_value"), F.count("dollar_value"))

In [38]:
aggregrated_trans_df.show()



+------------+------------------+-------------------+
|merchant_abn| sum(dollar_value)|count(dollar_value)|
+------------+------------------+-------------------+
| 38700038932|  9546185.36069731|               7132|
| 83412691377| 498536.5816973136|              14288|
| 15613631617|  543030.531332826|               1785|
| 19839532017|          113982.0|                726|
| 73256306726| 1496967.159114219|               5263|
| 35344855546|134737.25046268434|               1522|
| 24406529929|280125.23089766403|               4184|
| 73841664453| 83314.29045077678|                959|
| 78916025936|21015.671086215087|                 67|
| 60654402457|17030.976608608154|                199|
| 92202115241| 39887.58814458104|                120|
| 52763133264|14217.064637334852|                125|
| 48214071373| 162100.6618795422|                540|
| 41956465747|63674.266719753636|                279|
| 56395390867|46747.779659101056|                 47|
| 34440496342| 19425.3588287

                                                                                

Creating a scaled revenue as as feature, we take log of the ratio.

In [39]:
aggregrated_trans_df = aggregrated_trans_df.withColumn("log_ratio", F.log(F.col("sum(dollar_value)") / F.col("count(dollar_value)")))

In [40]:
aggregrated_trans_df.show()



+------------+------------------+-------------------+------------------+
|merchant_abn| sum(dollar_value)|count(dollar_value)|         log_ratio|
+------------+------------------+-------------------+------------------+
| 38700038932|  9546185.36069731|               7132| 7.199305215006568|
| 83412691377| 498536.5816973136|              14288| 3.552256946120572|
| 15613631617|  543030.531332826|               1785| 5.717747130253923|
| 19839532017|          113982.0|                726| 5.056245805348308|
| 73256306726| 1496967.159114219|               5263|5.6504952400078965|
| 35344855546|134737.25046268434|               1522| 4.483301329640816|
| 24406529929|280125.23089766403|               4184| 4.203969029624668|
| 73841664453| 83314.29045077678|                959|  4.46448429258872|
| 78916025936|21015.671086215087|                 67| 5.748331061215046|
| 60654402457|17030.976608608154|                199| 4.449484293653718|
| 92202115241| 39887.58814458104|                12

                                                                                

In [41]:
cleaned_merchant_df = cleaned_merchant_df.join(aggregrated_trans_df, on="merchant_abn", how="left")

In [42]:
cleaned_merchant_df = cleaned_merchant_df.withColumn("unscaled_earning", (F.col("take_rate")/100 * F.col("sum(dollar_value)")))

In [43]:
cleaned_merchant_df.limit(5)

                                                                                

merchant_abn,name,goods,revenue_level,take_rate,sum(dollar_value),count(dollar_value),log_ratio,unscaled_earning
10023283211,Felis Limited,"furniture, home f...",e,0.18,703277.7114509277,3261,5.373717954622449,1265.8998806116697
10142254217,Arcu Ac Orci Corp...,"cable, satellite,...",b,4.22,118356.1460726035,3036,3.663157406405673,4994.629364263868
10165489824,Nunc Sed Company,"jewelry, watch, c...",b,4.4,56180.47385703053,5,9.326886622809967,2471.940849709344
10192359162,Enim Condimentum PC,music shops - mus...,a,6.33,177980.50545638177,385,6.136185969080727,11266.165995388965
10187291046,Ultricies Digniss...,"watch, clock, and...",b,3.29,39693.73038743404,336,4.771837369490879,1305.92372974658


In [44]:
cleaned_merchant_df.write.mode('overwrite').parquet('../data/curated/part_1/clean_merchant.parquet')

                                                                                

In [45]:
cleaned_merchant_df.count()

4026