# Data Cleaning
In this session, we will perform some basic cleaning steps on the main datasets.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import shapefile as shp
import pandas as pd
import numpy as np
import os

In [None]:
# Build the Spark session used by every Spark job in this notebook.
spark_options = {
    # Evaluate and render DataFrames when they are the last expression of a cell.
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.driver.memory": "9g",
    "spark.sql.session.timeZone": "Etc/UTC",
}

session_builder = SparkSession.builder.appName("Data Cleaning")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

# Returns the already-running session if there is one, otherwise starts a new one.
spark = session_builder.getOrCreate()

# Turn Spark's own logging off for the rest of the notebook.
spark.sparkContext.setLogLevel("OFF")

# Load dataset

We have three kinds of dataset: transactions, consumers, and merchants. Our aim is to find the top 100 merchants we should accept, so let's look at the merchant dataset first.

## 1. Merchants

In [None]:
# Merchant details, loaded as a Spark DataFrame.
merchant_path = "../data/tables/part_1/tbl_merchants.parquet"
merchant = spark.read.parquet(merchant_path)

# Merchant fraud probabilities, loaded as a pandas DataFrame.
merchant_fraud_path = "../data/tables/part_1/merchant_fraud_probability.csv"
merchant_fraud_prob = pd.read_csv(merchant_fraud_path)

In [None]:
# Report the table size, then preview the first five rows as the cell output.
merchant_row_count = merchant.count()
print(f"Number of rows: {merchant_row_count}")
merchant.limit(5)

In [None]:
# pandas DataFrame.count(): number of non-null entries in each column of the fraud table.
merchant_fraud_prob.count()

We should look at the data type of each column.

In [None]:
# Print the column names and Spark data types of the merchant table.
merchant.printSchema()

In [None]:
# pandas dtypes of the fraud table, one entry per column.
merchant_fraud_prob.dtypes

### Comment
- The merchant dataset contains each merchant's name, tags (which seem to describe what they sell), and ABN.
- The fraud probability dataset records the datetime of an order and the probability that it is fraudulent.
- The two tables share the `merchant_abn` column.

In [None]:
# Look at the tags column: pull the first five raw values to the driver
# so the full, untruncated strings can be inspected.
merchant.select("tags").limit(5).collect()

In [None]:
# Duplicate check: compare the total number of ABNs with the number of distinct ABNs.
merchant_abns = merchant.select('merchant_abn')
total_abn_count = merchant_abns.count()
distinct_abn_count = merchant_abns.distinct().count()
print(f"Number of duplicates: {total_abn_count - distinct_abn_count}")

In [None]:
# Null check on the merchant table: rows with a null merchant_abn, tags or name.
any_field_null = (
    F.col('merchant_abn').isNull()
    | F.col('tags').isNull()
    | F.col('name').isNull()
)
merchant_null_rows = merchant.filter(any_field_null).count()
print(f"NaN in merchant detail: {merchant_null_rows}")

# Null check on the fraud table: per-column count of missing values.
fraud_null_counts = merchant_fraud_prob.isna().sum()
print(f"Nan in merchant fraud rate:\n{fraud_null_counts}")

In [None]:
# Count the distinct ABNs that appear in the fraud table.
distinct_fraud_abns = merchant_fraud_prob.merchant_abn.unique()
print(f"Number of merchant with a fraud rate: {len(distinct_fraud_abns)}")

In [None]:
# ABNs that have a fraud record but no row in the merchant table.
fraud_abns = spark.createDataFrame(pd.DataFrame(merchant_fraud_prob.merchant_abn))
known_abns = merchant.select('merchant_abn')
abn_not_in_merchant = fraud_abns.subtract(known_abns)
abn_not_in_merchant.show()

In [None]:
missing_abn_count = abn_not_in_merchant.count()
print(f"Number of abn not in merchant dataset: {missing_abn_count}")

# Look the same ABN up in both tables: first the merchant table, then the fraud table.
abn_to_inspect = 82999039227
merchant.filter(F.col("merchant_abn") == abn_to_inspect).show()
merchant_fraud_prob[merchant_fraud_prob.merchant_abn == abn_to_inspect]

In [None]:
# Time range for fraud probability
# Assign with df[col] = ... rather than df.loc[:, col] = ...: on recent pandas the
# .loc form writes into the existing column in place, which leaves it as object
# dtype instead of converting it to datetime64.
merchant_fraud_prob["order_datetime"] = pd.to_datetime(merchant_fraud_prob.order_datetime)

# Sorted timestamps: the first and last entries give the covered time range.
merchant_fraud_prob.order_datetime.sort_values()

The time range recorded in the merchant fraud dataset is from 25/03/2021 to 27/02/2022.

In [None]:
# Check if fraud prob is in a valid range:
# summary statistics (count, mean, std, min, quartiles, max) of the probability column.
merchant_fraud_prob.fraud_probability.describe()

The fraud probability is in good range (from 0 to 1), with the highest probability at 94%, which was found to belong to an abn not existing in the merchant dataset. 

### Consideration
- We need to separate each tuple into 3 values: stuff_type, ?, take_rate
- How to deal with the text describing the goods: remove stop words, doc2vec, count vectorisation, etc.
- Convert `order_datetime` to datetime data type.

## 2. Consumer

In [16]:
# Mapping between user_id and consumer_id (Spark DataFrame).
consumer_user_detail = spark.read.parquet(
    "../data/tables/part_1/consumer_user_details.parquet"
)

# Consumer details (pandas DataFrame); the file is pipe-separated.
consumer = pd.read_csv("../data/tables/part_1/tbl_consumer.csv", sep="|")

# Consumer fraud probabilities (pandas DataFrame).
consumer_fraud_prob = pd.read_csv(
    "../data/tables/part_1/consumer_fraud_probability.csv"
)

In [None]:
# Report (rows, columns) of the consumer table, then preview its first five rows.
consumer_shape = consumer.shape
print(f"Shapes: {consumer_shape}")
consumer.head()

In [None]:
# Report (rows, columns) of the consumer fraud table, then preview its first five rows.
consumer_fraud_shape = consumer_fraud_prob.shape
print(f"Shape: {consumer_fraud_shape}")
consumer_fraud_prob.head()

In [None]:
# Report the size of the user/consumer mapping, then preview its first five rows.
user_detail_row_count = consumer_user_detail.count()
print(f"Number of rows: {user_detail_row_count}")
consumer_user_detail.limit(5)

### Comment
- The consumer dataset contains each consumer's name, address, state, postcode, gender and consumer_id, while the consumer user details dataset maps each user_id to its corresponding consumer_id.
- The consumer fraud probability and consumer datasets can be merged through the consumer user details dataset.

In [None]:
# Check duplicate in user_id and consumer_id:
# for each id column, compare the total row count with the distinct count.
for id_column in ('user_id', 'consumer_id'):
    id_values = consumer_user_detail.select(id_column)
    duplicate_count = id_values.count() - id_values.distinct().count()
    print(f"Number of duplicates in {id_column}: {duplicate_count}")

**Question**: Why do we need both consumer_id and user_id (related to database)?

A user id has to be in the range 1 to 499,999, while consumer_id can be random?

In [None]:
# Check null values: per-column count of missing entries in each pandas table.
print(f"Nan in consumer fraud rate:\n{consumer_fraud_prob.isna().sum()}")
# This line reports on the consumer detail table, so it gets its own label
# (it previously repeated the fraud-rate label).
print(f"Nan in consumer detail:\n{consumer.isna().sum()}")

In [None]:
# Check if user id range in other dataframes is valid.
# Series.max()/min() are vectorised and skip missing values, unlike the Python
# builtins, which iterate element by element and give unreliable results with NaN.
fraud_user_ids = consumer_fraud_prob.user_id
print(f"Max user id in the fraud record: {fraud_user_ids.max()}")
print(f"Min user id in the fraud record: {fraud_user_ids.min()}")

In [None]:
# Consumer ids present in the consumer table but absent from the user/consumer mapping.
consumer_ids = spark.createDataFrame(pd.DataFrame(consumer.consumer_id))
mapped_consumer_ids = consumer_user_detail.select('consumer_id')
invalid_consumer_id = consumer_ids.subtract(mapped_consumer_ids)

# Showing at most one row is enough to see whether any such id exists.
invalid_consumer_id.limit(1)

There are no problems in consumer_id and user_id.

In [None]:
# Time range for fraud probability
# Assign with df[col] = ... rather than df.loc[:, col] = ...: on recent pandas the
# .loc form writes into the existing column in place, which leaves it as object
# dtype instead of converting it to datetime64.
consumer_fraud_prob["order_datetime"] = pd.to_datetime(consumer_fraud_prob.order_datetime)

# Sorted timestamps: the first and last entries give the covered time range.
consumer_fraud_prob.order_datetime.sort_values()

In [None]:
# Check if fraud prob is in a valid range:
# summary statistics (count, mean, std, min, quartiles, max) of the probability column.
consumer_fraud_prob.fraud_probability.describe()

It has a suitable time range and probability. Overall, we just need to convert order_datetime to datetime data type.

## 3. Transaction

This dataset contains details for each transaction between a merchant and a user.

In [None]:
# Read transaction dataset: one Spark DataFrame per delivered part directory.
transaction_globs = (
    "../data/tables/part_2/*",
    "../data/tables/part_3/*",
    "../data/tables/part_4/*",
)
transaction1, transaction2, transaction3 = (
    spark.read.parquet(part_glob) for part_glob in transaction_globs
)

In [None]:
# Combine datasets into a single DataFrame.
# unionByName lines columns up by name, whereas union lines them up by position;
# if a part file ever has a different column order this either still combines
# correctly or fails loudly instead of silently mixing columns.
transaction = transaction1.unionByName(transaction2).unionByName(transaction3)
transaction.count() # Number of rows

In [None]:
# Print the column names and Spark data types of the combined transaction table.
transaction.printSchema()

In [None]:
# Preview the first five transactions as the cell output.
transaction.limit(5)

#### 1. Check for missing values

In [None]:
# List of columns to check
cols = transaction.columns

# Count nulls for every column in a single aggregation, instead of scanning the
# whole transaction table once per column. count() ignores nulls, so counting
# the result of when(<is null>, 1) yields the number of null rows (0 if none).
null_counts = (
    transaction
    .select([F.count(F.when(F.col(col_name).isNull(), 1)).alias(col_name) for col_name in cols])
    .first()
    .asDict()
)

# Only columns that actually contain nulls are reported; no output means no nulls.
for col_name, null_count in null_counts.items():
    if null_count > 0:
        print(f"Number of rows with null {col_name}: {null_count}")

So there are no missing values in any columns.

In [None]:
# Value range of the key transaction columns.
# Every max and min is computed in one aggregation (instead of one Spark job per
# statistic), and the scalar values are printed rather than a DataFrame repr.
range_cols = ['user_id', 'dollar_value', 'order_datetime']
range_row = transaction.agg(
    *[F.max(col_name).alias(f"max_{col_name}") for col_name in range_cols],
    *[F.min(col_name).alias(f"min_{col_name}") for col_name in range_cols],
).first()

for col_name in range_cols:
    print(f"Max of {col_name}: {range_row['max_' + col_name]}")
    print(f"Min of {col_name}: {range_row['min_' + col_name]}")

### Comment
- Although we have 499,999 user ids, at most 24,081 of them made a transaction within the provided time range.
- The `dollar_value` range seems strange, as the minimum value is almost 0. We may need to do some outlier analysis on it.
- The time range of the transaction dataset is wider than the time range of the other datasets, which can lead to missing data when we join the tables together.

# Create Cleaning Function

At this step, only the column `tags` of merchant dataset needs to be preprocessed. We decide to convert the tbl_merchants.parquet to csv because of its small size.

In [32]:
# Add ../scripts to the module search path so the import of etl below can be
# resolved from that directory.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
import sys
sys.path.append('../scripts')
from etl import clean_merchant_df

In [None]:
# Re-read the raw merchant table (the input for the cleaning function) and display
# its first rows.
merchant = spark.read.parquet("../data/tables/part_1/tbl_merchants.parquet")
merchant.show()

In [35]:
# Run the project's merchant-cleaning helper on the raw merchant table.
# NOTE(review): presumably this writes the curated parquet that is read back from
# ../data/curated/part_1/ afterwards — confirm against scripts/etl.py.
clean_merchant_df(merchant)

In [36]:
# Load the curated merchant table from the curated data directory.
cleaned_merchant_df = spark.read.parquet("../data/curated/part_1/tbl_merchants.parquet")

In [None]:
# Display the first rows of the curated merchant table.
cleaned_merchant_df.show()

## Feature Engineering

In [38]:
# Per-merchant totals: summed dollar value and number of transactions.
# The aggregates are left unaliased, so the result columns keep Spark's
# auto-generated names "sum(dollar_value)" and "count(dollar_value)".
total_dollar_value = F.sum("dollar_value")
transaction_count = F.count("dollar_value")
aggregrated_trans_df = (
    transaction
    .groupBy("merchant_abn")
    .agg(total_dollar_value, transaction_count)
)

In [None]:
# Display the first rows of the per-merchant aggregates.
aggregrated_trans_df.show()

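To create a scaled revenue feature, we take the log of the ratio of each merchant's total dollar value to its number of transactions (i.e. the log of the mean transaction value).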

In [40]:
# log_ratio = natural log of (total dollar value / number of transactions).
value_per_transaction = F.col("sum(dollar_value)") / F.col("count(dollar_value)")
aggregrated_trans_df = aggregrated_trans_df.withColumn(
    "log_ratio",
    F.log(value_per_transaction),
)

In [None]:
# Display the per-merchant aggregates again, now including the log_ratio column.
aggregrated_trans_df.show()

In [42]:
# Attach the per-merchant transaction aggregates. The left join keeps every merchant,
# so merchants without any transaction get nulls in the aggregate columns.
# NOTE(review): this cell rebinds cleaned_merchant_df to its own join result, so
# running it twice joins the aggregates twice — re-run from the parquet read first.
cleaned_merchant_df = cleaned_merchant_df.join(aggregrated_trans_df, on="merchant_abn", how="left")

In [43]:
# unscaled_earning = (take_rate / 100) * total dollar value of the merchant.
take_rate_fraction = F.col("take_rate")/100
cleaned_merchant_df = cleaned_merchant_df.withColumn(
    "unscaled_earning",
    take_rate_fraction * F.col("sum(dollar_value)"),
)

In [None]:
# Preview the first five rows of the enriched merchant table as the cell output.
cleaned_merchant_df.limit(5)

In [None]:
# Persist the enriched merchant table, replacing any previous output at this path.
# NOTE(review): the frame still carries the auto-generated aggregate column names
# "sum(dollar_value)" / "count(dollar_value)"; some Spark versions reject such names
# when writing Parquet — confirm, or alias the aggregates where they are created.
cleaned_merchant_df.write.mode('overwrite').parquet('../data/curated/part_1/clean_merchant.parquet')

In [None]:
# Row count of the enriched merchant table.
cleaned_merchant_df.count()