# Data Transformation

This notebook involves the One-Hot Encoding and hashing on categorical data, and convert the `order_datetime` to timestamps.

---

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import* 
from pyspark.sql.functions import split
from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql.functions import col, when, split, udf
from pyspark.sql.types import ArrayType, IntegerType
from pyspark.ml.linalg import DenseVector, SparseVector
from pyspark.ml.feature import HashingTF, StringIndexer

In [2]:
spark = (
    SparkSession.builder.appName("Transformation")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.driver.memory", "8G")
    .config("spark.executor.memory", "8G")
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.sql.debug.maxToStringFields", 200)
    .getOrCreate()
)

24/09/19 13:45:36 WARN Utils: Your hostname, Cocos-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.12.210.64 instead (on interface en0)
24/09/19 13:45:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/19 13:45:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Read datasets

In [3]:
# read datasets
consumer_full = spark.read.parquet('../data/curated/consumer_full')
merchant_full = spark.read.parquet('../data/curated/merchant_full')

                                                                                

In [4]:
consumer_full.show(5)

+--------+-------+--------------+-----------+------------------+-----+------+------------+------------------+--------------------+-----------------+------------------+---------------+--------------------------+------------------------+---------------+----------------------+------------------------+----------------------+------------------+---------------------+
|postcode|user_id|order_datetime|consumer_id|              name|state|gender|merchant_abn|      dollar_value|            order_id|fraud_probability|average_population|avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|avg_unemployment_rate|
+--------+-------+--------------+-----------+------------------+-----+------+------------+------------------+--------------------+-----------------+------------------+---------------+--------------------------+------------------------+---------------+---------------------

In [5]:
consumer_full.printSchema()

root
 |-- postcode: integer (nullable = true)
 |-- user_id: long (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- consumer_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- fraud_probability: double (nullable = true)
 |-- average_population: double (nullable = true)
 |-- avg_age_persons: double (nullable = true)
 |-- avg_mortgage_repay_monthly: double (nullable = true)
 |-- avg_tot_prsnl_inc_weekly: double (nullable = true)
 |-- avg_rent_weekly: double (nullable = true)
 |-- avg_tot_fam_inc_weekly: double (nullable = true)
 |-- avg_num_psns_per_bedroom: double (nullable = true)
 |-- avg_tot_hhd_inc_weekly: double (nullable = true)
 |-- avg_household_size: double (nullable = true)
 |-- avg_unemployment_rate: double (nullable = true)



In [6]:
merchant_full.show(5)

+--------------------+------------+--------------+-------+------------------+-------------------+-------------+---------+--------------------+-----------------+-------------------+------------------+
|            order_id|merchant_abn|order_datetime|user_id|      dollar_value|               name|revenue_level|take_rate|  processed_category|fraud_probability|transaction_revenue|      BNPL_revenue|
+--------------------+------------+--------------+-------+------------------+-------------------+-------------+---------+--------------------+-----------------+-------------------+------------------+
|0000043e-7a3f-410...| 98973094975|    2021-05-20|   3663|231.41970775401066|  ornare fusce inc.|            a|     5.98|game shop hobby t...|             NULL|  217.5808014617149|13.838898567829645|
|000063c2-e78b-448...| 15903176024|    2021-12-12|  15072|227.79073108694692| adipiscing elit pc|            c|     2.34|nursery lawn supp...|             NULL| 222.46043632145808| 5.330302911920205|


In [7]:
merchant_full.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- user_id: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- name: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: float (nullable = true)
 |-- processed_category: string (nullable = true)
 |-- fraud_probability: double (nullable = true)
 |-- transaction_revenue: double (nullable = true)
 |-- BNPL_revenue: double (nullable = true)



# Consumer data transformation

### Apply One-Hot Encoding on `state` column

In [8]:
states = consumer_full.select("state").distinct().rdd.flatMap(lambda x: x).collect()

for state in states:
    consumer_full = consumer_full.withColumn(f"state_{state}", when(col("state") == state, 1).otherwise(0))

consumer_full_expanded = consumer_full.drop("state")
consumer_full_expanded.show(5)

                                                                                

+--------+-------+--------------+-----------+------------------+------+------------+------------------+--------------------+-----------------+------------------+---------------+--------------------------+------------------------+---------------+----------------------+------------------------+----------------------+------------------+---------------------+--------+---------+--------+---------+--------+---------+---------+---------+
|postcode|user_id|order_datetime|consumer_id|              name|gender|merchant_abn|      dollar_value|            order_id|fraud_probability|average_population|avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|avg_unemployment_rate|state_NT|state_ACT|state_SA|state_TAS|state_WA|state_QLD|state_VIC|state_NSW|
+--------+-------+--------------+-----------+------------------+------+------------+------------------+--------------------+------

### Apply One-Hot Encoding on `gender` column

In [9]:
genders = consumer_full.select("gender").distinct().rdd.flatMap(lambda x: x).collect()

for gender in genders:
    consumer_full = consumer_full.withColumn(
        f"gender_{gender}", when(col("gender") == gender, 1).otherwise(0)
    )

consumer_full_expanded = consumer_full.drop("gender")
consumer_full_expanded.show(5)

+--------+-------+--------------+-----------+------------------+-----+------------+------------------+--------------------+-----------------+------------------+---------------+--------------------------+------------------------+---------------+----------------------+------------------------+----------------------+------------------+---------------------+--------+---------+--------+---------+--------+---------+---------+---------+------------------+-------------+-----------+
|postcode|user_id|order_datetime|consumer_id|              name|state|merchant_abn|      dollar_value|            order_id|fraud_probability|average_population|avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|avg_unemployment_rate|state_NT|state_ACT|state_SA|state_TAS|state_WA|state_QLD|state_VIC|state_NSW|gender_Undisclosed|gender_Female|gender_Male|
+--------+-------+--------------+---------

### Convert `order_datetime` column to timestamp data type

In [10]:
consumer_full_expanded = consumer_full_expanded.withColumn(
    "order_timestamp", unix_timestamp(col("order_datetime"))
)
consumer_full_expanded.show(5, truncate=False)

+--------+-------+--------------+-----------+------------------+-----+------------+------------------+------------------------------------+-----------------+------------------+---------------+--------------------------+------------------------+---------------+----------------------+------------------------+----------------------+------------------+---------------------+--------+---------+--------+---------+--------+---------+---------+---------+------------------+-------------+-----------+---------------+
|postcode|user_id|order_datetime|consumer_id|name              |state|merchant_abn|dollar_value      |order_id                            |fraud_probability|average_population|avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|avg_unemployment_rate|state_NT|state_ACT|state_SA|state_TAS|state_WA|state_QLD|state_VIC|state_NSW|gender_Undisclosed|gender_Female|gender_

### Apply Feature Hashing on `postcode` column

In [11]:
# index the postcode column to convert it into a numeric type
indexer = StringIndexer(inputCol="postcode", outputCol="postcode_index")
consumer_full_expanded_indexed = indexer.fit(consumer_full_expanded).transform(consumer_full_expanded)

# convert the indexed column to an array of strings
def to_array(value):
    return [str(value)]

to_array_udf = udf(to_array, ArrayType(StringType()))

consumer_full_expanded_indexed = consumer_full_expanded_indexed.withColumn("postcode_array", to_array_udf(col("postcode_index")))

# apply HashingTF to transform the array of strings
hashingTF = HashingTF(inputCol="postcode_array", outputCol="hashed_postcode", numFeatures=6000)
consumer_full_expanded_hashed = hashingTF.transform(consumer_full_expanded_indexed)

consumer_full_expanded = consumer_full_expanded_hashed.drop('postcode_index', 'postcode_array')
consumer_full_expanded.show(5, truncate=False)

                                                                                

+--------+-------+--------------+-----------+------------------+-----+------------+------------------+------------------------------------+-----------------+------------------+---------------+--------------------------+------------------------+---------------+----------------------+------------------------+----------------------+------------------+---------------------+--------+---------+--------+---------+--------+---------+---------+---------+------------------+-------------+-----------+---------------+-------------------+
|postcode|user_id|order_datetime|consumer_id|name              |state|merchant_abn|dollar_value      |order_id                            |fraud_probability|average_population|avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|avg_unemployment_rate|state_NT|state_ACT|state_SA|state_TAS|state_WA|state_QLD|state_VIC|state_NSW|gender_Undisclosed|g

### Check the shape of transformed dataset

In [12]:
num_rows = consumer_full_expanded.count()
print(f"Number of rows: {num_rows}")

num_columns = len(consumer_full_expanded.columns)
print(f"Number of columns: {num_columns}")

Number of rows: 11372745
Number of columns: 33


Check nulls:

In [13]:
# create a dictionary with column names and their respective null counts
null_count_dict = {col_name: sum(col(col_name).isNull().cast("int")).alias(col_name) for col_name in consumer_full_expanded.columns}

# use agg() to calculate null counts for each column
null_counts_df = consumer_full_expanded.agg(*null_count_dict.values())
null_counts_df.show()



+--------+-------+--------------+-----------+----+-----+------------+------------+--------+-----------------+------------------+---------------+--------------------------+------------------------+---------------+----------------------+------------------------+----------------------+------------------+---------------------+--------+---------+--------+---------+--------+---------+---------+---------+------------------+-------------+-----------+---------------+---------------+
|postcode|user_id|order_datetime|consumer_id|name|state|merchant_abn|dollar_value|order_id|fraud_probability|average_population|avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|avg_unemployment_rate|state_NT|state_ACT|state_SA|state_TAS|state_WA|state_QLD|state_VIC|state_NSW|gender_Undisclosed|gender_Female|gender_Male|order_timestamp|hashed_postcode|
+--------+-------+--------------+---------

                                                                                

In [14]:
# save as a parquet file
consumer_full_expanded.write.parquet('../data/curated/consumer_full_expanded', mode='overwrite')

                                                                                

# Merchant data transformation

Select helpful columons:

In [15]:
merchant_full = merchant_full.drop('user_id', 'name')
merchant_full.show(5)

+--------------------+------------+--------------+------------------+-------------+---------+--------------------+-----------------+-------------------+------------------+
|            order_id|merchant_abn|order_datetime|      dollar_value|revenue_level|take_rate|  processed_category|fraud_probability|transaction_revenue|      BNPL_revenue|
+--------------------+------------+--------------+------------------+-------------+---------+--------------------+-----------------+-------------------+------------------+
|0000043e-7a3f-410...| 98973094975|    2021-05-20|231.41970775401066|            a|     5.98|game shop hobby t...|             NULL|  217.5808014617149|13.838898567829645|
|000063c2-e78b-448...| 15903176024|    2021-12-12|227.79073108694692|            c|     2.34|nursery lawn supp...|             NULL| 222.46043632145808| 5.330302911920205|
|00006b44-0557-4b7...| 74764807084|    2022-09-15| 173.2625432035097|            a|     5.53|           shop shoe|             NULL|  163.68

### Apply One-Hot Encoding on `revenue_level` column

In [16]:
revenue_levels = merchant_full.select("revenue_level").distinct().rdd.flatMap(lambda x: x).collect()

for level in revenue_levels:
    merchant_full = merchant_full.withColumn(f"revenue_level_{level}", when(col("revenue_level") == level, 1).otherwise(0))

merchant_full_expanded = merchant_full.drop("revenue_level")
merchant_full_expanded.show(5)

                                                                                

+--------------------+------------+--------------+------------------+---------+--------------------+-----------------+-------------------+------------------+---------------+---------------+---------------+---------------+---------------+
|            order_id|merchant_abn|order_datetime|      dollar_value|take_rate|  processed_category|fraud_probability|transaction_revenue|      BNPL_revenue|revenue_level_e|revenue_level_d|revenue_level_c|revenue_level_b|revenue_level_a|
+--------------------+------------+--------------+------------------+---------+--------------------+-----------------+-------------------+------------------+---------------+---------------+---------------+---------------+---------------+
|0000043e-7a3f-410...| 98973094975|    2021-05-20|231.41970775401066|     5.98|game shop hobby t...|             NULL|  217.5808014617149|13.838898567829645|              0|              0|              0|              0|              1|
|000063c2-e78b-448...| 15903176024|    2021-12-1

### Apply One-Hot Encoding on `category` column

In [17]:
# collect distinct words and create a dictionary with indices
distinct_words = merchant_full_expanded.selectExpr("explode(split(processed_category, ' ')) as word")\
                                       .distinct()\
                                       .rdd.flatMap(lambda x: x)\
                                       .collect()
word_index = {word: idx for idx, word in enumerate(distinct_words)}

# create a dictionary mapping indices to words
index_word = {idx: word for word, idx in word_index.items()}

# convert the `word_index` dictionary to a broadcast variable
word_index_broadcast = spark.sparkContext.broadcast(word_index)

                                                                                

Define a funciton to convert a list of words into a binary vector based on a predefined word index dictionary:

In [18]:
def words_to_binary_vector(words):
    # retrieve the broadcasted `word_index` dictionary which maps words 
    # to their respective indices
    word_index = word_index_broadcast.value
    
    # initialize a binary vector with zeros, with length equal 
    # to the size of the `word_index` dictionary
    vector = [0] * len(word_index)
    
    # iterate over each word in the input list
    for word in words:
        # if the word is in the `word_index` dictionary, set the corresponding 
        # index in the vector to 1
        if word in word_index:
            vector[word_index[word]] = 1
    
    return vector

words_to_binary_vector_udf = udf(words_to_binary_vector, ArrayType(IntegerType()))

In [19]:
# split `processed_category` into words
merchant_full_expanded = merchant_full_expanded.withColumn("category_words", 
                                                           split(col("processed_category"), " "))

# apply the UDF to get the binary vector
merchant_full_expanded = merchant_full_expanded.withColumn("category_binary", 
                                                           words_to_binary_vector_udf(col("category_words")))

# drop the original columns
merchant_full_expanded = merchant_full_expanded.drop("processed_category", "category_words")

# show the result
merchant_full_expanded.show(5, truncate=False)

+------------------------------------+------------+--------------+------------------+---------+-----------------+-------------------+------------------+---------------+---------------+---------------+---------------+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|order_id                            |merchant_abn|order_datetime|dollar_value      |take_rate|fraud_probability|transaction_revenue|BNPL_revenue      |revenue_level_e|revenue_level_d|revenue_level_c|revenue_level_b|revenue_level_a|category_binary                                                                                                                                                                                                                                                         

In [20]:
sample_vector = merchant_full_expanded.select("category_binary").first()[0]
count_sub_category = len(sample_vector)
print(f'Number of vectors in category_binary:', count_sub_category)

Number of vectors in category_binary: 90


In [21]:
# split `category_binary` column into separate columns
for idx, word in index_word.items():
    merchant_full_expanded = merchant_full_expanded.withColumn(
        f"category_{word}", 
        col("category_binary")[idx]
    )
merchant_full_expanded = merchant_full_expanded.drop("category_binary")
merchant_full_expanded.show(5, truncate=False)

+------------------------------------+------------+--------------+------------------+---------+-----------------+-------------------+------------------+---------------+---------------+---------------+---------------+---------------+----------------+------------+-------------------+--------------+--------------+---------------+--------------+---------------+---------------+-----------------+---------------+----------------+----------------+-------------------+------------+----------------+------------------+-------------+-------------+-------------+---------------+---------------+-------------+--------------+--------------------+--------------+-------------------+-----------------+--------------+-----------------+--------------+-------------------+--------------+--------------+---------------+---------------+-------------+--------------+-------------------+--------------+--------------+----------------+-------------+-------------+---------------+-----------+---------------+-------------

### Convert `order_datetime` column to timestamp data type

In [22]:
merchant_full_expanded = merchant_full_expanded.withColumn(
    "order_timestamp", unix_timestamp(col("order_datetime"))
)
merchant_full_expanded.show(5, truncate=False)

+------------------------------------+------------+--------------+------------------+---------+-----------------+-------------------+------------------+---------------+---------------+---------------+---------------+---------------+----------------+------------+-------------------+--------------+--------------+---------------+--------------+---------------+---------------+-----------------+---------------+----------------+----------------+-------------------+------------+----------------+------------------+-------------+-------------+-------------+---------------+---------------+-------------+--------------+--------------------+--------------+-------------------+-----------------+--------------+-----------------+--------------+-------------------+--------------+--------------+---------------+---------------+-------------+--------------+-------------------+--------------+--------------+----------------+-------------+-------------+---------------+-----------+---------------+-------------

### Check the shape of transformed dataset

Drop unused columns:

In [23]:
cols_to_remove = [
    'category_including', 'category_other', 'category_and', 'category_except',
    'category_al', 'category_shop'
]
merchant_full_expanded = merchant_full_expanded.drop(*cols_to_remove)

In [24]:
num_rows = merchant_full_expanded.count()
print(f"Number of rows: {num_rows}")

num_columns = len(merchant_full_expanded.columns)
print(f"Number of columns: {num_columns}")

Number of rows: 11372745
Number of columns: 98


In [25]:
# save as a parquet file
merchant_full_expanded.write.parquet('../data/curated/merchant_full_expanded', mode='overwrite')

                                                                                