# Feature Engineering

---

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import* 
import pandas as pd
import geopandas as gpd

In [2]:
# Settings registered on the session builder, in this order.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,  # eager DataFrame rendering in the notebook REPL
    "spark.driver.memory": "4G",
    "spark.executor.memory": "4G",
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
}

session_builder = SparkSession.builder.appName("Feature_Engineering")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Returns the active session if one already exists, otherwise starts a new one.
spark = session_builder.getOrCreate()

24/09/08 18:03:33 WARN Utils: Your hostname, Cocos-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.16.33.67 instead (on interface en0)
24/09/08 18:03:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/08 18:03:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/08 18:03:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/09/08 18:03:34 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/09/08 18:03:34 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


# Read datasets

In [17]:
# Spark tables read from parquet directories in the curated data folder.
parquet_reader = spark.read
consumer_detail = parquet_reader.parquet('../data/curated/consumer_detail')
transaction = parquet_reader.parquet('../data/curated/transaction')
merchant = parquet_reader.parquet('../data/curated/merchant')

# The postcode / SA2 shapefile is loaded with geopandas, so this one is a
# GeoDataFrame rather than a Spark DataFrame.
postcode_sa2_geo = gpd.read_file('../data/curated/postcode_sa2_geo.shp')

### Convert data types

In [18]:
# Each Spark frame is announced by its header string and then described with
# printSchema().
spark_frames = (
    ('consumer_detail:', consumer_detail),
    ('\ntransaction:', transaction),
    ('\nmerchant:', merchant),
)
for header, frame in spark_frames:
    print(header)
    frame.printSchema()

# postcode_sa2_geo is not a Spark frame, so its column types come from .dtypes;
# keeping it as the last expression lets the notebook display the result.
print('\npostcode_sa2_geo:')
postcode_sa2_geo.dtypes

consumer_detail:
root
 |-- consumer_id: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- gender: string (nullable = true)


transaction:
root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)


merchant:
root
 |-- merchant_abn: long (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- fraud_probability: double (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: float (nullable = true)


postcode_sa2_geo:


SA2_name        object
SA2_MAINCO       int64
postcode         int64
state           object
geometry      geometry
dtype: object

In [21]:
# Cast the postcode column of consumer_detail from string to integer, matching
# the integer postcode column of postcode_sa2_geo.
consumer_detail = consumer_detail.withColumn(
    'postcode',
    consumer_detail['postcode'].cast('int'),
)
consumer_detail.printSchema()

root
 |-- consumer_id: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- gender: string (nullable = true)



### Display dataset

In [22]:
# Preview the first five rows of consumer_detail.
consumer_detail.show(n=5)

+-----------+-------+-----------------+-----+--------+------+
|consumer_id|user_id|             name|state|postcode|gender|
+-----------+-------+-----------------+-----+--------+------+
|     870353| 213579|    Charles Davis|   SA|    5261|  Male|
|     923963| 213580|Jacqueline Nelson|  QLD|    4744|Female|
|      93016| 213581|    Carolyn Smith|  QLD|    4454|Female|
|      61324| 213582|      Denise Rush|   WA|    6705|Female|
|     823311| 213583|  Nathan Williams|  NSW|    2145|  Male|
+-----------+-------+-----------------+-----+--------+------+
only showing top 5 rows



In [23]:
# Number of rows in consumer_detail (count() is an action, so this runs a Spark job).
consumer_detail.count()

499999

In [24]:
# NOTE(review): repeats the five-row preview of consumer_detail displayed two
# cells earlier; nothing visible modifies the frame in between, so this cell
# looks redundant.
consumer_detail.show(5)

+-----------+-------+-----------------+-----+--------+------+
|consumer_id|user_id|             name|state|postcode|gender|
+-----------+-------+-----------------+-----+--------+------+
|     870353| 213579|    Charles Davis|   SA|    5261|  Male|
|     923963| 213580|Jacqueline Nelson|  QLD|    4744|Female|
|      93016| 213581|    Carolyn Smith|  QLD|    4454|Female|
|      61324| 213582|      Denise Rush|   WA|    6705|Female|
|     823311| 213583|  Nathan Williams|  NSW|    2145|  Male|
+-----------+-------+-----------------+-----+--------+------+
only showing top 5 rows



In [25]:
# Preview the first five rows of transaction.
transaction.show(n=5)

+-------+------------+------------------+--------------------+--------------+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|
+-------+------------+------------------+--------------------+--------------+
|  14935| 79417999332|136.06570809815838|23acbb7b-cf98-458...|    2021-11-26|
|      1| 46451548968| 72.61581642788431|76bab304-fa2d-400...|    2021-11-26|
|  14936| 89518629617|3.0783487174439297|a2ae446a-2959-41c...|    2021-11-26|
|      1| 49167531725| 51.58228625503599|7080c274-17f7-4cc...|    2021-11-26|
|  14936| 31101120643|25.228114942417797|8e301c0f-06ab-45c...|    2021-11-26|
+-------+------------+------------------+--------------------+--------------+
only showing top 5 rows



In [26]:
# Number of rows in transaction (count() is an action, so this runs a Spark job).
transaction.count()

13614675

# Compute the number of orders and total `dollar_value` per merchant

In [27]:
# Per-merchant aggregates: the number of orders and the summed dollar_value.
# `count` and `sum` are the Spark column functions brought in by the
# pyspark.sql.functions star import, not Python's built-ins.
order_count = count("order_id").alias("transaction_count")  # counts non-null order_id values
dollar_total = sum("dollar_value").alias("total_dollar_value")

transaction_detail = transaction.groupBy("merchant_abn").agg(order_count, dollar_total)

transaction_detail.show(5)



+------------+-----------------+------------------+
|merchant_abn|transaction_count|total_dollar_value|
+------------+-----------------+------------------+
| 83412691377|            14288| 498536.5816973135|
| 38700038932|             7132| 9546185.360697312|
| 35344855546|             1522|134737.25046268434|
| 15613631617|             1785| 543030.5313328261|
| 19839532017|              726|          113982.0|
+------------+-----------------+------------------+
only showing top 5 rows



                                                                                

In [28]:
# Persist the per-merchant aggregates as parquet, replacing any existing output
# at this path.
transaction_detail.write.mode('overwrite').parquet('../data/curated/transactions_count_dollar')

                                                                                

# Merge consumer and transaction by `user_id`

In [29]:
# Attach the consumer attributes to every transaction. The inner join on
# user_id keeps only transactions whose user_id also appears in
# consumer_detail. (The join used to be written twice in a row with identical
# arguments; the first result was discarded, so a single statement is enough.)
consumer_transaction = consumer_detail.join(
    transaction,
    on=['user_id'],
    how='inner'
)

consumer_transaction.show(5)

+-------+-----------+----------------+-----+--------+------+------------+------------------+--------------------+--------------+
|user_id|consumer_id|            name|state|postcode|gender|merchant_abn|      dollar_value|            order_id|order_datetime|
+-------+-----------+----------------+-----+--------+------+------------+------------------+--------------------+--------------+
|  14935|    1059280|   Cameron Adams|  QLD|    4563|  Male| 79417999332|136.06570809815838|23acbb7b-cf98-458...|    2021-11-26|
|      1|    1195503|Yolanda Williams|   WA|    6935|Female| 46451548968| 72.61581642788431|76bab304-fa2d-400...|    2021-11-26|
|  14936|     986886|     Maria Riley|   SA|    5157|Female| 89518629617|3.0783487174439297|a2ae446a-2959-41c...|    2021-11-26|
|      1|    1195503|Yolanda Williams|   WA|    6935|Female| 49167531725| 51.58228625503599|7080c274-17f7-4cc...|    2021-11-26|
|  14936|     986886|     Maria Riley|   SA|    5157|Female| 31101120643|25.228114942417797|8e301

In [30]:
# Number of rows after the join (count() is an action, so this runs a Spark job).
consumer_transaction.count()

13614675

In [31]:
# Persist the joined consumer/transaction table as parquet, replacing any
# existing output at this path.
consumer_transaction.write.mode('overwrite').parquet('../data/curated/consumer_transaction')

                                                                                

# Compute the total consumer transaction value for each SA2 region, with its geometry

In [32]:
# Print the column names and types of the joined table.
consumer_transaction.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- consumer_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)



In [36]:
# Collect only the two columns needed for the spatial totals to the driver as a
# pandas frame. Every transaction row is collected, so this is memory-heavy.
selected_consumer_transaction = (
    consumer_transaction
    .select('postcode', 'dollar_value')
    .toPandas()
)

# Attach SA2 name, code, state and polygon to each transaction via its postcode.
# NOTE(review): postcode_sa2_geo can hold several SA2 rows for one postcode, so
# a transaction is repeated once per matching SA2. Any later sum per SA2 then
# credits the full dollar_value to every SA2 that shares the postcode.
consumer_dollar_geo = pd.merge(
    selected_consumer_transaction,
    postcode_sa2_geo,
    how='inner',
    on='postcode',
)

                                                                                

In [37]:
# Preview the first five merged rows.
consumer_dollar_geo.head(n=5)

Unnamed: 0,postcode,dollar_value,SA2_name,SA2_MAINCO,state,geometry
0,4563,136.065708,Noosa Hinterland,316081549,QLD,"POLYGON ((152.79369 -26.34524, 152.79402 -26.3..."
1,4563,136.065708,Gympie Surrounds,319031514,QLD,"POLYGON ((152.51447 -26.05321, 152.5146 -26.05..."
2,4563,173.407114,Noosa Hinterland,316081549,QLD,"POLYGON ((152.79369 -26.34524, 152.79402 -26.3..."
3,4563,173.407114,Gympie Surrounds,319031514,QLD,"POLYGON ((152.51447 -26.05321, 152.5146 -26.05..."
4,4563,198.466744,Noosa Hinterland,316081549,QLD,"POLYGON ((152.79369 -26.34524, 152.79402 -26.3..."


In [38]:
# (rows, columns) of the merged transaction/SA2 frame.
consumer_dollar_geo.shape

(21008714, 6)

In [40]:
# One row per SA2_name: the summed dollar_value plus the first geometry seen
# for that name.
summed_consumer_dollar_geo = (
    consumer_dollar_geo
    .groupby('SA2_name', as_index=False)
    .agg(
        dollar_value=('dollar_value', 'sum'),
        geometry=('geometry', 'first'),
    )
)
summed_consumer_dollar_geo.head()

Unnamed: 0,SA2_name,dollar_value,geometry
0,ACT - South West,2002136.0,"POLYGON ((148.88381 -35.26411, 148.94988 -35.2..."
1,APY Lands,1400538.0,"POLYGON ((129.00186 -26.72252, 129.00186 -26.7..."
2,Abbotsford,527862.2,"POLYGON ((144.99255 -37.80249, 144.99266 -37.8..."
3,Aberfoyle Park,624726.6,"POLYGON ((138.58963 -35.06584, 138.58993 -35.0..."
4,Acacia Gardens,736920.1,"POLYGON ((150.91593 -33.72971, 150.91661 -33.7..."


In [41]:
# (rows, columns) of the per-SA2 totals; one row per SA2_name.
summed_consumer_dollar_geo.shape

(2311, 3)

In [42]:
# Write the per-SA2 totals, geometry column included, to CSV without the row
# index.
summed_consumer_dollar_geo.to_csv(
    path_or_buf='../data/curated/summed_consumer_dollar_geo.csv',
    index=False,
)

In [43]:
# Keep every column except the polygons (presumably because Spark cannot infer
# a column type for the geometry objects; confirm).
consumer_dollar_bySA2_df = summed_consumer_dollar_geo.drop(labels='geometry', axis='columns')

# Turn the pandas frame into a Spark DataFrame.
consumer_dollar_bySA2 = spark.createDataFrame(consumer_dollar_bySA2_df)

# Persist the Spark DataFrame as parquet, replacing any existing output at this
# path.
consumer_dollar_bySA2.write.mode('overwrite').parquet('../data/curated/consumer_dollar_bySA2')

                                                                                