# Transaction Analysis

---

In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import* 
from pyspark.sql.functions import when, col, count

In [2]:
spark = (
    SparkSession.builder.appName("Imputation")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.sql.debug.maxToStringFields", 3000)
    .config("spark.network.timeout", "300s")
    .config("spark.driver.maxResultSize", "4g")
    .config("spark.rpc.askTimeout", "300s")
    .config("spark.driver.memory", "8G")
    .config("spark.executor.memory", "8G")
    .getOrCreate()
)

24/09/18 19:43:25 WARN Utils: Your hostname, Cocos-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.16.33.67 instead (on interface en0)
24/09/18 19:43:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/18 19:43:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Read datasets

In [3]:
final_imputed_consumer = spark.read.parquet('../data/curated/final_imputed_consumer')
final_imputed_merchant = spark.read.parquet('../data/curated/final_imputed_merchant')

                                                                                

# Denoting fraud for consumer dataset

In [4]:
# create a new column `consumer_is_fraud` based on the fraud_probability threshold of 50
final_imputed_consumer = final_imputed_consumer.withColumn(
    'consumer_is_fraud', 
    when(col('fraud_probability') >= 50, 1).otherwise(0)
)

final_imputed_consumer = final_imputed_consumer.withColumnRenamed('fraud_probability', 
                                                                  'consumer_fraud_probability')
final_imputed_consumer.select('consumer_fraud_probability', 
                              'consumer_is_fraud').show()

                                                                                

+--------------------------+-----------------+
|consumer_fraud_probability|consumer_is_fraud|
+--------------------------+-----------------+
|        11.998913422103769|                0|
|        11.897039565334778|                0|
|        15.238617000084226|                0|
|        11.897039565334778|                0|
|        16.511154537793757|                0|
|        11.897039565334778|                0|
|        16.511154537793757|                0|
|        16.511154537793757|                0|
|        10.825871482844985|                0|
|        10.825871482844985|                0|
|        12.135274548815593|                0|
|         8.848401761937838|                0|
|        12.135274548815593|                0|
|         8.848401761937838|                0|
|        12.604746749130138|                0|
|         17.26103891349819|                0|
|        12.604746749130138|                0|
|        14.647115130997538|                0|
|        12.6

In [6]:
# count the total number of records
total_count = final_imputed_consumer.count()

# calculate the percentage of each category
fraud_group = final_imputed_consumer.groupBy('consumer_is_fraud').count()
fraud_group = fraud_group.withColumn("percentage", (col("count") / total_count) * 100)
fraud_group.show()

[Stage 6:>                                                         (0 + 8) / 11]

+-----------------+--------+------------------+
|consumer_is_fraud|   count|        percentage|
+-----------------+--------+------------------+
|                1|  188942|1.6613579219440864|
|                0|11183803| 98.33864207805591|
+-----------------+--------+------------------+



                                                                                

In [7]:
# save as a parquet file
final_imputed_consumer.write.parquet('../data/curated/full_consumer_with_is_fraud', mode='overwrite')
final_imputed_consumer.show(5)

                                                                                

+-------+--------+--------------+-----------+------------------+-----+------------+------------------+--------------------+--------------------------+------------------+------------------+-----------------------------+---------------------------+------------------+-------------------------+----------------------------+-------------------------+----------------------+-----------------+--------+---------+--------+---------+--------+---------+---------+---------+------------------+-------------+-----------+---------------+-------------------+-----------------+
|user_id|postcode|order_datetime|consumer_id|              name|state|merchant_abn|      dollar_value|            order_id|consumer_fraud_probability|average_population|Median_age_persons|Median_mortgage_repay_monthly|Median_tot_prsnl_inc_weekly|Median_rent_weekly|Median_tot_fam_inc_weekly|Average_num_psns_per_bedroom|Median_tot_hhd_inc_weekly|Average_household_size|unemployment_rate|state_NT|state_ACT|state_SA|state_TAS|state_WA|st

# Denoting fraud for merchant dataset

In [5]:
# create a new column 'merchant_is_fraud' based on the fraud_probability threshold of 50
final_imputed_merchant = final_imputed_merchant.withColumn(
    'merchant_is_fraud', 
    when(col('fraud_probability') >= 50, 1).otherwise(0)
)

final_imputed_merchant = final_imputed_merchant.withColumnRenamed('fraud_probability', 
                                                                  'merchant_fraud_probability')
final_imputed_merchant.select('merchant_abn', 'merchant_fraud_probability', 
                              'merchant_is_fraud').show()

+------------+--------------------------+-----------------+
|merchant_abn|merchant_fraud_probability|merchant_is_fraud|
+------------+--------------------------+-----------------+
| 98861937890|         29.01220845569412|                0|
| 23338656015|        39.304943432560265|                0|
| 80324045558|          67.3619986198918|                1|
| 76626119831|         78.18254756857768|                1|
| 46607130088|        30.098683575840507|                0|
| 24015173965|         28.50798360263134|                0|
| 78798828265|         28.50563911156905|                0|
| 24852446429|         61.91904820195791|                1|
| 64203420245|         53.17657240618603|                1|
| 32560075533|         37.49203556354546|                0|
| 90543168331|          66.6916726014333|                1|
| 91338194166|         30.71465274195495|                0|
| 22033359776|          32.3590887982333|                0|
| 70172340121|         70.41867759241458

In [23]:
# count the total number of records
total_count = final_imputed_merchant.count()

# calculate the percentage of each category
fraud_group = final_imputed_merchant.groupBy('merchant_is_fraud').count()
fraud_group = fraud_group.withColumn("percentage", (col("count") / total_count) * 100)
fraud_group.show()

+-----------------+-------+------------------+
|merchant_is_fraud|  count|        percentage|
+-----------------+-------+------------------+
|                1|3717472|32.687552565365706|
|                0|7655273|  67.3124474346343|
+-----------------+-------+------------------+



In [10]:
# save as a parquet file
final_imputed_merchant.write.parquet('../data/curated/full_merchant_with_is_fraud', mode='overwrite')
final_imputed_merchant.show(5)

                                                                                

+------------+--------------------+--------------+------------------+---------+--------------------------+-------------------+------------------+---------------+---------------+---------------+---------------+---------------+----------------+------------+-------------------+--------------+--------------+---------------+--------------+---------------+---------------+-----------------+---------------+----------------+----------------+-------------------+------------+----------------+------------------+-------------+-------------+-------------+---------------+---------------+-------------+--------------+--------------------+--------------+-------------------+-----------------+--------------+-----------------+--------------+-------------------+--------------+--------------+---------------+---------------+-------------+--------------+-------------------+--------------+--------------+----------------+-------------+-------------+---------------+-----------+---------------+---------------+----

# Merge the full consumer and merchant dataset

In [6]:
# use `merchant_abn` and `order_id` as join keys
full_transaction = final_imputed_merchant.alias('merchant') \
                                         .join(final_imputed_consumer.alias('consumer'), 
                                               ['merchant_abn', 'order_id'], 'inner')

# keep the unique columns in each table and remove the duplicate columns in `final_imputed_consumer`
columns_to_drop = [col for col in final_imputed_consumer.columns if col in final_imputed_merchant.columns and col != 'merchant_abn' and col != 'order_id']
full_transaction = full_transaction.drop(*columns_to_drop)
full_transaction.show(5)

[Stage 8:>                                                          (0 + 1) / 1]

+------------+--------------------+---------+--------------------------+-------------------+-------------------+---------------+---------------+---------------+---------------+---------------+----------------+------------+-------------------+--------------+--------------+---------------+--------------+---------------+---------------+-----------------+---------------+----------------+----------------+-------------------+------------+----------------+------------------+-------------+-------------+-------------+---------------+---------------+-------------+--------------+--------------------+--------------+-------------------+-----------------+--------------+-----------------+--------------+-------------------+--------------+--------------+---------------+---------------+-------------+--------------+-------------------+--------------+--------------+----------------+-------------+-------------+---------------+-----------+---------------+---------------+----------------+----------------+---

                                                                                

In [15]:
columns_to_drop

['order_datetime', 'dollar_value', 'order_timestamp']

In [16]:
full_transaction.printSchema()

root
 |-- merchant_abn: long (nullable = true)
 |-- order_id: string (nullable = true)
 |-- take_rate: float (nullable = true)
 |-- merchant_fraud_probability: double (nullable = true)
 |-- transaction_revenue: double (nullable = true)
 |-- BNLP_revenue: double (nullable = true)
 |-- revenue_level_e: integer (nullable = true)
 |-- revenue_level_d: integer (nullable = true)
 |-- revenue_level_c: integer (nullable = true)
 |-- revenue_level_b: integer (nullable = true)
 |-- revenue_level_a: integer (nullable = true)
 |-- category_jewelry: integer (nullable = true)
 |-- category_art: integer (nullable = true)
 |-- category_television: integer (nullable = true)
 |-- category_watch: integer (nullable = true)
 |-- category_cable: integer (nullable = true)
 |-- category_repair: integer (nullable = true)
 |-- category_stock: integer (nullable = true)
 |-- category_flower: integer (nullable = true)
 |-- category_office: integer (nullable = true)
 |-- category_souvenir: integer (nullable = true)

Check the shape of merged dataset:

In [7]:
num_rows = full_transaction.count()
print(f"Number of rows: {num_rows}")

num_columns = len(full_transaction.columns)
print(f"Number of columns: {num_columns}")



Number of rows: 11372745
Number of columns: 131


                                                                                

# Add `transaction_is_fraud` column to indicate whether this transaction is fraud

Rule: if either `consumer_is_fraud` or `merchant_is_fraud` is 1, then this transaction is determined as fraud (denoted as 1).

In [9]:
# add new column `transaction_is_fraud`
full_transaction = full_transaction.withColumn(
    'transaction_is_fraud',
    when((col('consumer_is_fraud') == 1) | (col('merchant_is_fraud') == 1), 1).otherwise(0)
)

full_transaction.select('consumer_is_fraud', 'merchant_is_fraud', 'transaction_is_fraud').show()

[Stage 31:>                                                         (0 + 1) / 1]

+-----------------+-----------------+--------------------+
|consumer_is_fraud|merchant_is_fraud|transaction_is_fraud|
+-----------------+-----------------+--------------------+
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   

                                                                                

In [18]:
full_transaction.printSchema()

root
 |-- merchant_abn: long (nullable = true)
 |-- order_id: string (nullable = true)
 |-- take_rate: float (nullable = true)
 |-- merchant_fraud_probability: double (nullable = true)
 |-- transaction_revenue: double (nullable = true)
 |-- BNLP_revenue: double (nullable = true)
 |-- revenue_level_e: integer (nullable = true)
 |-- revenue_level_d: integer (nullable = true)
 |-- revenue_level_c: integer (nullable = true)
 |-- revenue_level_b: integer (nullable = true)
 |-- revenue_level_a: integer (nullable = true)
 |-- category_jewelry: integer (nullable = true)
 |-- category_art: integer (nullable = true)
 |-- category_television: integer (nullable = true)
 |-- category_watch: integer (nullable = true)
 |-- category_cable: integer (nullable = true)
 |-- category_repair: integer (nullable = true)
 |-- category_stock: integer (nullable = true)
 |-- category_flower: integer (nullable = true)
 |-- category_office: integer (nullable = true)
 |-- category_souvenir: integer (nullable = true)

Check the shape of the dataset:

In [10]:
num_rows = full_transaction.count()
print(f"Number of rows: {num_rows}")

num_columns = len(full_transaction.columns)
print(f"Number of columns: {num_columns}")



Number of rows: 11372745
Number of columns: 132


                                                                                

Confirm there is no nulls:

In [11]:
# create a dictionary with column names and their respective null counts
null_count_dict = {col_name: sum(col(col_name).isNull().cast("int")).alias(col_name) for col_name in full_transaction.columns}

# use agg() to calculate null counts for each column
null_counts_df = full_transaction.agg(*null_count_dict.values())
null_counts_df.show()



+------------+--------+---------+--------------------------+-------------------+------------+---------------+---------------+---------------+---------------+---------------+----------------+------------+-------------------+--------------+--------------+---------------+--------------+---------------+---------------+-----------------+---------------+----------------+----------------+-------------------+------------+----------------+------------------+-------------+-------------+-------------+---------------+---------------+-------------+--------------+--------------------+--------------+-------------------+-----------------+--------------+-----------------+--------------+-------------------+--------------+--------------+---------------+---------------+-------------+--------------+-------------------+--------------+--------------+----------------+-------------+-------------+---------------+-----------+---------------+---------------+----------------+----------------+---------------+------

                                                                                

In [28]:
# save as a parquet file
full_transaction.write.parquet('../data/curated/full_transaction', mode='overwrite')

                                                                                

In [12]:
# calculate the percentage of each category
is_fraud_group = full_transaction.groupBy('transaction_is_fraud').count()
is_fraud_group = is_fraud_group.withColumn("percentage", (col("count") / num_rows) * 100)
is_fraud_group.show()



+--------------------+-------+-----------------+
|transaction_is_fraud|  count|       percentage|
+--------------------+-------+-----------------+
|                   1|3843973|33.79986977638204|
|                   0|7528772|66.20013022361796|
+--------------------+-------+-----------------+



                                                                                