# Feature Engineering

### 1. Unregistered customers

Start spark session

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import os
from functools import reduce

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import countDistinct, col
import pyspark.sql.functions as F
from collections import defaultdict

In [2]:
sp = SparkSession.builder.appName("Fraud detection").getOrCreate()

22/10/04 16:10:36 WARN Utils: Your hostname, Harshitas-MacBook-Air-8.local resolves to a loopback address: 127.0.0.1; using 10.13.133.97 instead (on interface en0)
22/10/04 16:10:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/04 16:10:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Read relevant datasets

In [3]:
def merge_folder(trans_group: str):
    """
    Function to merge everything within yellow or green or fhvhv
    """
    dir = "../data/tables/" + trans_group +"/"
    folder_locs = os.listdir(dir)

    group_list = []
    for folder in folder_locs:
        path = dir + "/" + folder
        if os.path.isdir(path):
            # print("At current path", path)
            group_list.append(sp.read.parquet(path))

    return reduce(DataFrame.unionAll, group_list)

In [4]:
dir = "../data/tables/"
groups = ["transactions_20210228_20210827_snapshot/", "transactions_20210828_20220227_snapshot/", "transactions_20220228_20220828_snapshot/"]

final_list = []
for g in groups:
    print("Started group: ", g)
    final_list.append(sp.read.parquet(dir + g))

transactions = reduce(DataFrame.unionAll, final_list)

Started group:  transactions_20210228_20210827_snapshot/


                                                                                

Started group:  transactions_20210828_20220227_snapshot/


                                                                                

Started group:  transactions_20220228_20220828_snapshot/


                                                                                

In [5]:
merch = sp.read.parquet("../data/tables/tbl_merchants.parquet")
cust  = sp.read.option("header", True).option("delimiter", "|") \
        .csv("../data/tables/tbl_consumer.csv")

Define functions

In [34]:
def unregistered_customers(merchants, customers, transactions):
    '''
    Args:
        merchants (pyspark.sql.DataFrame)    : Df with details about all the  merchants, including their 'merchant_abn'

        customers (pyspark.sql.DataFrame)    : Df with details about all the customers, including their 'consumer_id'

        transactions (pyspark.sql.DataFrame) : Df with details about all the transactions made between merchants and customers

    Returns:
        A pyspark.sql.DataFrame with all the transactions that have a registered Merchant ABN but an unknown user/customer ID.
    '''
    
    # list of registered merchant ABNs
    abn_list = merchants.rdd.map(lambda x: x.merchant_abn).collect()

    # transactions with registered merchant ABNs
    reg_merchant_trans = transactions[transactions.merchant_abn.isin(abn_list)]

    # total transactions with unidentified customers
    unknown_cust = (transactions.select('user_id').distinct()) \
                    .subtract(cust.select(col('consumer_id')))
    unknown_cust_list = unknown_cust.rdd.map(lambda x: x.user_id).collect()

    # transactions with registered merchant ABNs but unknown customer IDs
    return reg_merchant_trans[reg_merchant_trans.user_id.isin(unknown_cust_list)]

In [35]:
def create_columns(unknown_cust_trans, merchants):
    '''
    Args:
        unknown_cust_trans (pyspark.sql.DataFrame) : Df with all the transactions that have a registered Merchant ABN but an unknown user/customer ID.
        
        merchants (pyspark.sql.DataFrame)          : Df with details about all the  merchants, including their 'merchant_abn'

    Returns:
        Updated 'merchants' df with two new columns.
    '''

    # number of transactions with unknown users for each merchant 
    trans_count = unknown_cust_trans.groupBy("merchant_abn").count() \
                    .withColumnRenamed("count", "unknown_users_trans")

    # number of unknown customers for each merchant
    users_count = unknown_cust_trans.groupBy("merchant_abn") \
                    .agg(countDistinct("user_id")) \
                    .withColumnRenamed("count(user_id)", "unknown_users_count")

    # add relevant counts as new columns to the merchant dataset
    merchants = merchants.join(trans_count, ["merchant_abn"])
    merchants = merchants.join(users_count, ["merchant_abn"])

    return merchants

Add features to merchant dataset

In [36]:
unreg_custs = unregistered_customers(merch, cust, transactions)
merch = create_columns(unreg_custs, merch)

                                                                                

In [37]:
merch.show(5)

22/10/04 16:50:35 WARN DAGScheduler: Broadcasting large task binary with size 1205.4 KiB
22/10/04 16:50:35 WARN DAGScheduler: Broadcasting large task binary with size 1200.2 KiB


[Stage 48:>                (0 + 8) / 26][Stage 49:>                 (0 + 0) / 8]

22/10/04 16:50:43 WARN DAGScheduler: Broadcasting large task binary with size 1191.8 KiB


                                                                                

22/10/04 16:50:51 WARN DAGScheduler: Broadcasting large task binary with size 1202.8 KiB


[Stage 53:>                                                         (0 + 8) / 9]

22/10/04 16:50:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 16:50:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 16:50:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 16:50:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 16:50:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 16:50:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 16:50:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 16:50:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




22/10/04 16:51:00 WARN DAGScheduler: Broadcasting large task binary with size 1198.2 KiB


                                                                                

+------------+--------------------+--------------------+--------------------+-------------------+-------------------+
|merchant_abn|                name|                tags|     avg_monthly_inc|unknown_users_trans|unknown_users_count|
+------------+--------------------+--------------------+--------------------+-------------------+-------------------+
| 12516851436|        Mollis Corp.|((watch, clock, a...|                -0.2|                134|                134|
| 15613631617|     Ante Industries|[[motor vehicle s...|                 0.0|               1211|               1162|
| 19839532017|Pellentesque Habi...|([cable, Satellit...|-0.04761904761904...|                494|                484|
| 15700338102| Aliquam Ornare Inc.|((furniture, home...|                -0.3|                148|                148|
| 20497101151|Arcu Vestibulum A...|[[telecom], [b], ...|               -0.15|                173|                172|
+------------+--------------------+--------------------+

### 2. Customer base growth
The monthly increase in the number of customers per merchant, used to quantify business growth.

In [31]:
def create_cust_growth_column(merchants, transactions):
    '''
    Args:
        merchants (pyspark.sql.DataFrame)    : Df with details about all the  merchants, including their 'merchant_abn'

        transactions (pyspark.sql.DataFrame) : Df with details about all the transactions made between merchants and customers

    Returns:
        Updated 'merchants' df with one new column.
    '''

    # add monthly customer increase as a new column to the merchant dataset
    cust_growth = aggregate_monthly(transactions)
    merchants = merchants.join(cust_growth, ["merchant_abn"])

    return merchants


def aggregate_monthly(transactions):
    '''
    Args:
        transactions (pyspark.sql.DataFrame) : Df with details about all the transactions made between merchants and customers

    Returns:
        A pyspark.sql.DataFrame with the average monthly increase in the number of customer for every merchant_abn
    '''
    monthly_trans = transactions.withColumn("order_month", 
                                F.date_format('order_datetime','yyyy-MM'))
    monthly = monthly_trans.groupBy("merchant_abn", "order_month").agg(countDistinct('user_id')).withColumnRenamed("count(user_id)", "distinct_customers")
    sorted_monthly = monthly.sort(['merchant_abn', 'order_month'])

    return get_monthly_increase(sorted_monthly.toPandas())
    

def get_monthly_increase(monthly_df):
    '''
    Args:
        monthly_df (pandas.DataFrame) : Df with the distinct number of customers that made transactions with a particular merchant every month

    Returns:
        A pyspark.sql.DataFrame with the average monthly increase in the number of customer for every merchant_abn
    '''
    curr_abn = monthly_df['merchant_abn'][0]
    differences = []
    abns = []
    incs = []
    for i in range(monthly_df.shape[0] - 1):
        if monthly_df['merchant_abn'][i] != curr_abn:
            abns.append(curr_abn)
            incs.append(sum(differences) / len(differences))

            curr_abn = monthly_df['merchant_abn'][i]
            differences = []

        differences.append(monthly_df['distinct_customers'][i+1] - monthly_df['distinct_customers'][i])

    growth = pd.DataFrame.from_dict({"merchant_abn": abns, "avg_monthly_inc": incs})
    return sp.createDataFrame(growth)


In [32]:
merch = create_cust_growth_column(merch, transactions)

[Stage 27:>                                                         (0 + 8) / 9]

22/10/04 16:48:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 16:48:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 16:48:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 16:48:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 16:48:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 16:48:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 16:48:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 16:48:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 16:48:38 WARN RowBasedKeyValueBatch: Calling spill() on

                                                                                