# Feature Engineering

### 1. Unregistered customers

Import dependencies and start the Spark session

In [2]:
import os
from functools import reduce

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, count, countDistinct

In [3]:
# Create (or reuse) the notebook's SparkSession; the reads below all go through `sp`.
sp = SparkSession.builder.appName("Fraud detection").getOrCreate()

22/10/04 00:21:11 WARN Utils: Your hostname, Harshitas-MacBook-Air-8.local resolves to a loopback address: 127.0.0.1; using 192.168.0.227 instead (on interface en0)
22/10/04 00:21:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/04 00:21:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/04 00:21:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Read relevant datasets

In [4]:
def merge_folder(trans_group: str):
    """
    Union every parquet sub-folder of one table group into a single DataFrame.

    Reads through the notebook-level SparkSession `sp`.

    Args:
        trans_group (str): Name of a folder under "../data/tables/" whose
            immediate sub-directories are each read as a parquet dataset.

    Returns:
        A pyspark.sql.DataFrame holding the rows of all sub-folders
        (DataFrame.unionAll matches columns by position, not by name).

    Raises:
        ValueError: If the folder contains no sub-directories to read.
    """
    # Named `base_dir` rather than `dir` so the built-in dir() is not shadowed.
    base_dir = "../data/tables/" + trans_group

    group_list = []
    # Sorted so the union order does not depend on the arbitrary order
    # in which os.listdir returns entries.
    for folder in sorted(os.listdir(base_dir)):
        # os.path.join avoids the doubled "/" produced by manual concatenation.
        path = os.path.join(base_dir, folder)
        if os.path.isdir(path):
            group_list.append(sp.read.parquet(path))

    # reduce() on an empty list raises an opaque TypeError; fail with context instead.
    if not group_list:
        raise ValueError("No sub-folders to read in " + base_dir)

    return reduce(DataFrame.unionAll, group_list)

In [5]:
# Root folder of the tables and the three transaction snapshot folders inside it.
dir = "../data/tables/"
groups = [
    "transactions_20210228_20210827_snapshot/",
    "transactions_20210828_20220227_snapshot/",
    "transactions_20220228_20220828_snapshot/",
]

# Read each snapshot in turn, announcing it first so progress is visible.
final_list = []
for g in groups:
    print("Started group: ", g)
    snapshot_path = dir + g
    final_list.append(sp.read.parquet(snapshot_path))

# Combine the snapshots into a single transactions DataFrame.
transactions = reduce(DataFrame.unionAll, final_list)

Started group:  transactions_20210228_20210827_snapshot/


                                                                                

Started group:  transactions_20210828_20220227_snapshot/


                                                                                

Started group:  transactions_20220228_20220828_snapshot/


In [6]:
# Merchant table (parquet).
merch = sp.read.parquet("../data/tables/tbl_merchants.parquet")

# Consumer table: pipe-delimited CSV with a header row.
# NOTE(review): no schema or inferSchema is given, so columns are presumably read as strings - confirm.
cust = (
    sp.read
    .option("header", True)
    .option("delimiter", "|")
    .csv("../data/tables/tbl_consumer.csv")
)

Define functions

In [7]:
def unregistered_customers(merchants, customers, transactions):
    '''
    Select the transactions made at registered merchants by unknown customers.

    A merchant is "registered" when its 'merchant_abn' appears in `merchants`.
    A customer is "unknown" when the transaction's 'user_id' does not appear
    among the 'consumer_id' values of `customers`.

    Args:
        merchants (pyspark.sql.DataFrame)    : Df with details about all the merchants, including their 'merchant_abn'

        customers (pyspark.sql.DataFrame)    : Df with details about all the customers, including their 'consumer_id'

        transactions (pyspark.sql.DataFrame) : Df with details about all the transactions made between merchants and customers,
                                               including 'merchant_abn' and 'user_id'

    Returns:
        A pyspark.sql.DataFrame with all the transactions that have a registered Merchant ABN but an unknown user/customer ID.
        The columns are those of `transactions`, in the same order.
    '''

    # Registered ABNs, renamed so the join condition below is unambiguous.
    registered_abns = merchants.select(col("merchant_abn").alias("_registered_abn"))

    # transactions with registered merchant ABNs
    # (a left-semi join filters inside Spark instead of collecting every ABN
    #  to the driver and shipping it back as a literal list for isin())
    reg_merchant_trans = transactions.join(
        registered_abns,
        col("merchant_abn") == col("_registered_abn"),
        "left_semi",
    )

    # distinct user IDs that have no matching consumer ID; uses the `customers`
    # argument rather than a notebook-level global
    unknown_users = (
        transactions.select("user_id").distinct()
        .subtract(customers.select(col("consumer_id")))
        .withColumnRenamed("user_id", "_unknown_user_id")
    )

    # transactions with registered merchant ABNs but unknown customer IDs
    # (rows with a null user_id never satisfy the equality, so they are left out)
    return reg_merchant_trans.join(
        unknown_users,
        col("user_id") == col("_unknown_user_id"),
        "left_semi",
    )

In [8]:
def create_columns(unknown_cust_trans, merchants, keep_all_merchants=False):
    '''
    Add per-merchant counts of unknown-customer activity to the merchant table.

    Args:
        unknown_cust_trans (pyspark.sql.DataFrame) : Df with all the transactions that have a registered Merchant ABN but an unknown user/customer ID.

        merchants (pyspark.sql.DataFrame)          : Df with details about all the merchants, including their 'merchant_abn'

        keep_all_merchants (bool)                  : If False (default), merchants with no row in `unknown_cust_trans` are dropped
                                                     (inner join). If True, they are kept and both counts are set to 0.

    Returns:
        `merchants` joined on 'merchant_abn' with two new columns:
            unknown_users_trans : number of transactions with unknown users for the merchant
            unknown_users_count : number of distinct unknown 'user_id' values for the merchant
    '''

    # Both counts come from one pass over the transactions. The columns are
    # named with alias() rather than by renaming Spark's auto-generated
    # aggregate name, because withColumnRenamed silently does nothing when
    # that generated name is not the one expected.
    unknown_stats = unknown_cust_trans.groupBy("merchant_abn").agg(
        count("*").alias("unknown_users_trans"),
        countDistinct("user_id").alias("unknown_users_count"),
    )

    if keep_all_merchants:
        # A left join keeps every merchant; merchants without a match get
        # nulls, which are replaced by a zero count.
        return merchants.join(unknown_stats, ["merchant_abn"], "left") \
                        .fillna(0, subset=["unknown_users_trans", "unknown_users_count"])

    # add relevant counts as new columns to the merchant dataset
    return merchants.join(unknown_stats, ["merchant_abn"])

Add features to merchant dataset

In [9]:
# Find the transactions at registered merchants made by unknown customers,
# then attach the per-merchant counts to the merchant table.
# NOTE(review): `merch` is overwritten here, so re-running this cell on its own
# would join the count columns onto the already-augmented table.
unreg_custs = unregistered_customers(merch, cust, transactions)
merch = create_columns(unreg_custs, merch)

                                                                                

In [10]:
# Preview the first five merchants together with the two new count columns.
merch.show(5)

22/10/04 00:23:05 WARN DAGScheduler: Broadcasting large task binary with size 1205.4 KiB
22/10/04 00:23:06 WARN DAGScheduler: Broadcasting large task binary with size 1200.3 KiB




22/10/04 00:23:20 WARN DAGScheduler: Broadcasting large task binary with size 1202.9 KiB


[Stage 17:>                                                         (0 + 8) / 9]

22/10/04 00:23:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 00:23:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 00:23:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 00:23:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 00:23:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 00:23:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 00:23:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/04 00:23:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




22/10/04 00:23:29 WARN DAGScheduler: Broadcasting large task binary with size 1198.3 KiB


                                                                                

22/10/04 00:23:29 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
+------------+--------------------+--------------------+-------------------+-------------------+
|merchant_abn|                name|                tags|unknown_users_trans|unknown_users_count|
+------------+--------------------+--------------------+-------------------+-------------------+
| 38700038932|Etiam Bibendum In...|[(tent and awning...|               4715|               4090|
| 19839532017|Pellentesque Habi...|([cable, Satellit...|                494|                484|
| 35344855546|Quis Tristique Ac...|[(watch, clock, a...|                984|                957|
| 83412691377|Suspendisse Sagit...|([watch, clock, a...|               9408|               7146|
| 15613631617|     Ante Industries|[[motor vehicle s...|               1211|               1162|
+------------+--------------------+--------------------+-------------------+-------------------+
only showing top 5 rows

