# Removal of unregistered merchant / customer transactions

- Remove transactions for unregistered merchants altogether.
- Remove transactions from unregistered customers, but only for the fraud-probability prediction model.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import os
from functools import reduce

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import countDistinct, col
import pyspark.sql.functions as F
from collections import defaultdict

In [2]:
# Create the Spark session used by the cells below, or reuse one that is already running.
sp = (
    SparkSession.builder
    .appName("Fraud detection")
    .getOrCreate()
)

22/10/04 17:42:40 WARN Utils: Your hostname, Harshitas-MacBook-Air-8.local resolves to a loopback address: 127.0.0.1; using 10.13.133.97 instead (on interface en0)
22/10/04 17:42:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/04 17:42:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
def merge_folder(trans_group: str) -> "DataFrame":
    """
    Read every parquet sub-folder of one transaction group and union them.

    Parameters
    ----------
    trans_group : str
        Name of a folder under ``../data/tables/``. Each sub-directory of
        that folder is read with ``sp.read.parquet``; plain files are ignored.

    Returns
    -------
    pyspark.sql.DataFrame
        The sub-folder DataFrames combined with ``DataFrame.unionAll``.

    Raises
    ------
    ValueError
        If the group folder contains no sub-directories, so there is
        nothing to union.
    """
    # Named `group_dir` rather than `dir` so the builtin `dir()` is not shadowed.
    group_dir = "../data/tables/" + trans_group

    # Sort the listing so the union order does not depend on the arbitrary
    # order in which os.listdir returns entries.
    parts = []
    for entry in sorted(os.listdir(group_dir)):
        # os.path.join copes with a trailing separator on group_dir, so no
        # doubled "/" ends up in the path.
        path = os.path.join(group_dir, entry)
        if os.path.isdir(path):
            parts.append(sp.read.parquet(path))

    # reduce() without an initial value fails with an opaque TypeError on an
    # empty list; report the real problem instead.
    if not parts:
        raise ValueError(f"No sub-folders to merge were found in {group_dir!r}")

    return reduce(DataFrame.unionAll, parts)

In [4]:
# Folder that holds the transaction snapshot directories.
# NOTE: the name `dir` hides the builtin dir() for the rest of the notebook.
dir = "../data/tables/"
groups = [
    "transactions_20210228_20210827_snapshot/",
    "transactions_20210828_20220227_snapshot/",
    "transactions_20220228_20220828_snapshot/",
]

# Read the snapshots one by one, announcing each before it is loaded.
final_list = []
for g in groups:
    print("Started group: ", g)
    final_list.append(sp.read.parquet(dir + g))

# Stack all snapshots into a single DataFrame.
transactions = reduce(DataFrame.unionAll, final_list)

Started group:  transactions_20210228_20210827_snapshot/


                                                                                

Started group:  transactions_20210828_20220227_snapshot/


                                                                                

Started group:  transactions_20220228_20220828_snapshot/


                                                                                

In [9]:
# Merchant table, stored as parquet.
merch = sp.read.parquet("../data/tables/tbl_merchants.parquet")

# Consumer table: pipe-delimited CSV whose first row is the header.
cust = (
    sp.read
    .option("header", True)
    .option("delimiter", "|")
    .csv("../data/tables/tbl_consumer.csv")
)

In [6]:
def remove_unreg_merchants(trans, merch):
    """
    Keep only the transactions whose merchant ABN appears in the merchant table.

    Parameters
    ----------
    trans : pyspark.sql.DataFrame
        Transactions; must have a ``merchant_abn`` column.
    merch : pyspark.sql.DataFrame
        Registered merchants; must have a ``merchant_abn`` column.

    Returns
    -------
    pyspark.sql.DataFrame
        The rows of ``trans`` whose ``merchant_abn`` matches at least one row
        of ``merch``, with the columns of ``trans`` only.
    """
    # Only the key is needed from the merchant side. It is aliased so the join
    # condition cannot be confused with the column of the same name in `trans`.
    registered = merch.select(merch["merchant_abn"].alias("_registered_abn"))

    # A left-semi join filters inside Spark, instead of collecting every ABN to
    # the driver and embedding the list as literals in the query. It returns
    # each matching `trans` row once, even if the merchant table has duplicates.
    return trans.join(
        registered,
        on=trans["merchant_abn"] == registered["_registered_abn"],
        how="left_semi",
    )

In [15]:
# Row count before any filtering.
total_count = transactions.count()
print("Total transactions               : ", total_count)

# Drop transactions whose merchant is not in the merchant table.
cleaned_transactions = remove_unreg_merchants(transactions, merch)

# Row count after the merchant filter.
known_merchant_count = cleaned_transactions.count()
print("Transactions with known merchants: ", known_merchant_count)

                                                                                

Total transactions               :  13614675




Transactions with known merchants:  13614675


                                                                                

In [16]:
def remove_unreg_cust(trans, cust):
    """
    Keep only the transactions whose user id appears in the consumer table.

    Parameters
    ----------
    trans : pyspark.sql.DataFrame
        Transactions; must have a ``user_id`` column.
    cust : pyspark.sql.DataFrame
        Registered consumers; must have a ``consumer_id`` column.

    Returns
    -------
    pyspark.sql.DataFrame
        The rows of ``trans`` whose ``user_id`` equals some ``consumer_id``
        in ``cust``, with the columns of ``trans`` only. Rows with a null
        ``user_id`` never match and are dropped.
    """
    # NOTE(review): this matches trans.user_id against cust.consumer_id. If
    # these are different identifiers (e.g. linked through a separate mapping
    # table), the filter removes legitimate transactions. Confirm before
    # relying on the result.

    # Only the key is needed from the consumer side. Both sides are cast to
    # string so the comparison does not depend on implicit type coercion when
    # one id column was loaded from CSV as text and the other is numeric.
    known = cust.select(cust["consumer_id"].cast("string").alias("_known_id"))

    # A left-semi join keeps each `trans` row with a registered id once. This
    # replaces the earlier "not in the list of unknown ids" filter, which
    # collected ids to the driver and embedded them as literals in the query.
    return trans.join(
        known,
        on=trans["user_id"].cast("string") == known["_known_id"],
        how="left_semi",
    )

In [18]:
# Row count going into the customer filter.
known_merchant_count = cleaned_transactions.count()
print("Transactions with known merchants         : ", known_merchant_count)

# Drop transactions whose customer is not in the consumer table.
legit_cust_trans = remove_unreg_cust(cleaned_transactions, cust)

# Row count left for the fraud model.
fraud_ready_count = legit_cust_trans.count()
print("Transactions available for fraud detection: ", fraud_ready_count)

                                                                                

Transactions with known merchants         :  13614675




Transactions available for fraud detection:  4552772


                                                                                