In [1]:
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col

In [3]:
# Start the Spark session for this notebook (getOrCreate reuses one that is
# already running); the bare `sp` on the last line displays it as cell output.
sp = (
    SparkSession.builder
    .appName("Checker")
    .getOrCreate()
)
sp

In [4]:
# Load the curated transactions. Parquet files store their own schema, so no
# schema-inference option is needed (inferSchema only applies to text sources
# such as CSV and is ignored by the Parquet reader).
trans = sp.read.parquet("../data/curated/transactions")
trans.show(3)

                                                                                

+-------+------------+------------+--------------+-----------+
|user_id|merchant_abn|dollar_value|order_datetime|   order_id|
+-------+------------+------------+--------------+-----------+
|  14935| 79417999332|      136.07|    2021-11-26|68719476736|
|      1| 46451548968|       72.62|    2021-11-26|68719476737|
|  14936| 89518629617|        3.08|    2021-11-26|68719476738|
+-------+------------+------------+--------------+-----------+
only showing top 3 rows



In [5]:
# Print the column names, types and nullability of the loaded transactions.
trans.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: float (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- order_id: long (nullable = true)



In [None]:
# Per-merchant outlier tagging of transactions based on the semi-interquartile range.
def potential_outlier(full_dataset, siqr_multiplier=3, min_count=10):
    '''
    Flag transactions whose dollar value is unusual for their merchant.

    For every merchant the approximate 25th, 50th and 75th percentiles
    (Q1, Q2, Q3) of `dollar_value` are computed and turned into limits built
    from the two semi-interquartile ranges (SIQR), in a similar way to the
    whiskers of a boxplot:

        lower limit = Q1 - siqr_multiplier * (Q2 - Q1)
        upper limit = Q3 + siqr_multiplier * (Q3 - Q2)

    see https://towardsdatascience.com/outlier-detection-part-2-6839f6199768

    Two boolean columns are added to the transactions:

    * `Natural_var` - True when the merchant's two limits coincide (which
      happens when Q1, Q2 and Q3 are all the same value) and the merchant has
      more than `min_count` non-null dollar values. Despite the name, True
      marks merchants WITHOUT natural variance in their dollar values, which
      is treated as unrealistic/dodgy.
    * `Potential_Outlier` - False only when the dollar value lies within the
      merchant's limits (inclusive) and `Natural_var` is False; True in every
      other case, including a null `dollar_value`.

    Note: after further investigating the 'dodgy' transactions, flagging
    merchants with consistent dollar values should be reconsidered, as some
    fall under the 'tv subscription' description, where a constant price is
    expected anyway.

    Parameters
    ----------
    full_dataset : pyspark.sql.DataFrame
        Transactions with at least `merchant_abn` and `dollar_value` columns.
    siqr_multiplier : int or float, default 3
        How many semi-interquartile ranges beyond Q1/Q3 a value may lie before
        it is flagged.
    min_count : int, default 10
        A merchant needs more than this many non-null dollar values before a
        lack of spread is flagged in `Natural_var`.

    Returns
    -------
    pyspark.sql.DataFrame
        The input columns (with `order_id` moved to the front when present)
        followed by `Natural_var` and `Potential_Outlier`. Rows with a null
        `merchant_abn` do not survive the inner join and are dropped.
    '''
    # One row per merchant: quartiles -> limits -> "no spread" flag.
    merchant_limits = (
        full_dataset.groupBy('merchant_abn')
        .agg(
            F.expr('percentile_approx(dollar_value, 0.25)').alias('q1'),
            F.expr('percentile_approx(dollar_value, 0.5)').alias('q2'),
            F.expr('percentile_approx(dollar_value, 0.75)').alias('q3'),
            F.count('dollar_value').alias('n_values'),
        )
        .withColumn('_siqr_lower', F.col('q1') - siqr_multiplier * (F.col('q2') - F.col('q1')))
        .withColumn('_siqr_upper', F.col('q3') + siqr_multiplier * (F.col('q3') - F.col('q2')))
        .withColumn(
            'Natural_var',
            F.when(
                (F.col('_siqr_upper') == F.col('_siqr_lower')) & (F.col('n_values') > min_count),
                True,
            ).otherwise(False),
        )
        # Keep only what the per-transaction step needs, so the helper columns
        # above cannot clash with columns of the input.
        .select('merchant_abn', '_siqr_lower', '_siqr_upper', 'Natural_var')
    )

    # Attach each merchant's limits straight onto its transactions. A single
    # join on merchant_abn is enough; going back through order_id would cost a
    # second full-table join and would multiply rows if an order_id repeated.
    tagged = full_dataset.join(merchant_limits, on='merchant_abn')

    within_limits = (
        (F.col('dollar_value') <= F.col('_siqr_upper'))
        & (F.col('dollar_value') >= F.col('_siqr_lower'))
    )
    # when/otherwise (rather than a plain negation) so that a null comparison
    # result, e.g. from a null dollar_value, ends up flagged as True.
    tagged = tagged.withColumn(
        'Potential_Outlier',
        F.when(within_limits & ~F.col('Natural_var'), False).otherwise(True),
    )

    # Column layout: order_id first (when present), then the remaining input
    # columns in their original order, then the two new flags.
    leading = [c for c in full_dataset.columns if c == 'order_id']
    remaining = [c for c in full_dataset.columns if c != 'order_id']
    return tagged.select(*leading, *remaining, 'Natural_var', 'Potential_Outlier')