In [1]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import *

spark = (
    SparkSession.builder.appName("ADS project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/19 12:01:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
data_loc = "../data/tables/"

consumer_fraud = spark.read.option("header", "true").csv(data_loc+"consumer_fraud_probability.csv")
merchant_fraud = spark.read.option("header", "true").csv(data_loc+"merchant_fraud_probability.csv")
raw_join = spark.read.parquet("../data/curated/raw_join_internal_table/")
merchant_consumer_info = spark.read.parquet("../data/curated/merchant_consumer_info/")

                                                                                

<h2>Unsupervised Learning: to classify each instance into low, medium and high level of fraud probability</h2>

<h3>1. Determine whether an instance is an outlier based on dollor value (assumption: an outlier may be a fraud instance)</h3>

<h4>1.1 Extract and aggregate the dollar value for each user </h4>

In [3]:
# find the average dollor value for each user on each date
raw_join_date = (raw_join.groupBy("user_id", "order_datetime")
                        .agg(avg("dollar_value").alias("dollar_value_average"))
                        .orderBy("user_id"))

In [4]:
# find the average and standard deviation of dollor value for each user
join_std = (raw_join_date.groupBy("user_id")\
                         .agg(stddev_pop("dollar_value_average").alias("dollar_std"),
                              avg("dollar_value_average").alias("dollar_mean")))

<h4>1.2 Determine whether an instance is an outlier for corresponding user (0 is not outlier; 1 is outlier) </h4>

In [5]:
raw_join_outlier = (raw_join_date.join(join_std, "user_id", "left")
                   .withColumn("isOutlier",
                                F.when((abs(col("dollar_value_average") - col("dollar_mean")) <= 3 * col("dollar_std")), 0).otherwise(1))
                   .drop("dollar_mean")
                   .drop("dollar_std")
                   .drop("dollar_value_average"))

In [6]:
raw_join_outlier

[Stage 4:====>              (1 + 3) / 4][Stage 5:>                  (0 + 1) / 4]

22/09/19 12:01:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




22/09/19 12:01:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 27:====>             (1 + 3) / 4][Stage 28:>                 (0 + 1) / 4]

22/09/19 12:02:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 12:02:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

user_id,order_datetime,isOutlier
1,2021-03-21,0
1,2021-09-07,0
1,2021-05-23,0
1,2021-06-12,0
1,2021-09-06,0
1,2022-04-15,0
1,2022-04-02,0
1,2022-02-06,0
1,2021-03-11,0
1,2021-10-31,0


<h3>2. Join with consumer fraud probability</h3>

In [7]:
consumer_fraud = consumer_fraud.withColumnRenamed("user_id", "user_id_alt")
consumer_fraud = consumer_fraud.withColumnRenamed("order_datetime", "order_datetime_alt")

In [8]:
raw_join_outlier.createOrReplaceTempView("consumer")
consumer_fraud.createOrReplaceTempView("probability")

consumer_prob = spark.sql("""
SELECT 
    user_id, order_datetime, fraud_probability, isOutlier
FROM 
    consumer
LEFT JOIN 
    probability
ON 
    consumer.user_id == probability.user_id_alt
AND 
    consumer.order_datetime == probability.order_datetime_alt
""")


In [9]:
# fill the non exists probability with 0.1
consumer_prob = consumer_prob.withColumn("fraud_probability", 
                            F.when((col("fraud_probability").isNull()), 0.1).otherwise(F.col("fraud_probability")))

In [10]:
# convert data type of features to ensure they can be vectorize
consumer_prob = consumer_prob.withColumn("fraud_probability", F.col("fraud_probability").cast("float"))

<h3>3. Vectorize and standardize the features to ensure they are suitible for the clustering model (k-means)</h3>

In [11]:
# vectorized the features for model training
from pyspark.ml.feature import VectorAssembler

features = 'features'
input_cols = ["fraud_probability", "isOutlier"]

assembler = VectorAssembler(
    # which column to combine
    inputCols=input_cols, 
    # How should the combined columns be named
    outputCol=features
)

consumer_transformed = assembler.transform(consumer_prob.dropna('any'))

In [12]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(consumer_transformed)

consumer_transformed = scalerModel.transform(consumer_transformed)

[Stage 51:====>             (1 + 3) / 4][Stage 52:>                 (0 + 1) / 4]

22/09/19 12:02:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 12:02:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

In [13]:
from pyspark.ml.feature import VectorIndexer

featureIndexer = VectorIndexer(inputCol="scaledFeatures", \
                               outputCol="indexedFeatures",\
                               maxCategories=3165
).fit(consumer_transformed)

consumer_transformed = featureIndexer.transform(consumer_transformed)

[Stage 74:====>             (1 + 3) / 4][Stage 75:>                 (0 + 1) / 4]

22/09/19 12:03:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 12:03:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

<h3>4. Modelling using k-means and clustering </h3>

In [14]:
from pyspark.ml.clustering import KMeans

kmeans = KMeans(featuresCol='indexedFeatures', k=3)
model = kmeans.fit(consumer_transformed.select("indexedFeatures"))
consumer_kmean = model.transform(consumer_transformed)
consumer_kmean = consumer_kmean.withColumnRenamed("prediction", "fraud_group")


[Stage 93:====>             (1 + 3) / 4][Stage 94:>                 (0 + 1) / 4]

22/09/19 12:04:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 12:04:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 109:====>            (1 + 3) / 4][Stage 110:>                (0 + 1) / 4]

22/09/19 12:04:53 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 12:04:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 206:====>            (1 + 3) / 4][Stage 208:>                (0 + 1) / 4]

22/09/19 12:06:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 12:06:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

In [15]:
consumer_kmean

[Stage 230:====>            (1 + 3) / 4][Stage 231:>                (0 + 1) / 4]

22/09/19 12:07:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 12:07:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 248:====>            (1 + 3) / 4][Stage 250:>                (0 + 1) / 4]

22/09/19 12:07:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 12:07:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

user_id,order_datetime,fraud_probability,isOutlier,features,scaledFeatures,indexedFeatures,fraud_group
15432,2021-07-10,0.1,0,[0.10000000149011...,[0.09229378721488...,[0.09229378721488...,0
29,2021-05-11,0.1,0,[0.10000000149011...,[0.09229378721488...,[0.09229378721488...,0
474,2021-05-14,0.1,0,[0.10000000149011...,[0.09229378721488...,[0.09229378721488...,0
474,2022-04-09,0.1,0,[0.10000000149011...,[0.09229378721488...,[0.09229378721488...,0
964,2022-09-07,0.1,0,[0.10000000149011...,[0.09229378721488...,[0.09229378721488...,0
1677,2021-03-11,0.1,0,[0.10000000149011...,[0.09229378721488...,[0.09229378721488...,0
1677,2022-08-02,0.1,0,[0.10000000149011...,[0.09229378721488...,[0.09229378721488...,0
1697,2021-07-29,0.1,0,[0.10000000149011...,[0.09229378721488...,[0.09229378721488...,0
1806,2021-11-11,0.1,0,[0.10000000149011...,[0.09229378721488...,[0.09229378721488...,0
1950,2022-09-18,0.1,0,[0.10000000149011...,[0.09229378721488...,[0.09229378721488...,0


<h3>5. Check the clustering is followed by the assumption </h3>

In [16]:
consumer_check = consumer_kmean.groupBy("fraud_group").agg(mean("fraud_probability").alias("mean_prob"), 
                                          sum("isOutlier").alias("outlier_amount"), 
                                          count("isOutlier").alias("total_instance"))
consumer_check                                         

[Stage 268:====>            (1 + 3) / 4][Stage 269:>                (0 + 1) / 4]

22/09/19 12:08:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 12:08:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 291:====>            (1 + 3) / 4][Stage 292:>                (0 + 1) / 4]

22/09/19 12:09:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 12:09:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

fraud_group,mean_prob,outlier_amount,total_instance
1,43.71548542520274,1795,2300
2,12.78110882133088,19832,30467
0,0.1000000014901161,123731,8724160


In [18]:
consumer_check = (consumer_check.withColumn("outlier_rate", F.col("outlier_amount")/F.col("total_instance"))
                               .drop(col("outlier_amount"))
                               .drop(col("total_instance")))

In [19]:
consumer_check

[Stage 314:====>            (1 + 3) / 4][Stage 315:>                (0 + 1) / 4]

22/09/19 12:11:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 12:11:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 336:====>            (1 + 3) / 4][Stage 338:>                (0 + 1) / 4]

22/09/19 12:11:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 12:11:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

fraud_group,mean_prob,outlier_rate
1,43.71548542520274,0.7804347826086957
2,12.78110882133088,0.6509337972232251
0,0.1000000014901161,0.0141825688662289


Summary:
1. According to mean_prob, we can know that 0 represents low fraud probability, 2 represents medum one and 1 represents high one for fraud_group.
2. The level of fraud probability increases as the outlier_rate increases as well.

<h3>6. Add the fraud_group feature to the internal joined dataset</h3>

In [20]:
add_fraud = raw_join.join(consumer_kmean, ["user_id", "order_datetime"], "left")

In [21]:
add_fraud.drop("fraud_probability", "isOutlier", "features", "scaledFeatures", "indexedFeatures")

[Stage 361:====>            (1 + 3) / 4][Stage 362:>                (0 + 1) / 4]

22/09/19 12:12:44 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 12:12:45 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 386:====>            (1 + 3) / 4][Stage 387:>                (0 + 1) / 4]

22/09/19 12:13:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 12:13:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

user_id,order_datetime,merchant_name,products,revenue_level,take_rate,category,dollar_value,consumer,consumer_address,consumer_state,consumer_postcode,consumer_gender,fraud_group
3471,2021-04-30,Pede Nonummy Corp.,tent and awning s...,c,2.86,3,20.151265837231254,Todd Long,883 Patty Mountai...,NSW,2021,Male,0
3471,2021-07-14,Orci Corp.,florists supplies...,b,4.88,2,20.7707245923952,Todd Long,883 Patty Mountai...,NSW,2021,Male,0
3471,2021-07-14,Pede Nonummy Corp.,tent and awning s...,c,2.86,3,32.18355598940389,Todd Long,883 Patty Mountai...,NSW,2021,Male,0
3471,2021-07-31,Turpis In Incorpo...,gift card novelty...,c,2.48,1,16.421398817128605,Todd Long,883 Patty Mountai...,NSW,2021,Male,0
3471,2021-08-08,Euismod Urna Inst...,tent and awning s...,b,5.05,3,25.973922708404825,Todd Long,883 Patty Mountai...,NSW,2021,Male,0
3471,2021-08-08,Rhoncus Donec Ass...,tent and awning s...,b,4.45,3,18.23658677863049,Todd Long,883 Patty Mountai...,NSW,2021,Male,0
3471,2021-08-26,Est Nunc Consulting,tent and awning s...,a,6.01,3,35.35344876130378,Todd Long,883 Patty Mountai...,NSW,2021,Male,0
3471,2021-10-20,Et Nunc Consulting,books periodicals...,e,0.16,0,155.4258076824775,Todd Long,883 Patty Mountai...,NSW,2021,Male,0
3471,2022-08-27,Et Nunc Consulting,books periodicals...,e,0.16,0,439.9426271986114,Todd Long,883 Patty Mountai...,NSW,2021,Male,0
3471,2022-10-01,Augue Eu Tempor A...,cable satellite a...,e,0.38,0,72.97005061998118,Todd Long,883 Patty Mountai...,NSW,2021,Male,0


In [None]:
# import time
# import datetime
# date = "2021-02-01"
# date_time = datetime.datetime.strptime(date, '%Y-%m-%d')

# user = 20984

In [None]:
# raw_join.where((col("order_datetime") == date_time.date()) & (col("user_id") == user))

In [None]:
# merchant_consumer_info.where((col("order_datetime") == date_time.date()) & (col("user_id") == user))