In [119]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import *

# Build (or reuse) the notebook's Spark session. Every setting is applied to
# the builder before getOrCreate() is called.
_session_builder = SparkSession.builder.appName("ADS project 2")
for _conf_key, _conf_value in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ("spark.executor.memory", "2g"),
    ("spark.driver.memory", "4g"),
):
    _session_builder = _session_builder.config(_conf_key, _conf_value)

spark = _session_builder.getOrCreate()

In [120]:
# Directory holding the raw CSV tables.
data_loc = "../data/tables/"

# The fraud-probability CSVs are read with a header row and no explicit schema.
consumer_fraud = spark.read.csv(data_loc + "consumer_fraud_probability.csv", header="true")
merchant_fraud = spark.read.csv(data_loc + "merchant_fraud_probability.csv", header="true")

# Parquet datasets from the curated data folder.
raw_join = spark.read.parquet("../data/curated/raw_join_internal_table/")
merchant_consumer_info = spark.read.parquet("../data/curated/merchant_consumer_info/")

In [121]:
# Average dollar value per (user, order date, consumer postcode), sorted by user.
raw_join_date = (
    raw_join
    .groupBy("user_id", "order_datetime", "consumer_postcode")
    .agg(F.avg("dollar_value").alias("dollar_value_average"))
    .orderBy("user_id")
)

In [122]:
# Per-user population standard deviation and mean of the per-date averages.
join_std = raw_join_date.groupBy("user_id").agg(
    F.stddev_pop("dollar_value_average").alias("dollar_std"),
    F.avg("dollar_value_average").alias("dollar_mean"),
)

In [123]:
# Preview the per-user spending statistics.
join_std



22/09/19 09:12:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




22/09/19 09:13:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

user_id,dollar_std,dollar_mean
1,276.34106357216945,143.06770514707347
2,618.4888288908787,170.6740096673979
3,361.7242539083343,170.0343625299024
4,195.03151495771863,139.98975771645502
5,428.2641502511229,154.27354975582938
6,201.2951850692621,129.4561571074095
7,330.1345837970235,173.06704305749295
8,229.43947734781872,149.48877244544494
9,177.67132924982852,129.33443867267525
10,781.6769053450531,187.0199511680986


In [124]:
# Flag each (user, date) row: isOutlier is 0 when the row's average spend lies
# within three standard deviations of that user's mean, and 1 otherwise.
raw_join_outlier = (
    raw_join_date
    .join(join_std, on="user_id", how="left")
    .withColumn(
        "isOutlier",
        F.when(
            F.abs(F.col("dollar_value_average") - F.col("dollar_mean"))
            <= 3 * F.col("dollar_std"),
            0,
        ).otherwise(1),
    )
    # Only the flag is kept; the statistics used to derive it are dropped.
    .drop("dollar_mean", "dollar_std", "dollar_value_average")
)

In [125]:
# Check which columns remain after the helper statistics were dropped.
raw_join_outlier.columns

['user_id', 'order_datetime', 'consumer_postcode', 'isOutlier']

In [126]:
# Show only the rows that were flagged as outliers.
raw_join_outlier.where(F.col("isOutlier") == 1)

[Stage 1674:====>           (1 + 3) / 4][Stage 1675:>               (0 + 1) / 4]

22/09/19 09:13:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:13:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1692:====>           (1 + 3) / 4][Stage 1693:>               (0 + 1) / 4]

22/09/19 09:14:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:14:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

user_id,order_datetime,consumer_postcode,isOutlier
964,2022-02-27,6028,1
2927,2021-03-27,6798,1
11619,2021-08-18,5071,1
11745,2021-07-28,1232,1
18730,2022-05-02,2076,1
19907,2021-08-13,3844,1
2927,2022-06-11,6798,1
15194,2022-01-31,5580,1
16742,2021-06-15,4382,1
2214,2021-11-09,4505,1


Join consumer fraud probability

In [127]:
# Suffix the join keys with "_alt" so they do not clash with the transaction
# table's user_id / order_datetime columns when the two tables are joined.
consumer_fraud = (
    consumer_fraud
    .withColumnRenamed("user_id", "user_id_alt")
    .withColumnRenamed("order_datetime", "order_datetime_alt")
)

In [128]:
# Expose both DataFrames to Spark SQL under the view names used in the query.
raw_join_outlier.createOrReplaceTempView("consumer")
consumer_fraud.createOrReplaceTempView("probability")

# Left join on (user, order date): every row of `consumer` is kept, and rows
# with no matching entry in `probability` get a NULL fraud_probability.
consumer_prob = spark.sql("""
SELECT 
    user_id, order_datetime, fraud_probability, consumer_postcode, isOutlier
FROM 
    consumer
LEFT JOIN 
    probability
ON 
    consumer.user_id == probability.user_id_alt
AND 
    consumer.order_datetime == probability.order_datetime_alt
""")


Fill missing fraud probabilities with 0.1

In [129]:
# Rows that had no matching fraud record carry a NULL probability after the
# left join; replace those NULLs with the default value 0.1.
consumer_prob = consumer_prob.withColumn(
    "fraud_probability",
    F.when(F.col("fraud_probability").isNull(), 0.1)
     .otherwise(F.col("fraud_probability")),
)

In [130]:
# Peek at five rows whose fraud probability is above the 0.1 default.
consumer_prob.where(F.col("fraud_probability") > 0.1).take(5)

[Stage 1711:====>           (1 + 3) / 4][Stage 1712:>               (0 + 1) / 4]

22/09/19 09:16:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:16:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

[Row(user_id=964, order_datetime=datetime.date(2022, 2, 27), fraud_probability='10.000639641611324', consumer_postcode='6028', isOutlier=1),
 Row(user_id=11619, order_datetime=datetime.date(2021, 8, 18), fraud_probability='10.604899415876009', consumer_postcode='5071', isOutlier=1),
 Row(user_id=15194, order_datetime=datetime.date(2022, 1, 31), fraud_probability='11.47165476467656', consumer_postcode='5580', isOutlier=1),
 Row(user_id=2214, order_datetime=datetime.date(2021, 11, 9), fraud_probability='9.06669796313919', consumer_postcode='4505', isOutlier=1),
 Row(user_id=15371, order_datetime=datetime.date(2021, 10, 13), fraud_probability='8.499010440619086', consumer_postcode='5572', isOutlier=0)]

In [131]:
# Store the fraud probability as a float column.
consumer_prob = consumer_prob.withColumn(
    "fraud_probability",
    consumer_prob["fraud_probability"].cast("float"),
)

In [132]:
# Inspect the column types of the joined table.
consumer_prob.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- fraud_probability: float (nullable = true)
 |-- consumer_postcode: string (nullable = true)
 |-- isOutlier: integer (nullable = false)



In [133]:
# Count how many distinct consumer postcodes appear in the data.
consumer_prob.select(F.countDistinct("consumer_postcode"))

                                                                                

count(DISTINCT consumer_postcode)
3165


In [134]:
# Convert the postcode column to an integer type.
consumer_prob = consumer_prob.withColumn(
    "consumer_postcode",
    consumer_prob["consumer_postcode"].cast("Int"),
)

In [135]:
# Pack the model inputs into a single vector column for Spark ML.
from pyspark.ml.feature import VectorAssembler

features = 'features'
input_cols = ["fraud_probability", "isOutlier"]

# inputCols lists the columns to combine; outputCol names the resulting vector.
assembler = VectorAssembler(outputCol=features, inputCols=input_cols)

# Rows containing a null in any column are removed before assembling.
consumer_transformed = assembler.transform(consumer_prob.na.drop('any'))

In [136]:
from pyspark.ml.feature import StandardScaler

# Scale by standard deviation only (withStd=True); no mean-centring
# (withMean=False).
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)

# fit() computes the summary statistics; transform() then adds the
# "scaledFeatures" column.
scalerModel = scaler.fit(consumer_transformed)
consumer_transformed = scalerModel.transform(consumer_transformed)

[Stage 1773:====>           (1 + 3) / 4][Stage 1775:>               (0 + 1) / 4]

22/09/19 09:19:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:19:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

In [137]:
from pyspark.ml.feature import VectorIndexer

# NOTE(review): 3165 equals the distinct-postcode count printed earlier, but
# the postcode is not one of the assembled features. VectorIndexer treats any
# feature with at most maxCategories distinct values as categorical and
# replaces its values with category indices, so the scaled binary isOutlier
# feature is presumably re-encoded here - confirm this step is still wanted.
featureIndexer = VectorIndexer(
    inputCol="scaledFeatures",
    outputCol="indexedFeatures",
    maxCategories=3165,
).fit(consumer_transformed)

consumer_transformed = featureIndexer.transform(consumer_transformed)

[Stage 1797:====>           (1 + 3) / 4][Stage 1798:>               (0 + 1) / 4]

22/09/19 09:20:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:20:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

In [138]:
from pyspark.ml.clustering import KMeans

# Cluster the rows into three groups on the indexed feature vector.
# The seed is fixed so that centroid initialisation, and therefore the cluster
# assignment and the numbering of the groups, is repeatable between runs.
# Without an explicit seed the groups may differ after a kernel restart.
kmeans = KMeans(featuresCol='indexedFeatures', k=3, seed=42)
model = kmeans.fit(consumer_transformed.select("indexedFeatures"))

# transform() appends a "prediction" column holding the cluster index. The
# index is an arbitrary label, not a ranking of fraud risk.
consumer_kmean = (
    model.transform(consumer_transformed)
    .withColumnRenamed("prediction", "fraud_group")
)


[Stage 1815:====>           (1 + 3) / 4][Stage 1817:>               (0 + 1) / 4]

22/09/19 09:22:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:22:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1831:====>           (1 + 3) / 4][Stage 1833:>               (0 + 1) / 4]

22/09/19 09:23:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:23:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1909:====>           (1 + 3) / 4][Stage 1911:>               (0 + 1) / 4]

22/09/19 09:26:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:26:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:26:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:26:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

In [139]:
# Number of rows that received a cluster assignment.
consumer_kmean.count()

[Stage 1933:====>           (1 + 3) / 4][Stage 1934:>               (0 + 1) / 4]

22/09/19 09:27:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:28:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

8756927

In [140]:
# Per-cluster summary: mean fraud probability, number of rows flagged as
# outliers (sum of the 0/1 flag), and number of rows in the group.
consumer_check = consumer_kmean.groupBy("fraud_group").agg(
    F.mean("fraud_probability").alias("mean_prob"),
    F.sum("isOutlier"),
    F.count("isOutlier"),
)
consumer_check

[Stage 1955:====>           (1 + 3) / 4][Stage 1957:>               (0 + 1) / 4]

22/09/19 09:29:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:29:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:29:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:29:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 1979:====>           (1 + 3) / 4][Stage 1980:>               (0 + 1) / 4]

22/09/19 09:31:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:31:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

fraud_group,mean_prob,sum(isOutlier),count(isOutlier)
1,12.78110882133088,19832,30467
2,43.71548542520274,1795,2300
0,0.1000000014901161,123731,8724160


In [147]:
# Share of rows flagged as outliers within each cluster. The raw sum and count
# columns are removed once the rate has been derived.
consumer_check = (
    consumer_check
    .withColumn("outlier_rate", F.col("sum(isOutlier)") / F.col("count(isOutlier)"))
    .drop(F.col("sum(isOutlier)"))
    .drop(F.col("count(isOutlier)"))
)

In [148]:
# Display the per-cluster mean probability and outlier rate.
consumer_check

[Stage 2048:====>           (1 + 3) / 4][Stage 2049:>               (0 + 1) / 4]

22/09/19 09:42:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:42:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 2071:====>           (1 + 3) / 4][Stage 2072:>               (0 + 1) / 4]

22/09/19 09:44:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:44:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/19 09:44:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

fraud_group,mean_prob,outlier_rate
1,12.78110882133088,0.6509337972232251
2,43.71548542520274,0.7804347826086957
0,0.1000000014901161,0.0141825688662289


In [141]:
# import time
# import datetime
# date = "2021-02-01"
# date_time = datetime.datetime.strptime(date, '%Y-%m-%d')

# user = 20984

In [142]:
# raw_join.where((col("order_datetime") == date_time.date()) & (col("user_id") == user))

In [143]:
# merchant_consumer_info.where((col("order_datetime") == date_time.date()) & (col("user_id") == user))