In [11]:
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col

In [12]:
sp = SparkSession.builder.appName("Checker").getOrCreate()
sp

In [13]:
trans = sp.read.option("inferSchema", True).parquet("../data/curated/transactions")
trans.show(3)

+-------+------------+------------+--------------+-----------+
|user_id|merchant_abn|dollar_value|order_datetime|   order_id|
+-------+------------+------------+--------------+-----------+
|  14935| 79417999332|      136.07|    2021-11-26|68719476736|
|      1| 46451548968|       72.62|    2021-11-26|68719476737|
|  14936| 89518629617|        3.08|    2021-11-26|68719476738|
+-------+------------+------------+--------------+-----------+
only showing top 3 rows



In [14]:
trans.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: float (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- order_id: long (nullable = true)



### Checking the customers that exist in fraud data

In [16]:
model_data = sp.read.option("inferSchema", True).parquet("../data/processed/model_data")
model_data.show(2)

+-------+------------+------------+--------------+
|user_id|merchant_abn|dollar_value|order_datetime|
+-------+------------+------------+--------------+
|  14936| 89518629617|        3.08|    2021-11-26|
|  14936| 31101120643|       25.23|    2021-11-26|
+-------+------------+------------+--------------+
only showing top 2 rows



In [17]:
cfraud = sp.read.option("inferSchema", True).parquet("../data/curated/customer_fraud")
cfraud.show()

+-------+-------------------+-----------------+
|user_id|     order_datetime|fraud_probability|
+-------+-------------------+-----------------+
|   6228|2021-12-19 00:00:00|         97.62981|
|  21419|2021-12-10 00:00:00|         99.24738|
|   5606|2021-10-17 00:00:00|         84.05825|
|   3101|2021-04-16 23:00:00|         91.42192|
|  22239|2021-10-19 00:00:00|         94.70342|
|  16556|2022-02-20 00:00:00|         89.65663|
|  10278|2021-09-27 23:00:00|         83.59137|
|  15790|2021-12-30 00:00:00|         71.77066|
|   5233|2021-08-28 23:00:00|         85.87123|
|    230|2021-08-27 23:00:00|         86.28329|
|  13601|2021-12-26 00:00:00|         83.13696|
|   6383|2021-09-14 23:00:00|         66.26765|
|   3513|2022-02-27 00:00:00|        75.169815|
|  18658|2021-10-19 00:00:00|         82.98609|
|   5965|2021-11-14 00:00:00|         69.37164|
|  18714|2021-11-14 00:00:00|         83.78814|
|  22957|2022-02-12 00:00:00|         82.79066|
|  20118|2021-09-04 23:00:00|          8

In [19]:
from pyspark.sql.types import DateType

cfraud = cfraud.withColumn("order_datetime", col("order_datetime").cast(DateType()))
cfraud.printSchema()
cfraud.show(2)

root
 |-- user_id: integer (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- fraud_probability: float (nullable = true)

+-------+--------------+-----------------+
|user_id|order_datetime|fraud_probability|
+-------+--------------+-----------------+
|   6228|    2021-12-19|         97.62981|
|  21419|    2021-12-10|         99.24738|
+-------+--------------+-----------------+
only showing top 2 rows



In [26]:
legit_fraud_custs = cfraud.join(model_data.select("user_id").distinct(), on="user_id")
legit_fraud_custs.show(2)

+-------+--------------+-----------------+
|user_id|order_datetime|fraud_probability|
+-------+--------------+-----------------+
|     26|    2021-11-19|        10.604536|
|     29|    2021-10-31|          8.44397|
+-------+--------------+-----------------+
only showing top 2 rows



In [27]:
legit_fraud_custs.count()

11605

In [28]:
cfraud_pd = cfraud.toPandas()
legit_fraud_pd = legit_fraud_custs.toPandas()

legit_fraud_pd.head()

Unnamed: 0,user_id,order_datetime,fraud_probability
0,26,2021-11-19,10.604536
1,29,2021-10-31,8.44397
2,29,2021-11-25,15.49241
3,15057,2022-01-11,36.562668
4,15237,2022-01-09,8.632889


In [29]:
import seaborn as sns

sns.displot(cfraud["fraud_probability"])

TypeError: object of type 'Column' has no len()

In [None]:
s