Preprocessing Data:

In [3]:
import pandas as pd
import numpy as np
import geopandas as gpd
import os
import re
import glob
import math
import matplotlib.pyplot as plt
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType
from datetime import datetime, timedelta

In [4]:
spark = SparkSession.builder \
    .master('local[*]') \
    .config("spark.driver.memory", "15g") \
    .config("spark.executor.memory", "15g") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .appName('prprocess') \
    .getOrCreate()

your 131072x1 screen size is bogus. expect trouble
24/09/03 21:22:18 WARN Utils: Your hostname, DESKTOP-F216TKE resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/09/03 21:22:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/03 21:22:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Consumer Details Dataframe Preprocess

In [5]:
consumer_details = spark.read.parquet(".././data/tables/consumer_user_details.parquet")
consumer_details.show()

                                                                                

+-------+-----------+
|user_id|consumer_id|
+-------+-----------+
|      1|    1195503|
|      2|     179208|
|      3|    1194530|
|      4|     154128|
|      5|     712975|
|      6|     407340|
|      7|     511685|
|      8|     448088|
|      9|     650435|
|     10|    1058499|
|     11|     428325|
|     12|    1494640|
|     13|    1146717|
|     14|    1343547|
|     15|    1463076|
|     16|    1356405|
|     17|    1331093|
|     18|      80965|
|     19|    1226530|
|     20|    1390367|
+-------+-----------+
only showing top 20 rows



In [6]:
consumer_details.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- consumer_id: long (nullable = true)



Change user_id and consumer_id to string type 

In [7]:
consumer_details = consumer_details.withColumn('user_id', F.col('user_id').cast('string'))
consumer_details = consumer_details.withColumn('consumer_id', F.col('consumer_id').cast('string'))

In [8]:
consumer_details.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- consumer_id: string (nullable = true)



Look for Dupilcate Values

In [9]:
consumer_details.count()

499999

In [10]:
consumer_details.dropDuplicates().count()

                                                                                

499999

No duplicate values have been found

Check for Missing Values 

In [11]:
consumer_details.filter(F.isnull('consumer_id')).count()


0

In [12]:
consumer_details.filter(F.isnan('consumer_id')).count()

0

In [13]:
consumer_details.filter(F.isnull('user_id')).count()

0

In [14]:
consumer_details.filter(F.isnan('user_id')).count()

0

In [15]:
#consumer_details.write.mode('overwrite').parquet('.././data/curated/consumer_details')

In [16]:
consumer_fraud = spark.read.option("header", True).csv(".././data/tables/consumer_fraud_probability.csv")
consumer_fraud.show()

+-------+--------------+-----------------+
|user_id|order_datetime|fraud_probability|
+-------+--------------+-----------------+
|   6228|    2021-12-19| 97.6298077657765|
|  21419|    2021-12-10|99.24738020302328|
|   5606|    2021-10-17|84.05825045251777|
|   3101|    2021-04-17|91.42192091901347|
|  22239|    2021-10-19|94.70342477508035|
|  16556|    2022-02-20|89.65663294494827|
|  10278|    2021-09-28|83.59136689427714|
|  15790|    2021-12-30|71.77065889280253|
|   5233|    2021-08-29|85.87123303878818|
|    230|    2021-08-28|86.28328808934151|
|  13601|    2021-12-26|83.13696487489679|
|   6383|    2021-09-15| 66.2676451623754|
|   3513|    2022-02-27|75.16981192247916|
|  18658|    2021-10-19|82.98609082999361|
|   5965|    2021-11-14|69.37164467869053|
|  18714|    2021-11-14|83.78813794627237|
|  22957|    2022-02-12|82.79065699075498|
|  20118|    2021-09-05|80.34030486265003|
|   6436|    2021-12-24|84.81618344606828|
|  17900|    2022-02-25|92.73262811161372|
+-------+--

In [17]:
consumer_fraud.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- fraud_probability: string (nullable = true)



In [18]:
consumer_fraud = consumer_fraud.withColumn('order_datetime', F.col('order_datetime').cast('date'))

In [19]:
consumer_fraud = consumer_fraud.withColumn('fraud_probability', F.col('fraud_probability').cast('double'))

In [20]:
consumer_fraud.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- fraud_probability: double (nullable = true)



Check the range for fraude probability

In [21]:
consumer_fraud.filter(F.col('fraud_probability') < 0).show()

+-------+--------------+-----------------+
|user_id|order_datetime|fraud_probability|
+-------+--------------+-----------------+
+-------+--------------+-----------------+



In [22]:
consumer_fraud.filter(F.col('fraud_probability') > 100).show()

+-------+--------------+-----------------+
|user_id|order_datetime|fraud_probability|
+-------+--------------+-----------------+
+-------+--------------+-----------------+



The ranges of fraud probability lies in the correct range

Ensure all user_id's are properly referenced 

In [28]:
userids = consumer_details.select(F.col('user_id')).toPandas()['user_id']

In [47]:
userids = list(userids)
userids


['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138',
 '13

In [51]:
consumer_details.filter(F.col('user_id') == '6228').show()

+-------+-----------+
|user_id|consumer_id|
+-------+-----------+
|   6228|    1242133|
+-------+-----------+



In [35]:
@F.udf(LongType())
def userID(x):
    if (x not in userids):
        return 0
    else:
        return 1
    
    

In [54]:
referenced = consumer_fraud.withColumn('Referenced',userID( F.col('user_id')))

In [56]:
referenced.filter(F.col('Referenced') == 0).show()

[Stage 47:>                                                         (0 + 1) / 1]

+-------+--------------+-----------------+----------+
|user_id|order_datetime|fraud_probability|Referenced|
+-------+--------------+-----------------+----------+
+-------+--------------+-----------------+----------+



                                                                                