In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import shapefile as shp
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

import sys
sys.path.append('../scripts')
from preprocess_script import count_outliers

In [None]:
# Build (or reuse) the Spark session that runs every job in this notebook.
# The options are applied in the order listed below.
_spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,      # render DataFrames as cell output
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.driver.memory": "9g",
    "spark.sql.session.timeZone": "Etc/UTC",
}
_session_builder = SparkSession.builder.appName("Dataset Joining")
for _option, _setting in _spark_options.items():
    _session_builder = _session_builder.config(_option, _setting)
spark = _session_builder.getOrCreate()

# Silence Spark's own logging for the remainder of the notebook.
spark.sparkContext.setLogLevel("OFF")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/09/11 07:41:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# The postcode-to-SA2 correspondence dataset was downloaded from https://data.gov.au/dataset/ds-dga-2c79581f-600e-4560-80a8-98adb1922dfc/details?q=correspondence%20asgs; the data dictionary is available at the same location.

In [None]:
# Load the postcode -> SA2 correspondence table.
# The CSV is read without schema inference, so every column arrives as a string.
# RATIO_FROM_TO is cast to double here so that the later max() aggregation
# compares values numerically rather than lexicographically (a string such as
# "9.5E-5" would otherwise sort above "0.98").
# NOTE(review): assumes RATIO_FROM_TO holds decimal ratios; unparseable values
# become null after the cast - confirm against the data dictionary.
# POSTCODE is intentionally left as a string.
correspondence_df = (
    spark.read.csv('../data/tables/correspondence/CG_POSTCODE_2021_SA2_2021.csv', header=True)
    .withColumn('RATIO_FROM_TO', F.col('RATIO_FROM_TO').cast('double'))
)
correspondence_df.show()

In [None]:
# For every postcode keep only the largest RATIO_FROM_TO value.
# The dict form of agg() names the result column "max(RATIO_FROM_TO)".
preferred_SA2 = (
    correspondence_df
    .groupBy('POSTCODE')
    .agg({'RATIO_FROM_TO': 'max'})
)
preferred_SA2.show(n=20, truncate=True)

In [None]:
# Give the aggregate column its original name back so it can be used as a join key.
preferred_SA2 = preferred_SA2.withColumnRenamed(
    existing="max(RATIO_FROM_TO)",
    new="RATIO_FROM_TO",
)

In [None]:
# Inspect the per-postcode maxima, then drop rows containing nulls and compare
# the row count before and after.
preferred_SA2.show()
rows_before_dropna = preferred_SA2.count()

# DataFrame.dropna() returns a new DataFrame instead of modifying in place,
# so the result has to be bound for the drop to take effect.
preferred_SA2 = preferred_SA2.dropna()

print(f"rows before dropna: {rows_before_dropna}")
# Last expression of the cell: the row count after dropping nulls.
preferred_SA2.count()

In [None]:
# Re-attach the remaining correspondence columns to each (POSTCODE, max ratio) pair.
# A left join keeps every aggregated postcode row.
preferred_SA2 = preferred_SA2.join(
    other=correspondence_df,
    on=["POSTCODE", "RATIO_FROM_TO"],
    how="left",
)

In [None]:
# Preview the first 20 rows of the joined postcode -> SA2 table.
preferred_SA2.show(n=20, truncate=True)

                                                                                                                                 

In [None]:
# Display only the postcode and its SA2 code.
# NOTE(review): the projection is not assigned, so preferred_SA2 itself keeps all
# of its columns - confirm whether this was meant to be display-only.
preferred_SA2.select(['POSTCODE', 'SA2_CODE_2021'])

In [None]:
# Discard every row that has a null in at least one column.
preferred_SA2 = preferred_SA2.dropna(how='any')

In [None]:
# Consumer information: the user-details table (parquet) and the
# pipe-delimited consumer table (CSV with a header row).
consumer_user_detail = spark.read.parquet("../data/tables/part_1/consumer_user_details.parquet")
consumer = (
    spark.read
    .options(delimiter="|", header="True")
    .csv("../data/tables/part_1/tbl_consumer.csv")
)

# Per-consumer fraud probabilities (CSV with a header row).
consumer_fraud_prob = (
    spark.read
    .options(header="True")
    .csv("../data/tables/part_1/consumer_fraud_probability.csv")
)

In [None]:
# Preview the first 20 rows of the user-details table.
consumer_user_detail.show(n=20, truncate=True)

In [None]:
# Preview the first 20 rows of the raw consumer table.
consumer.show(n=20, truncate=True)

                                                                                                                                 

In [None]:
# Preview the first 20 rows of the fraud-probability table.
consumer_fraud_prob.show(n=20, truncate=True)

In [None]:
# Attach the user-details columns to each consumer via consumer_id.
# A left join keeps every consumer row.
consumer = consumer.join(
    other=consumer_user_detail,
    on="consumer_id",
    how="left",
)

In [None]:
# Attach fraud-probability records to each consumer via user_id.
# A left join keeps consumers that have no fraud record (their fraud columns are null).
consumer = consumer.join(
    other=consumer_fraud_prob,
    on="user_id",
    how="left",
)

In [None]:
# Preview the first 20 rows of the consumer table after both joins.
consumer.show(n=20, truncate=True)

                                                                                                                                 

In [None]:
# Display the schema of the joined consumer table to check column types
# before fraud_probability is cast in the next cell.
consumer.schema

In [None]:
from pyspark.sql.types import DoubleType

# Convert fraud_probability to a double so it can be filled and aggregated numerically.
consumer = consumer.withColumn(
    'fraud_probability',
    consumer['fraud_probability'].cast(DoubleType()),
)

In [None]:
# Consumers without a fraud record have a null fraud_probability; treat it as 0.
consumer = consumer.na.fill(0, subset=['fraud_probability'])

                                                                                                                                 

In [None]:
# Highest fraud probability recorded for each user.
# The result column is named "max(fraud_probability)".
max_fraud = (
    consumer
    .groupBy('user_id')
    .agg(F.max(F.col('fraud_probability')))
)





                                                                                                                                 

[Stage 697:>                                                                                                         (0 + 4) / 4][Stage 697:>                                        (0 + 4) / 4][Stage 698:>                                        (0 + 0) / 1]

                                                                                                                                 

In [None]:
# Preview the first 20 per-user maxima.
max_fraud.show(n=20, truncate=True)

+-------+----------------------+
|user_id|max(fraud_probability)|
+-------+----------------------+
|     26|    10.604536233608997|
|     29|    15.492409971599415|
|    474|     24.74251000337883|
|    964|    10.000639641611324|
|   1677|     10.69910130025277|
|   1697|                   0.0|
|   1806|                   0.0|
|   1950|    16.924789438758555|
|   2040|    19.150047883546396|
|   2214|      9.06669796313919|
|   2250|      25.5950368734215|
|   2453|                   0.0|
|   2509|     17.64221987321122|
|   2529|     16.84817579923174|
|   2927|    14.570508610444977|
|   3091|    15.428265394555776|
|   3506|    30.199921043599648|
|   3764|      9.72214044955335|
|   4590|     18.88641597368393|
|   4823|                   0.0|
+-------+----------------------+
only showing top 20 rows

+-------+----------------------+
|user_id|max(fraud_probability)|
+-------+----------------------+
|     26|    10.604536233608997|
|     29|    15.492409971599415|
|    474|     24.

[Stage 710:>                                                                                                         (0 + 0) / 1]                                                                                                                                 

[Stage 730:>                                                                                                         (0 + 4) / 4]

                                                                                                                                 

[Stage 730:=====>              (1 + 3) / 4][Stage 732:>                   (0 + 1) / 1][Stage 733:>                   (0 + 1) / 2]                                                                                                                                 

In [None]:
# Put each user's maximum fraud probability on every one of that user's rows.
consumer = consumer.join(
    other=max_fraud,
    on='user_id',
    how='left',
)
consumer.show(n=20, truncate=True)



                                                                                                                                 

+-------+-----------+-----------------+--------------------+-----+--------+-----------+--------------+------------------+----------------------+
|user_id|consumer_id|             name|             address|state|postcode|     gender|order_datetime| fraud_probability|max(fraud_probability)|
+-------+-----------+-----------------+--------------------+-----+--------+-----------+--------------+------------------+----------------------+
|      6|     407340|    Karen Chapman|2706 Stewart Oval...|  NSW|    2033|     Female|    2021-12-12|10.459280127078758|    10.459280127078758|
|      7|     511685|     Andrea Jones|   122 Brandon Cliff|  QLD|    4606|     Female|          NULL|               0.0|                   0.0|
|      9|     650435|  Stephanie Reyes|5813 Denise Land ...|  NSW|    2482|     Female|    2021-11-17| 8.531261989227714|     10.58055311139687|
|      9|     650435|  Stephanie Reyes|5813 Denise Land ...|  NSW|    2482|     Female|    2021-12-13| 10.58055311139687|     10.5

[Stage 786:>                                                                                                         (0 + 4) / 4]



                                                                                                                                 

[Stage 793:>                                                                                                         (0 + 4) / 4]



                                                                                                                                 

In [None]:
# Keep only the per-user attributes and the per-user maximum fraud probability.
# The earlier join with the fraud table produced one row per fraud record, so
# once order_datetime and fraud_probability are dropped, a user with several
# records is left with several identical rows. dropDuplicates() collapses those
# exact duplicates so they do not reach the curated output.
consumer = (
    consumer
    .select('user_id', 'consumer_id', 'postcode', 'state', 'gender', 'max(fraud_probability)')
    .dropDuplicates()
)
consumer.show()

                                                                                                                                 

[Stage 872:>                                                                                                         (0 + 4) / 4]

[Stage 872:>                   (0 + 4) / 4][Stage 873:>                   (0 + 0) / 2][Stage 874:>                   (0 + 0) / 2]

[Stage 872:=====>              (1 + 3) / 4][Stage 873:>                   (0 + 1) / 2][Stage 874:>                   (0 + 0) / 2][Stage 872:=====>              (1 + 3) / 4][Stage 873:>                   (0 + 2) / 2][Stage 874:>                   (0 + 0) / 2]                                                                                                                                 

[Stage 876:>                                                                                                         (0 + 1) / 1]                                                                                                                                 

[Stage 880:>                                        (0 + 1) / 1][Stage 881:>                                        (0 + 3) / 4]

                                                                                                                                 

+-------+-----------+--------+-----+-----------+----------------------+
|user_id|consumer_id|postcode|state|     gender|max(fraud_probability)|
+-------+-----------+--------+-----+-----------+----------------------+
|      6|     407340|    2033|  NSW|     Female|    10.459280127078758|
|      7|     511685|    4606|  QLD|     Female|                   0.0|
|      9|     650435|    2482|  NSW|     Female|     10.58055311139687|
|      9|     650435|    2482|  NSW|     Female|     10.58055311139687|
| 142344|     389526|    1143|  NSW|       Male|                   0.0|
| 142348|     553507|    4723|  QLD|Undisclosed|                   0.0|
| 142352|    1220814|    6479|   WA|     Female|                   0.0|
| 142354|     338380|    3138|  VIC|       Male|                   0.0|
| 142357|     746099|    3226|  VIC|     Female|                   0.0|
| 142359|     895156|    3173|  VIC|     Female|                   0.0|
| 284819|     562577|    2175|  NSW|       Male|                

[Stage 881:>                                        (0 + 4) / 4][Stage 882:>                                        (0 + 0) / 4]

[Stage 881:>                   (0 + 4) / 4][Stage 882:>                   (0 + 0) / 4][Stage 883:>                   (0 + 0) / 1]

[Stage 881:=====>              (1 + 3) / 4][Stage 882:=====>              (1 + 1) / 4][Stage 883:>                   (0 + 0) / 1]                                                                                                                                 



[Stage 900:>                                                                                                         (0 + 4) / 4]







[Stage 903:>                                                                                                         (0 + 4) / 4]

[Stage 903:>                                        (0 + 4) / 4][Stage 906:>                                        (0 + 0) / 1]



                                                                                                                                 



                                                                                                                                 

[Stage 918:>                                                                                                         (0 + 4) / 4]







[Stage 919:>                                        (0 + 4) / 4][Stage 922:>                                        (0 + 0) / 1]



In [None]:
# Order the consumer table by user_id (ascending).
consumer = consumer.orderBy('user_id')





[Stage 985:>                                                                                                         (0 + 4) / 4][Stage 985:>                                        (0 + 4) / 4][Stage 986:>                                        (0 + 0) / 4]

[Stage 985:=====>              (1 + 3) / 4][Stage 986:>                   (0 + 1) / 4][Stage 987:>                   (0 + 0) / 4]



[Stage 986:>                   (0 + 4) / 4][Stage 987:>                   (0 + 0) / 4][Stage 988:>                   (0 + 0) / 4]





[Stage 987:>                                        (0 + 4) / 4][Stage 988:>                                        (0 + 0) / 4]

[Stage 987:>                   (0 + 4) / 4][Stage 988:>                   (0 + 0) / 4][Stage 991:>                   (0 + 0) / 4]







[Stage 991:>                                                                                                         (0 + 4) / 4]                                                                                                                                 

[Stage 998:>                                                                                                         (0 + 4) / 4]



                                                                                                                                 

In [None]:
# Preview the first 20 rows of the sorted consumer table.
consumer.show(n=20, truncate=True)



[Stage 1006:>                  (0 + 4) / 4][Stage 1007:>                  (0 + 0) / 1][Stage 1008:>                  (0 + 0) / 1]

[Stage 1006:====>              (1 + 3) / 4][Stage 1007:>                  (0 + 1) / 1][Stage 1008:>                  (0 + 0) / 1]



[Stage 1013:>                                                                                                        (0 + 4) / 4]

                                                                                                                                 

[Stage 1014:>                                                                                                        (0 + 4) / 4]

[Stage 1014:>                  (0 + 4) / 4][Stage 1015:>                  (0 + 0) / 4][Stage 1016:>                  (0 + 0) / 1]

[Stage 1014:====>              (1 + 3) / 4][Stage 1015:>                  (0 + 1) / 4][Stage 1016:>                  (0 + 0) / 1]



[Stage 1015:>                  (0 + 4) / 4][Stage 1016:>                  (0 + 0) / 1][Stage 1017:>                  (0 + 0) / 1]

[Stage 1015:====>              (1 + 3) / 4][Stage 1016:>                  (0 + 1) / 1][Stage 1017:>                  (0 + 0) / 1]                                                                                                                                 

[Stage 1022:>                                                                                                        (0 + 4) / 4]                                                                                                                                 

+-------+-----------+--------+-----+-----------+----------------------+
|user_id|consumer_id|postcode|state|     gender|max(fraud_probability)|
+-------+-----------+--------+-----+-----------+----------------------+
|      1|    1195503|    6935|   WA|     Female|     9.805431136520959|
|      2|     179208|    2782|  NSW|     Female|    10.069850934775245|
|      2|     179208|    2782|  NSW|     Female|    10.069850934775245|
|      3|    1194530|     862|   NT|     Female|     8.300636455314633|
|      4|     154128|    2780|  NSW|     Female|     9.633302411090419|
|      5|     712975|    6355|   WA|     Female|    27.496186536467164|
|      5|     712975|    6355|   WA|     Female|    27.496186536467164|
|      5|     712975|    6355|   WA|     Female|    27.496186536467164|
|      6|     407340|    2033|  NSW|     Female|    10.459280127078758|
|      7|     511685|    4606|  QLD|     Female|                   0.0|
|      8|     448088|    6056|   WA|       Male|                

[Stage 1023:>                                       (0 + 4) / 4][Stage 1024:>                                       (0 + 0) / 4]

[Stage 1023:>                  (0 + 4) / 4][Stage 1024:>                  (0 + 0) / 4][Stage 1025:>                  (0 + 0) / 1]

[Stage 1023:====>              (1 + 3) / 4][Stage 1024:>                  (0 + 1) / 4][Stage 1025:>                  (0 + 0) / 1]

[Stage 1024:>                                       (0 + 4) / 4][Stage 1025:>                                       (0 + 0) / 1]

                                                                                                                                 



[Stage 1038:>                                                                                                        (0 + 4) / 4]

[Stage 1038:>                  (0 + 4) / 4][Stage 1039:>                  (0 + 0) / 2][Stage 1040:>                  (0 + 0) / 1]

[Stage 1038:====>              (1 + 3) / 4][Stage 1039:>                  (0 + 1) / 2][Stage 1040:>                  (0 + 0) / 1]                                                                                                                                 

[Stage 1042:>                  (0 + 4) / 4][Stage 1043:>                  (0 + 0) / 2][Stage 1044:>                  (0 + 0) / 4]

[Stage 1042:====>              (1 + 3) / 4][Stage 1043:>                  (0 + 1) / 2][Stage 1044:>                  (0 + 0) / 4]

                                                                                                                                 

[Stage 1044:>                                                                                                        (0 + 4) / 4][Stage 1044:>                                       (0 + 4) / 4][Stage 1047:>                                       (0 + 0) / 4]

[Stage 1044:>                  (0 + 4) / 4][Stage 1047:>                  (0 + 0) / 4][Stage 1048:>                  (0 + 0) / 4][Stage 1044:====>              (1 + 3) / 4][Stage 1047:>                  (0 + 1) / 4][Stage 1048:>                  (0 + 0) / 4]





[Stage 1048:>                                                                                                        (0 + 4) / 4]

[Stage 1048:>                                       (0 + 4) / 4][Stage 1049:>                                       (0 + 0) / 1]

                                                                                                                                 



[Stage 1052:>                                                                                                        (0 + 4) / 4]









In [None]:
# Persist the curated consumer table as parquet, replacing any previous output.
consumer.write.parquet('../data/curated/customer/', mode='overwrite')



[Stage 1098:>                                       (0 + 4) / 4][Stage 1099:>                                       (0 + 0) / 4]



[Stage 1099:>                                                                                                        (0 + 4) / 4]





[Stage 1102:>                                                                                                        (0 + 4) / 4]



[Stage 1105:>                                                                                                        (0 + 4) / 4]

[Stage 1109:>                                                                                                        (0 + 4) / 4]



                                                                                                                                 

[Stage 1120:>                                       (0 + 4) / 4][Stage 1121:>                                       (0 + 0) / 4]





[Stage 1121:>                                                                                                        (0 + 4) / 4]





                                                                                                                                 

[Stage 1128:>                                                                                                        (0 + 4) / 4]





                                                                                                                                 