In [13]:
import sys
sys.path.append('../scripts/')
from utility import read_file, create_folder, temp_record_query, temp_record_sdf

In [14]:
from pyspark.sql import SparkSession

In [15]:
spark = (
    # Create a spark session (which will run spark jobs)
    SparkSession.builder.appName("Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config('spark.executor.memory','10g')
    .config('spark.driver.memory','12g')
    .config('spark.driver.maxResultsSize', '10GiB')
    # .config("spark.network.timeout", "3600s")
    # .master("local[6]")
    .getOrCreate()
    )

### 1. Connect Transactions and Merchants AS `sdf`

In [16]:
print("================ Transactions ================")
# transactions_sdf = read_file(spark, 'transactions_20210228_20210827_snapshot')
transactions_sdf = read_file(spark, 'transactions_20210828_20220227_snapshot')
# transactions_sdf = read_file(spark, 'transactions_20220228_20220828_snapshot')
temp = transactions_sdf.dropna(how='any').count()
print( f'There are {transactions_sdf.count()} data, and {temp} of them have no null value.')

|> Loading File...
|> Loading Finished!
-RECORD 0----------------------------------------------
 user_id        | 14935                                
 merchant_abn   | 79417999332                          
 dollar_value   | 136.06570809815838                   
 order_id       | 23acbb7b-cf98-4580-9775-86b8e0a2bd88 
 order_datetime | 2021-11-26                           
only showing top 1 row



                                                                                

There are 4508106 data, and 4508106 of them have no null value.


In [17]:
print("================ Merchant ================")
merchants_sdf = read_file(spark, 'merchants_data.parquet', '../data/curated/', truncate=20)
temp = merchants_sdf.dropna(how='any').count()
print( f'There are {merchants_sdf.count()} data, and {temp} of them have no null value.')

|> Loading File...
|> Loading Finished!
-RECORD 0-----------------------------
 merchant_name | Felis Limited        
 tags          | furniture, home f... 
 merchant_abn  | 10023283211          
 take_rate     | 0.18                 
 type          | e                    
only showing top 1 row

There are 4026 data, and 4026 of them have no null value.


- Known that Transactions and Merchants table have no null value

In [18]:
sdf = transactions_sdf.join(merchants_sdf, how='left', on='merchant_abn')
temp = sdf.dropna(how='any', subset=['merchant_name']).count()
print( f'There are {sdf.count()} data, and {temp} of them have no null value.')

There are 4508106 data, and 4323692 of them have no null value.


- After connect the Transactions and Merchants Table by `merchant_abn`, we find that some Transactions not from the merchants in the table. Because they are not in our list of rank therefore, delete those null value.

In [19]:
# sdf = sdf.dropna(how='any', subset=['merchant_name'])

- There are 4026 merchants, and for `transactions_20210228_20210827_snapshot` have 3643266 records. 

    After combination, there are **3494038** rest. **3643266 - 3494038 = 149228** records has been deleted. 
    
    `149228 / 3643266 = 0.04095995186736297`

In [20]:
sdf.show(5)

+------------+-------+------------------+--------------------+--------------+--------------------+--------------------+---------+----+
|merchant_abn|user_id|      dollar_value|            order_id|order_datetime|       merchant_name|                tags|take_rate|type|
+------------+-------+------------------+--------------------+--------------+--------------------+--------------------+---------+----+
| 79417999332|  14935|136.06570809815838|23acbb7b-cf98-458...|    2021-11-26|Phasellus At Company|gift, card, novel...|     4.95|   b|
| 46451548968|      1| 72.61581642788431|76bab304-fa2d-400...|    2021-11-26|Tempus Eu Ligula ...|health and beauty...|     6.04|   a|
| 89518629617|  14936|3.0783487174439297|a2ae446a-2959-41c...|    2021-11-26|Vulputate Velit E...|tent and awning s...|     3.09|   c|
| 49167531725|      1| 51.58228625503599|7080c274-17f7-4cc...|    2021-11-26|     Felis Institute|digital goods: bo...|     6.42|   a|
| 31101120643|  14936|25.228114942417797|8e301c0f-06ab-

### 2. `sdf` join with Customer

In [21]:
print("================ Customer ================")
customer_sdf = read_file(spark, 'new_consumer_data.parquet', '../data/curated/')
temp = customer_sdf.dropna(how='any').count()
print( f'There are {customer_sdf.count()} data, and {temp} of them have no null value.')

|> Loading File...
|> Loading Finished!
-RECORD 0---------------------------------
 consumer_id | 680810                     
 name        | Courtney Mendez            
 address     | 802 Knapp Harbors Apt. 769 
 state       | QLD                        
 postcode    | 9013                       
 gender      | Female                     
 user_id     | 71144                      
only showing top 1 row

There are 499999 data, and 499999 of them have no null value.


In [22]:
sdf = sdf.join(customer_sdf, how='left', on='user_id')
temp = sdf.dropna(how='any', subset=['consumer_id']).count()
print( f'There are {sdf.count()} data, and {temp} of them have no null value.')

There are 4508106 data, and 4508106 of them have no null value.


In [23]:
sdf.show(5)



+-------+------------+------------------+--------------------+--------------+--------------------+--------------------+---------+----+-----------+---------------+--------------------+-----+--------+------+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|       merchant_name|                tags|take_rate|type|consumer_id|           name|             address|state|postcode|gender|
+-------+------------+------------------+--------------------+--------------+--------------------+--------------------+---------+----+-----------+---------------+--------------------+-----+--------+------+
|  14935| 79417999332|136.06570809815838|23acbb7b-cf98-458...|    2021-11-26|Phasellus At Company|gift, card, novel...|     4.95|   b|    1059280|  Cameron Adams|0280 Carpenter Lodge|  QLD|    4563|  Male|
|  14935| 72472909171|52.973387548612614|01d968a6-dde1-48c...|    2021-12-01|   Nullam Consulting|digital goods: bo...|     6.33|   a|    1059280|  Cameron Adams|0280 Carpenter

                                                                                

### 3. `sdf` join with the external dataset  --- rrm

In [24]:
print("================ RRM_External ================")
rrm_sdf = read_file(spark, 'external_rrm.parquet', '../data/curated/', truncate=20)
temp = rrm_sdf.dropna(how='any').count()
print( f'There are {rrm_sdf.count()} data, and {temp} of them have no null value.')

|> Loading File...
|> Loading Finished!
-RECORD 0----------
 Date | 2020-02-21 
 AUS  | 1          
 NSW  | 1          
 VIC  | 1          
 QLD  | -1         
 SA   | 2          
 WA   | 2          
 TAS  | -1         
 NT   | 4          
 ACT  | 4          
only showing top 1 row

There are 761 data, and 761 of them have no null value.


In [25]:
sdf = sdf.join(rrm_sdf, sdf.order_datetime == rrm_sdf.Date, 'left')
temp = sdf.dropna(how='any', subset=['Date']).count()
print( f'There are {sdf.count()} data, and {temp} of them have no null value.')

There are 4508106 data, and 4508106 of them have no null value.


                                                                                

In [26]:
sdf.show(5)

+-------+------------+------------------+--------------------+--------------+--------------------+--------------------+---------+----+-----------+---------------+--------------------+-----+--------+------+----------+---+---+---+---+---+---+---+---+---+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|       merchant_name|                tags|take_rate|type|consumer_id|           name|             address|state|postcode|gender|      Date|AUS|NSW|VIC|QLD| SA| WA|TAS| NT|ACT|
+-------+------------+------------------+--------------------+--------------+--------------------+--------------------+---------+----+-----------+---------------+--------------------+-----+--------+------+----------+---+---+---+---+---+---+---+---+---+
|  14935| 79417999332|136.06570809815838|23acbb7b-cf98-458...|    2021-11-26|Phasellus At Company|gift, card, novel...|     4.95|   b|    1059280|  Cameron Adams|0280 Carpenter Lodge|  QLD|    4563|  Male|2021-11-26| -1|-10| -9|  5|  8| 16| 

In [27]:
sdf = sdf.drop('Date')

- 处理 external table 的信息，只保留其所对应的洲的数值

In [28]:
import pandas as pd

In [29]:
cols = ['state', 'AUS', 'NSW', 'VIC', 'QLD', 'SA', 'WA', 'TAS', 'NT', 'ACT']
temp_df = sdf.select(['order_id'] + cols).toDF('order_id', 'state', 'AUS', 'NSW', 'VIC', 'QLD', 'SA', 'WA', 'TAS', 'NT', 'ACT')
temp_df = pd.DataFrame(temp_df.collect(), columns=['order_id'] + cols)

                                                                                

In [30]:
temp_df['target'] = temp_df.apply(lambda x: x[x['state']], axis=1)
temp_df = temp_df.drop(cols, axis=1)

In [31]:
temp_sdf = spark.createDataFrame(temp_df)
temp_sdf.show(5)

22/09/20 22:52:41 WARN TaskSetManager: Stage 133 contains a task of very large size (24542 KiB). The maximum recommended task size is 1000 KiB.


[Stage 133:>                                                        (0 + 1) / 1]

22/09/20 22:52:46 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 133 (TID 693): Attempting to kill Python Worker
+--------------------+------+
|            order_id|target|
+--------------------+------+
|23acbb7b-cf98-458...|     5|
|76bab304-fa2d-400...|    16|
|a2ae446a-2959-41c...|     8|
|7080c274-17f7-4cc...|    16|
|8e301c0f-06ab-45c...|     8|
+--------------------+------+
only showing top 5 rows



                                                                                

In [32]:
sdf = sdf.join(temp_sdf, on='order_id', how='left')

In [33]:
sdf.show(5)

[Stage 134:>  (1 + 7) / 8][Stage 135:>  (0 + 1) / 8][Stage 136:>  (0 + 0) / 1]8]

22/09/20 22:52:47 WARN TaskSetManager: Stage 135 contains a task of very large size (24542 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

+--------------------+-------+------------+------------------+--------------+--------------------+--------------------+---------+----+-----------+----------------+--------------------+-----+--------+------+---+---+---+---+---+---+---+---+---+------+
|            order_id|user_id|merchant_abn|      dollar_value|order_datetime|       merchant_name|                tags|take_rate|type|consumer_id|            name|             address|state|postcode|gender|AUS|NSW|VIC|QLD| SA| WA|TAS| NT|ACT|target|
+--------------------+-------+------------+------------------+--------------+--------------------+--------------------+---------+----+-----------+----------------+--------------------+-----+--------+------+---+---+---+---+---+---+---+---+---+------+
|01d968a6-dde1-48c...|  14935| 72472909171|52.973387548612614|    2021-12-01|   Nullam Consulting|digital goods: bo...|     6.33|   a|    1059280|   Cameron Adams|0280 Carpenter Lodge|  QLD|    4563|  Male|  0| -7| -8|  6|  9| 17|  8| 18|  5|     6|


### 4. `sdf` join with the external dataset  --- ncd

In [34]:
print("================  ================")
ncd_sdf = read_file(spark, 'external_ncd.parquet', '../data/curated/')
temp = ncd_sdf.dropna(how='any').count()
print( f'There are {ncd_sdf.count()} data, and {temp} have no null value.')

|> Loading File...
|> Loading Finished!
-RECORD 0---------------------
 Date            | 2020-01-25 
 New cases / day | 4          
only showing top 1 row

There are 959 data, and 959 have no null value.


In [35]:
ncd_sdf.show(5)

+----------+---------------+
|      Date|New cases / day|
+----------+---------------+
|2020-01-25|              4|
|2020-01-26|              0|
|2020-01-27|              1|
|2020-01-28|              0|
|2020-01-29|              4|
+----------+---------------+
only showing top 5 rows



In [36]:
sdf = sdf.join(ncd_sdf, sdf.order_datetime == ncd_sdf.Date, 'left')
sdf.show(5)

22/09/20 22:53:01 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 161:>  (1 + 7) / 8][Stage 162:>  (0 + 1) / 8][Stage 163:>  (0 + 0) / 1]

22/09/20 22:53:02 WARN TaskSetManager: Stage 162 contains a task of very large size (24542 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

+--------------------+-------+------------+------------------+--------------+--------------------+--------------------+---------+----+-----------+----------------+--------------------+-----+--------+------+---+---+---+---+---+---+---+---+---+------+----------+---------------+
|            order_id|user_id|merchant_abn|      dollar_value|order_datetime|       merchant_name|                tags|take_rate|type|consumer_id|            name|             address|state|postcode|gender|AUS|NSW|VIC|QLD| SA| WA|TAS| NT|ACT|target|      Date|New cases / day|
+--------------------+-------+------------+------------------+--------------+--------------------+--------------------+---------+----+-----------+----------------+--------------------+-----+--------+------+---+---+---+---+---+---+---+---+---+------+----------+---------------+
|01d968a6-dde1-48c...|  14935| 72472909171|52.973387548612614|    2021-12-01|   Nullam Consulting|digital goods: bo...|     6.33|   a|    1059280|   Cameron Adams|0280 C

In [37]:
temp = sdf.dropna(how='any', subset=['Date']).count()
print( f'There are {sdf.count()} data, and {temp} have no null value.')

22/09/20 22:53:15 WARN TaskSetManager: Stage 180 contains a task of very large size (24542 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/09/20 22:53:35 WARN TaskSetManager: Stage 193 contains a task of very large size (24542 KiB). The maximum recommended task size is 1000 KiB.




There are 4508106 data, and 4508106 have no null value.


                                                                                

In [38]:
sdf = sdf.drop('Date')

In [39]:
# temp = sdf.to_pandas_on_spark()
# temp[temp.Date.isna()]

In [40]:
path = '../data/curated/data'
if (create_folder(path)):
    sdf.write.partitionBy('order_datetime').parquet(path, mode='append')
else:
    sdf.write.partitionBy('order_datetime').parquet(path, mode='overwrite')


|> Create Successfully!
22/09/20 22:53:53 WARN TaskSetManager: Stage 208 contains a task of very large size (24542 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

### 5. `sdf` join with the fraud datasets