In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd

In [2]:
# Create a Spark session (the entry point used to run every Spark job below)
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    # Evaluate and render a DataFrame when it is the last expression of a cell
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Fix the session time zone so timestamp handling does not depend on the host
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    # Key was previously misspelled as "spark.executer.memory", which is not a
    # recognised Spark property, so the 4g setting never took effect.
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

22/10/05 13:43:27 WARN Utils: Your hostname, Xiaotongs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.13.0.57 instead (on interface en0)
22/10/05 13:43:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/10/05 13:43:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Load the curated transaction data, then show its schema and row count.
# NOTE(review): the last recorded run of this cell failed on a path ending in
# "full_data_with_fraud", and the next cell filters on an `is_fraud` column.
# Confirm that this directory is the intended input and carries that column.
curated_path = "../data/curated/full_data_without_fraud/"
sdf = spark.read.parquet(curated_path)
sdf.printSchema()
sdf.count()

AnalysisException: Path does not exist: file:/Users/xiaotongwang/Desktop/generic-buy-now-pay-later-project-group-14/data/curated/full_data_with_fraud

In [None]:
# Keep only the transactions whose `is_fraud` flag equals 0, then report
# how many rows remain.
not_fraud = F.col('is_fraud') == 0
sdf = sdf.where(not_fraud)
sdf.count()

13221647

In [None]:
# Split the transactions into two windows on `order_datetime`
# (lower bound inclusive, upper bound exclusive):
#   - feature window: 2021-02-28 up to 2021-08-28
#   - label window:   2022-02-28 up to 2022-08-28
order_dt = F.col('order_datetime')
in_feature_window = (order_dt >= '2021-02-28') & (order_dt < '2021-08-28')
in_label_window = (order_dt >= '2022-02-28') & (order_dt < '2022-08-28')

train_sdf = sdf.filter(in_feature_window)
label_sdf = sdf.filter(in_label_window)

In [None]:
# Number of transactions in the feature window and in the label window
n_train_rows = train_sdf.count()
n_label_rows = label_sdf.count()
n_train_rows, n_label_rows

(3457142, 4119273)

### Created Features
The following features are created by aggregating, for each merchant, the records dated from 2021-02-28 (inclusive) to 2021-08-28 (exclusive). They are used as input features for the model.
1. Total number of consumers
2. Average transaction dollar value
3. Total number of transactions
4. Mean income of consumers
5. Revenue level
6. BNPL revenue = sum over all transactions of (take rate × transaction dollar value)
7. Number of distinct postcodes
8. Tag (one hot encoding)

The following values are created by aggregating, for each merchant, the records dated from 2022-02-28 (inclusive) to 2022-08-28 (exclusive). They are the target values to be predicted by the model, and they are used as features in the final ranking system.
1. Total number of consumers
2. BNPL revenue
3. Total number of transactions

In [None]:
# Per-merchant features aggregated over the feature window.
# NOTE(review): F.first() returns an arbitrary row's value within each group;
# this presumably relies on `revenue_level` and `tags` being constant per
# merchant — TODO confirm.
# NOTE(review): if `take_rate` is stored as a percentage, `total_revenue` is
# 100x the dollar amount — confirm the unit.
feature_aggs = [
    F.countDistinct('consumer_id').alias('total_num_consumer'),
    F.mean('dollar_value').alias('avg_dollar_value'),
    F.countDistinct('order_id').alias('total_num_transaction'),
    F.mean('mean_total_income').alias('mean_income'),
    F.first('revenue_level').alias('revenue_level'),
    # revenue per transaction = dollar_value * take_rate, summed per merchant
    F.sum(F.col('dollar_value') * F.col('take_rate')).alias('total_revenue'),
    F.countDistinct('postcode').alias('total_num_postcode'),
    F.first('tags').alias('tag'),
]
train_data = train_sdf.groupBy('merchant_abn').agg(*feature_aggs)
train_data

merchant_abn,total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,revenue_level,total_revenue,total_num_postcode,tag
10023283211,808,209.0251604007696,823,62506.642770352366,e,30964.988492208053,719,furniture
10142254217,731,41.1196593815928,750,62693.954666666665,b,130143.71547230528,642,cable
10187291046,87,111.0840871392216,87,61060.0459770115,b,31795.59789319502,87,watch
10192359162,107,451.1432080236007,107,65637.3831775701,a,305563.80254357896,107,music
10206519221,2244,39.20921658203574,2370,62122.52278481013,a,589149.8606977111,1619,gift
10255988167,218,389.5552654520502,218,63146.61926605504,b,366867.58137013006,211,computer
10264435225,1238,114.10783402533238,1272,62006.31132075472,c,346896.95929006604,1018,watch
10279061213,130,308.35150039873355,130,63676.28461538462,a,228889.32027512783,127,computer
10323485998,2475,128.98933167858434,2621,62792.28691339184,a,2234715.708497144,1730,furniture
10342410215,238,378.0169713940928,238,61579.6512605042,a,570397.3822040077,227,computer


In [None]:
# Per-merchant target values aggregated over the label window; the `y_` prefix
# distinguishes them from the same-named feature columns.
label_aggs = [
    F.countDistinct('consumer_id').alias('y_total_num_consumer'),
    F.sum(F.col('dollar_value') * F.col('take_rate')).alias('y_total_revenue'),
    F.countDistinct('order_id').alias('y_total_num_transaction'),
]
label = label_sdf.groupBy('merchant_abn').agg(*label_aggs)

In [None]:
# Number of merchants with feature rows and number of merchants with label rows
n_feature_merchants = train_data.count()
n_label_merchants = label.count()
n_feature_merchants, n_label_merchants

(3953, 3789)

In [None]:
# Attach the label columns to the feature table. A left join keeps every
# merchant that has historical (feature-window) data, because a merchant
# without history cannot be given a prediction. Merchants with no rows in the
# label window keep null `y_` columns.
# Caveat: this cell overwrites `train_data`, so run it once per kernel session;
# re-running it would join the label columns a second time.
train_data = train_data.join(label, on="merchant_abn", how="left")
train_data

merchant_abn,total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,revenue_level,total_revenue,total_num_postcode,tag,y_total_num_consumer,y_total_revenue,y_total_num_transaction
10023283211,808,209.0251604007696,823,62506.642770352366,e,30964.988492208053,719,furniture,980.0,38940.2914088986,1002.0
10142254217,731,41.1196593815928,750,62693.954666666665,b,130143.71547230528,642,cable,918.0,154467.92091896123,925.0
10187291046,87,111.0840871392216,87,61060.0459770115,b,31795.59789319502,87,watch,99.0,41683.21121325837,100.0
10192359162,107,451.1432080236007,107,65637.3831775701,a,305563.80254357896,107,music,107.0,290071.0062352741,107.0
10206519221,2244,39.209216582035744,2370,62122.52278481013,a,589149.8606977111,1619,gift,2662.0,666766.4067054288,2811.0
10255988167,218,389.5552654520503,218,63146.61926605504,b,366867.5813701301,211,computer,235.0,378005.14673149673,236.0
10264435225,1238,114.10783402533238,1272,62006.31132075472,c,346896.9592900661,1018,watch,1519.0,435003.6795629894,1566.0
10279061213,130,308.35150039873355,130,63676.28461538462,a,228889.32027512783,127,computer,161.0,273146.77071188006,161.0
10323485998,2475,128.98933167858425,2621,62792.28691339184,a,2234715.708497144,1730,furniture,2974.0,2765582.918877613,3191.0
10342410215,238,378.0169713940928,238,61579.6512605042,a,570397.3822040078,227,computer,273.0,661666.9517852827,275.0


In [None]:
# Persist the joined feature/label table as parquet, replacing any previous output
train_data.write.mode('overwrite').parquet("../data/curated/train_data")