In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd

In [None]:
# Create a Spark session (the entry point through which all Spark jobs in this notebook run).
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    # render a DataFrame as a table when it is the last expression of a cell
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # fix the session time zone so timestamp handling does not depend on the local machine
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    # NOTE: the key was previously misspelled "spark.executer.memory", which is not a
    # setting Spark recognises, so the executor memory request never took effect.
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [None]:
# Load the curated transaction table and keep only the rows whose
# `is_fraud` flag is 0 (i.e. discard transactions marked as fraud).
sdf = (
    spark.read
    .parquet("../data/curated/full_data_without_fraud/")
    .filter(F.col('is_fraud') == 0)
)

# number of remaining (non-fraud) transactions
sdf.count()

In [None]:
# Split the transactions into two half-open date windows, one year apart:
#   train_sdf -> 2021-02-28 <= order_datetime < 2021-08-28  (used to build the features)
#   label_sdf -> 2022-02-28 <= order_datetime < 2022-08-28  (used to build the labels)
train_sdf = (
    sdf
    .filter(F.col('order_datetime') >= '2021-02-28')
    .filter(F.col('order_datetime') < '2021-08-28')
)
label_sdf = (
    sdf
    .filter(F.col('order_datetime') >= '2022-02-28')
    .filter(F.col('order_datetime') < '2022-08-28')
)

In [None]:
# Transaction counts in the 2021 (feature) window and the 2022 (label) window.
train_sdf.count(), label_sdf.count()

### Created Features
The following features are created by aggregating each merchant's records from 2021-02-28 (inclusive) to 2021-08-28 (exclusive). They are used as input features for the model.
1. Total number of consumers
2. Average transaction dollar value
3. Total number of transactions
4. Mean income of consumers
5. Revenue level
6. BNPL revenue = sum over all transactions of (take rate * transaction dollar value)
7. Number of distinct postcodes
8. Tag (one hot encoding)

The following values are created by aggregating each merchant's records from 2022-02-28 (inclusive) to 2022-08-28 (exclusive). They are the targets to be predicted by the model, and they are used as features in the final ranking system.
1. Total number of consumers
2. BNPL revenue
3. Total number of transactions

In [None]:
# Collapse the 2021 window to one feature row per merchant.
train_data = (
    train_sdf
    .groupBy('merchant_abn')
    .agg(
        # distinct consumers who transacted with the merchant
        F.countDistinct('consumer_id').alias('total_num_consumer'),
        # mean dollar value per transaction row
        F.mean('dollar_value').alias('avg_dollar_value'),
        # distinct orders placed with the merchant
        F.countDistinct('order_id').alias('total_num_transaction'),
        # mean of `mean_total_income` across the merchant's transaction rows
        F.mean('mean_total_income').alias('mean_income'),
        # first value seen in the group (presumably constant per merchant; verify upstream)
        F.first('revenue_level').alias('revenue_level'),
        # BNPL revenue: sum of dollar_value * take_rate over the merchant's transactions
        F.sum(F.col('dollar_value') * F.col('take_rate')).alias('total_revenue'),
        # distinct postcodes appearing in the merchant's transactions
        F.countDistinct('postcode').alias('total_num_postcode'),
        # first value seen in the group (presumably constant per merchant; verify upstream)
        F.first('tags').alias('tag'),
    )
)
train_data

In [None]:
# Collapse the 2022 window to one target row per merchant; the `y_` prefix
# marks these columns as the values to be predicted.
label = (
    label_sdf
    .groupBy('merchant_abn')
    .agg(
        # distinct consumers who transacted with the merchant
        F.countDistinct('consumer_id').alias('y_total_num_consumer'),
        # BNPL revenue: sum of dollar_value * take_rate over the merchant's transactions
        F.sum(F.col('dollar_value') * F.col('take_rate')).alias('y_total_revenue'),
        # distinct orders placed with the merchant
        F.countDistinct('order_id').alias('y_total_num_transaction'),
    )
)

In [None]:
# Number of merchants (one row per merchant_abn) in the feature table and in the label table.
train_data.count(), label.count() 

In [None]:
# Attach the 2022 label columns to the 2021 feature table.
#
# A left join is used because a merchant with no historical data cannot have its
# future values predicted: merchants that appear only in `label` are dropped, while
# merchants that appear only in `train_data` are kept with null y_* columns.
#
# This cell rebinds `train_data` to its own joined result, so re-running it would join
# `label` a second time and produce duplicate y_* columns. Dropping any label columns
# that are already present first (a no-op on the first run) keeps the cell idempotent.
train_data = (
    train_data
    .drop(*[c for c in label.columns if c != "merchant_abn"])
    .join(label, ["merchant_abn"], how="left")
)
train_data

In [None]:
# Persist the merchant-level feature/label table as parquet, replacing any previous output.
train_data.write.parquet("../data/curated/train_data", mode='overwrite')