<h1>Creating Table for Ranking</h1>

In [None]:
from pyspark.sql import SparkSession, functions as F

# Session-level Spark settings, applied to the builder in the order listed.
_session_options = [
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ("spark.executor.memory", "2g"),
    ("spark.driver.memory", "4g"),
]

_builder = SparkSession.builder.appName("Ranking Features")
for _key, _value in _session_options:
    _builder = _builder.config(_key, _value)

# Shared session handle used by every later cell.
spark = _builder.getOrCreate()

In [None]:
# Curated inputs for the ranking table: the cleaned transactions (parquet)
# plus three CSV files whose first line is treated as the column header.
transaction_data = spark.read.parquet('../data/curated/cleaned_data.parquet')
merchant_fraud_rates = (
    spark.read
    .option("header", True)
    .csv('../data/curated/merchant_fraud_rate.csv')
)
consumer_fraud_rates = (
    spark.read
    .option("header", True)
    .csv('../data/curated/consumer_fraud_rate.csv')
)
future_profit = (
    spark.read
    .option("header", True)
    .csv('../data/curated/future_predictions.csv')
)

In [None]:
# Attach the consumer fraud-rate columns to each transaction, matching on
# user_id; rows without a match on either side are dropped (inner join).
transaction_data = transaction_data.join(
    consumer_fraud_rates,
    on="user_id",
    how="inner",
)

In [None]:
# Per-merchant customer profile: one row per merchant_abn holding the mean of
# Median_age_persons and the mean of fraud_rate over that merchant's rows.
# The cast makes the numeric conversion of fraud_rate explicit instead of
# relying on implicit coercion inside avg().
# NOTE(review): fraud_rate presumably comes from the consumer fraud-rate CSV,
# which is read without a schema and therefore arrives as a string column --
# confirm against the input files.
merchant_properties = transaction_data.groupBy("merchant_abn").agg(
    F.avg("Median_age_persons").alias("avg_expected_customer_age"),
    F.avg(F.col("fraud_rate").cast("double")).alias("avg_customer_fraud_rate"),
)

In [None]:
# Remove the "_c0" column from the future-profit table before joining.
# NOTE(review): "_c0" is presumably an unnamed index column left in the CSV by
# whatever wrote it (e.g. a pandas export) -- confirm against the file.
future_profit = future_profit.drop("_c0")

In [None]:
# Assemble the ranking table: customer-profile aggregates, future-profit
# columns and merchant fraud-rate columns, all matched on merchant_abn.
# Both joins are inner joins, so a merchant missing from any of the three
# inputs does not appear in the result.
merchant_ranking_properties = (
    merchant_properties
    .join(future_profit, on="merchant_abn", how="inner")
    .join(merchant_fraud_rates, on="merchant_abn", how="inner")
)

In [None]:
# Persist the ranking table as CSV with a header row, replacing any output
# already present at the target path.
merchant_ranking_properties.write.csv(
    '../data/curated/merchant_ranking_properties.csv',
    mode='overwrite',
    header=True,
)