In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.regression import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql.window import Window

In [2]:
# Create a spark session
spark = (
    SparkSession.builder.appName("Initial Ranking Model")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.driver.memory", "12g")
    .config("spark.driver.maxResultSize", "16G")
    .config("spark.executor.memory", "16G")
    .config("spark.sql.files.maxPartitionBytes", "64MB")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.network.timeout", "600s")
    .getOrCreate()
)

24/10/05 04:49:05 WARN Utils: Your hostname, codespaces-c6855a resolves to a loopback address: 127.0.0.1; using 10.0.0.128 instead (on interface eth0)
24/10/05 04:49:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/05 04:49:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/05 04:49:06 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/10/05 04:49:06 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


We aim to create a function that will create the initial ranking. We will use the following:
- The earning growth rate
- The average customer fraud probability
- The merchant fraud probability
- The customer retention rate

We want all of these values to be between 0 and 1. Then we will give them a weight and sum them up to get the final ranking.

### Revenue Growth Rate

In [3]:
future_revenue = spark.read.parquet('../data/curated/predicted_monthly_revenue.parquet')
transactions = spark.read.parquet('../data/curated/all_details/')
merchant_monthly = transactions.groupBy(F.date_format(F.col('order_datetime'), 'yyyy-MM'), 'merchant_abn').agg(F.sum('dollar_value'))
merchant_monthly = merchant_monthly.withColumn('month', F.month(F.col('date_format(order_datetime, yyyy-MM)')))

In [4]:
# Creating a month since first transaction column for each merchant
merchant_monthly = merchant_monthly.withColumn('first_transaction', F.min('date_format(order_datetime, yyyy-MM)').over(Window.partitionBy('merchant_abn')))
merchant_monthly = merchant_monthly.withColumn('month_since_first_transaction', F.months_between(F.col('date_format(order_datetime, yyyy-MM)'), F.col('first_transaction')))
# Get the max month since first transaction for each merchant
max_month = merchant_monthly.groupBy('merchant_abn').agg(F.max('month_since_first_transaction').alias('max_month'))
# max_month.show()

In [5]:
merchant_monthly = merchant_monthly.join(max_month, 'merchant_abn')
# Filter out the merchants with max month since first transaction less than 12
merchant_monthly = merchant_monthly.filter(F.col('month_since_first_transaction') >= 12)
# Filter out the last 12 months of transactions for each merchant
merchant_monthly = merchant_monthly.filter(F.col('month_since_first_transaction') >= F.col('max_month') - 12)
# merchant_monthly.show()

24/10/05 04:49:20 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [6]:
# Sum the monthly revenue for each merchant
merchant_yearly = merchant_monthly.groupBy('merchant_abn').agg(F.sum('sum(dollar_value)').alias('yearly_revenue'))
# merchant_yearly.show()

In [None]:
future_revenue.show()

In [7]:
# Sum the predicted monthly revenue for each merchant
summed_future_revenue = future_revenue.groupBy('merchant_abn').agg(F.sum('prediction').alias('predicted_yearly_revenue'))
# summed_future_revenue.show()

In [8]:
# Add the take rate to summed future revenue
revenue = merchant_yearly.join(summed_future_revenue, 'merchant_abn')
# revenue.show()

In [9]:
revenue = revenue.distinct()
revenue = revenue.withColumn('growth_rate', (F.col('predicted_yearly_revenue') - F.col('yearly_revenue')) / F.col('yearly_revenue'))
# revenue.show()

### Customer Fraud Probability

In [10]:
customer_fraud = spark.read.parquet('../data/curated/predicted_consumer_fraud/*')
# customer_fraud.show()

In [None]:
transactions.join(customer_fraud, 'order_id').show()

In [11]:
# Aggregate the average consumer fraud for each merchant
merchant_agg_fraud = transactions.groupby('merchant_abn').agg(F.avg('consumer_fraud').alias('avg_consumer_fraud'))

In [None]:
merchant_agg_fraud.show()

In [12]:
merchant_fraud = spark.read.parquet('../data/curated/predicted_merchant_fraud/*')

In [None]:
merchant_fraud.show()

In [13]:
order_abn = transactions.select('order_id', 'merchant_abn').distinct()

In [14]:
merchant_fraud = merchant_fraud.join(order_abn, on='order_id', how='left')

In [15]:
# Drop the order_id column
merchant_fraud = merchant_fraud.drop('order_id')
# merchant_fraud.show()

### Customer Retention Rate

In [16]:
customer_retention = spark.read.parquet('../data/curated/customer_retention/*')
# customer_retention.show()

## Creating the initial ranking


In [17]:
merchant_fraud = merchant_fraud.distinct()

In [None]:
# count the number of rows for each dataset before joining
print('Number of rows in revenue:', revenue.count())
print('Number of rows in merchant fraud:', merchant_fraud.count())
print('Number of rows in customer retention:', customer_retention.count())
print('Number of rows in customer fraud:', merchant_agg_fraud.count())

We can see that the number of records is same for all the datasets (4422) except for revenue dataset and merchant fraud. This is due to the fact that merchants with less than 12 months of data are not included in the revenue dataset. We will check the number of merchants in merchant fraud dataset.

In [None]:
# Counting the number of distinct merchants in merchant_fraud
print('Number of distinct merchants in merchant fraud:', merchant_fraud.select('merchant_abn').distinct().count())

In [None]:
# Showing the non distinct merchants in merchant_fraud
merchant_fraud.groupBy('merchant_abn').count().filter(F.col('count') > 1).show()

In [18]:
# Checking the values of merchant fraud for these non distinct merchants, we take the decision to take the average of the values
merchant_fraud = merchant_fraud.groupBy('merchant_abn').agg(F.avg('merchant_fraud').alias('merchant_fraud_prediction'))
# merchant_fraud.count()

Weights will be 0.25 for each of the four factors.

In [19]:
ranking = revenue.join(merchant_agg_fraud, on ='merchant_abn', how = 'inner')
ranking = ranking.join(merchant_fraud, on = 'merchant_abn', how = 'inner')
ranking = ranking.join(customer_retention, on = 'merchant_abn', how = 'inner')
# ranking.count()

In [20]:
# Scaling growth rate score to be between 0 and 1
min_growth_rate = ranking.agg(F.min('growth_rate')).collect()[0][0]
max_growth_rate = ranking.agg(F.max('growth_rate')).collect()[0][0]
ranking = ranking.withColumn('growth_rate_score', (F.col('growth_rate') - min_growth_rate) / (max_growth_rate - min_growth_rate))

                                                                                

In [None]:
ranking.show()

In [None]:
# Printing max and min values of growth rate score, average consumer fraud, merchant fraud and returning customer proportion to check if they are between 0 and 1
print('Max growth rate score:', ranking.agg(F.max('growth_rate_score')).collect()[0][0])
print('Min growth rate score:', ranking.agg(F.min('growth_rate_score')).collect()[0][0])
print('Max avg consumer fraud:', ranking.agg(F.max('avg_consumer_fraud')).collect()[0][0])
print('Min avg consumer fraud:', ranking.agg(F.min('avg_consumer_fraud')).collect()[0][0])
print('Max merchant fraud:', ranking.agg(F.max('merchant_fraud_prediction')).collect()[0][0])
print('Min merchant fraud:', ranking.agg(F.min('merchant_fraud_prediction')).collect()[0][0])
print('Max returning customer proportion:', ranking.agg(F.max('returning_customer_proportion')).collect()[0][0])
print('Min returning customer proportion:', ranking.agg(F.min('returning_customer_proportion')).collect()[0][0])

The consumer fraud probability and merchant fraud probability is to be divided by 100 to get the values between 0 and 1.

In [21]:
ranking = ranking.withColumn('final_score', (F.col('growth_rate_score') + (1-(F.col('avg_consumer_fraud')/100)) + (1-(F.col('merchant_fraud_prediction')/100)) + F.col('returning_customer_proportion'))/4)
# ranking.show()

In [22]:
ranking = ranking.select('merchant_abn', 'final_score')


In [23]:
ranking=ranking.distinct()

In [24]:
ranking.sort(F.col('final_score').desc()).show()

                                                                                

+------------+------------------+
|merchant_abn|       final_score|
+------------+------------------+
| 46804135891|0.6777120982876664|
| 63290521567|0.6749088816613451|
| 64203420245|0.6735086366160532|
| 49891706470|0.6734876409303723|
| 68216911708| 0.673419306638833|
| 45629217853|0.6733255685982173|
| 89726005175|0.6732569639090749|
| 80324045558|0.6729121388016536|
| 95824231566|0.6706274916780242|
| 21439773999|0.6700187523090408|
| 13467303030|0.6649252706700772|
| 72472909171|0.6622214852629547|
| 94493496784|0.6573385083234794|
| 79417999332|0.6566989590735425|
| 60956456424| 0.651398243090242|
| 32361057556|0.6484768972386095|
| 91923722701|0.6408525833479833|
| 64403598239|0.6365628297962749|
| 49505931725|0.6343411880850518|
| 48534649627|0.6314486821874987|
+------------+------------------+
only showing top 20 rows



In [25]:
ranking.write.mode('overwrite').parquet('../data/curated/merchant_ranking/')

                                                                                