## This notebook is for the Ranking of the merchants

### 0 - Import modules and begin Spark

In [1]:
from pyspark.sql import SparkSession, functions as F

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble
24/10/07 21:39:34 WARN Utils: Your hostname, LAPTOP-1A92TAQQ resolves to a loopback address: 127.0.1.1; using 172.18.166.45 instead (on interface eth0)
24/10/07 21:39:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/07 21:39:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/07 21:39:38 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## 1 - Read in data

In [3]:
# Grouped by merchant transaction data
revenue_sdf = (spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ",")   
    .csv("../data/curated/revenue.csv"))

# Top 200 merchants with the highest number of unique customers
customer_sdf = spark.read.parquet("../data/top_200_customer/*")

# All transactions data with the tags cleaned
trans_final_sdf = spark.read.parquet('../data/merged/merged_transactions_with_tags.parquet')

# Forcasted revenue data
revenue_modelled_sdf = spark.read.parquet('../data/revenue_modelled/forecast/*')

                                                                                

In [4]:
# Forcasted number of customer data
customer_modelled_sdf = spark.read.parquet('../data/customer/*')

                                                                                

In [5]:
revenue_modelled_sdf.show(5, truncate=False)

customer_modelled_sdf.show(5, truncate=False)

trans_final_sdf.show(5, truncate=False)

24/10/07 21:41:32 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-------

                                                                                

### 2 - Prepare to rank the merchants

In [8]:
# Get the latest dates from the data
latest_date_cust = customer_modelled_sdf.select(F.max("order_datetime")).collect()[0][0]
latest_date_trans = trans_final_sdf.select(F.max("order_datetime")).collect()[0][0]
print([latest_date_cust, latest_date_trans])

                                                                                

[datetime.date(2022, 10, 26), datetime.date(2022, 10, 26)]


In [9]:
# Count the number of merchants of the latest date for the forcasted number of customer data
merchant_abn_list = customer_modelled_sdf.filter(customer_modelled_sdf["order_datetime"]\
                                                  == latest_date_cust).select("merchant_abn").distinct()
merchant_abn_list.count()

                                                                                

2204

#### 2.1 - Get the latest customer information for each merchant

In [15]:
# Get number of total cumulative customers, and number of new unique customers for each merchant
from pyspark.sql import Window

window_spec = Window.partitionBy("merchant_abn").orderBy(F.col("order_datetime").desc())

merchant_df_with_row = customer_modelled_sdf.withColumn("row_num", F.row_number().over(window_spec))

latest_df = merchant_df_with_row.filter(F.col("row_num") == 1).drop("row_num")

latest_df.show()
latest_df.count()  # Check that information is for all available merchants (4026 total merchants)

                                                                                

+------------+--------------+--------------------------+--------------------------+
|merchant_abn|order_datetime|total_cumulative_customers|total_new_unique_customers|
+------------+--------------+--------------------------+--------------------------+
| 10023283211|    2022-10-26|                      3032|                        10|
| 10142254217|    2022-10-26|                      2849|                         3|
| 10165489824|    2022-09-18|                         5|                         1|
| 10187291046|    2022-10-25|                       335|                         1|
| 10192359162|    2022-10-26|                       383|                         1|
| 10206519221|    2022-10-26|                      7900|                        12|
| 10255988167|    2022-10-26|                       816|                         2|
| 10264435225|    2022-10-26|                      4535|                         5|
| 10279061213|    2022-10-26|                       549|                    

                                                                                

4026

### 2.2 - Extract and calculate heuristics for the ranking

In [10]:
from pyspark.sql.window import Window

# 1) Get Take Rate and Industry Segment from trans_final_sdf
trans_data = trans_final_sdf \
    .select("merchant_abn", "merchant_name", "take_rate", "industry_segment") \
    .join(merchant_abn_list, "merchant_abn", "inner") \
    .dropDuplicates(["merchant_abn"])

# 2) Calculate Fraud Rate from revenue_sdf
revenue_data = revenue_sdf \
    .withColumn("fraud_rate", F.col("fraud_count") / F.col("num_trans")) \
    .select("merchant_abn", "fraud_rate") \
    .join(merchant_abn_list, "merchant_abn", "inner") \
    .dropDuplicates(["merchant_abn"])

# 3) Cumulative Revenue and Transactions over the Last 6 Months from revenue_sdf
current_date = latest_date_cust
final_date = '2023-05-01'

# Calculate the revenue earned between current_date and final_date for each merchant
revenue_earned_sdf = revenue_modelled_sdf.withColumn(
        'revenue_earned',
        F.col(str(final_date)) - F.col(str(current_date))
    ).select('merchant_abn', 'revenue_earned') \
    .join(merchant_abn_list, "merchant_abn", "inner") 


# 4) Total Customers at Latest Date in customer_sdf
latest_customers = customer_modelled_sdf \
    .filter(customer_modelled_sdf["order_datetime"] == latest_date_cust) \
    .select("merchant_abn", "total_cumulative_customers")

# 6) Calculate Industry Segment Popularity from trans_final_sdf

total_merchants = trans_final_sdf.select("merchant_abn").distinct().count()

industry_segment_popularity = trans_final_sdf \
    .groupBy("industry_segment") \
    .agg(F.countDistinct("merchant_abn").alias("num_merchants")) \
    .withColumn("segment_popularity", F.col("num_merchants") / total_merchants) \
    .select("industry_segment", "segment_popularity")

print(merchant_abn_list.select("merchant_abn").count())

print(trans_data.select("merchant_abn").count())

print("Revenue Data:")
revenue_data.show(5, truncate=False)
print(revenue_data.select("merchant_abn").count())

print("Revenue Earned:")
revenue_earned_sdf.show(5, truncate=False)
print(revenue_earned_sdf.select("merchant_abn").count())

print("Revenue Growth:")
revenue_growth_sdf.show(5, truncate=False)
print(revenue_growth_sdf.select("merchant_abn").count())

print("Latest Customers:")
latest_customers.show(5, truncate=False)
print(latest_customers.select("merchant_abn").count())


print("Industry Segment Popularity:")
industry_segment_popularity.show(5, truncate=False)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `fraud_count` cannot be resolved. Did you mean one of the following? [`�   3pJ�L`, `PAR1�*�%����<�  �`, `��5_   �VkǮ�s���}�Ψ�.#�B�k�F������ s���Y�`].;
'Project [PAR1�*�%����<�  �#17, ��5_   �VkǮ�s���}�Ψ�.#�B�k�F������ s���Y�#18, �   3pJ�L#19, ('fraud_count / 'num_trans) AS fraud_rate#11484]
+- Relation [PAR1�*�%����<�  �#17,��5_   �VkǮ�s���}�Ψ�.#�B�k�F������ s���Y�#18,�   3pJ�L#19] csv


In [35]:
six_months_ago = F.add_months(F.lit(latest_date_trans), -6)

df = spark.createDataFrame([(1,)], ['dummy'])

result = df.select(six_months_ago.alias("six_months_ago")).collect()

six_months_ago_value = result[0]["six_months_ago"]

revenue_growth_sdf = revenue_modelled_sdf.withColumn(
        'revenue_growth',
        (F.col(str(final_date)) - F.col(str(latest_date_trans)))/(F.col(str(latest_date_trans)) - F.col(str(six_months_ago_value))) - 1) \
    .select('merchant_abn', 'revenue_growth') \
    .join(merchant_abn_list, "merchant_abn", "inner")

print("Revenue Growth:")
revenue_growth_sdf.show(5, truncate=False)
print(revenue_growth_sdf.select("merchant_abn").count())

Revenue Growth:


                                                                                

+------------+-------------------+
|merchant_abn|revenue_growth     |
+------------+-------------------+
|90348927972 |NULL               |
|65668855732 |-0.6327736410827338|
|48723861274 |-0.6536755093930808|
|77343745836 |-0.6624027182079653|
|71649111610 |-0.6592345134620997|
+------------+-------------------+
only showing top 5 rows





263


                                                                                

In [36]:
# 7) Combine all into final DataFrame
final_df = trans_data \
    .join(revenue_data, "merchant_abn", "inner") \
    .join(revenue_earned_sdf, "merchant_abn", "inner") \
    .join(latest_customers, "merchant_abn", "inner") \
    .join(industry_segment_popularity, "industry_segment", "inner") \
    .join(revenue_growth_sdf, "merchant_abn", "inner") \

print(final_df.select("merchant_abn").count())



263


                                                                                

### 2.3 - Normalizing the heuristics

In [37]:
# Define a list of columns to normalize (excluding 'merchant_abn')
columns_to_normalize = final_df.columns
columns_string = ["merchant_abn", "industry_segment", "merchant_name"]
columns_to_normalize = list(set(columns_to_normalize) - set(columns_string))

# Calculate min and max for each column
min_max_values = {col: final_df.agg(F.min(col).alias("min"), F.max(col).alias("max")).first() for col in columns_to_normalize}

# Create a new DataFrame with normalized values
df_normalized = final_df

for col in columns_to_normalize:
    min_val = min_max_values[col]["min"]
    max_val = min_max_values[col]["max"]
    
    # Normalize the column
    df_normalized = df_normalized.withColumn(
        f"{col}_normalized",
        (F.col(col) - min_val) / (max_val - min_val)
    )

    print(f"{col} has been normalized")



revenue_earned has been normalized
segment_popularity has been normalized
take_rate has been normalized
total_cumulative_customers has been normalized
fraud_rate has been normalized
revenue_growth has been normalized


                                                                                

### 3 - Rank the merchants

In [39]:
import numpy as np

# Apply weights
weights = {
    'take_rate': 0.34,
    'fraud_rate': 0.27,
    "revenue_earned": 0.21,
    "revenue_growth": 0.05,
    "total_cumulative_customers": 0.09,
    "segment_popularity": 0.04,
}

df_weighted = df_normalized.withColumn(
    'weighted_fraud_rate', F.col('take_rate_normalized') * weights['fraud_rate']
).withColumn(
    'weighted_take_rate', F.col('fraud_rate_normalized') * weights['take_rate']
).withColumn(
    'weighted_revenue', F.col('revenue_earned_normalized') * weights['revenue_earned']
).withColumn(
    'weighted_rg', F.col('revenue_growth_normalized') * weights['revenue_growth']
).withColumn(
    'weighted_customers', F.col('total_cumulative_customers_normalized') *\
          weights['total_cumulative_customers']
).withColumn(
    'weighted_segment', F.col('segment_popularity_normalized') * weights['segment_popularity']
)

# Compute Ideal and Negative Ideal Solutions
ideal_solution = {
    'fraud_rate': df_weighted.agg(F.min('weighted_fraud_rate')).collect()[0][0],
    'take_rate': df_weighted.agg(F.max('weighted_take_rate')).collect()[0][0],
    'revenue': df_weighted.agg(F.max('weighted_revenue')).collect()[0][0],
    'growth': df_weighted.agg(F.max('weighted_rg')).collect()[0][0],
    'customers': df_weighted.agg(F.max('weighted_customers')).collect()[0][0],
    'segment': df_weighted.agg(F.min('weighted_segment')).collect()[0][0]
}

negative_ideal_solution = {
    'fraud_rate': df_weighted.agg(F.max('weighted_fraud_rate')).collect()[0][0],
    'take_rate': df_weighted.agg(F.min('weighted_take_rate')).collect()[0][0],
    'revenue': df_weighted.agg(F.min('weighted_revenue')).collect()[0][0],
    'growth': df_weighted.agg(F.min('weighted_rg')).collect()[0][0],
    'customers': df_weighted.agg(F.min('weighted_customers')).collect()[0][0],
    'segment': df_weighted.agg(F.max('weighted_segment')).collect()[0][0]
}

# Compute distances from Ideal and Negative Ideal Solutions
df_topsis = df_weighted.withColumn(
    'D_positive', F.sqrt(
        (F.col('weighted_fraud_rate') - ideal_solution['fraud_rate']) ** 2 +
        (F.col('weighted_take_rate') - ideal_solution['take_rate']) ** 2 +
        (F.col('weighted_revenue') - ideal_solution['revenue']) ** 2 +
        (F.col('weighted_rg') - ideal_solution['growth']) ** 2 +
        (F.col('weighted_customers') - ideal_solution['customers']) ** 2 +
        (F.col('weighted_segment') - ideal_solution['segment']) ** 2
    )
).withColumn(
    'D_negative', F.sqrt(
        (F.col('weighted_fraud_rate') - negative_ideal_solution['fraud_rate']) ** 2 +
        (F.col('weighted_take_rate') - negative_ideal_solution['take_rate']) ** 2 +
        (F.col('weighted_revenue') - negative_ideal_solution['revenue']) ** 2 + 
        (F.col('weighted_rg') - negative_ideal_solution['growth']) ** 2 +
        (F.col('weighted_customers') - negative_ideal_solution['customers']) ** 2 +
        (F.col('weighted_segment') - negative_ideal_solution['segment']) ** 2

    )
)

# Compute closeness and rank merchants
df_topsis = df_topsis.withColumn(
    'closeness', F.col('D_negative') / (F.col('D_positive') + F.col('D_negative'))
).orderBy(F.desc('closeness'))

df_topsis.show(5, truncate = False)

                                                                                

+------------+---------------------------+-------------------+---------+------------------+------------------+--------------------------+-------------------+-------------------+-------------------------+-----------------------------+--------------------+-------------------------------------+---------------------+-------------------------+--------------------+------------------+-------------------+--------------------+---------------------+---------------------+-------------------+-------------------+-------------------+
|merchant_abn|industry_segment           |merchant_name      |take_rate|fraud_rate        |revenue_earned    |total_cumulative_customers|segment_popularity |revenue_growth     |revenue_earned_normalized|segment_popularity_normalized|take_rate_normalized|total_cumulative_customers_normalized|fraud_rate_normalized|revenue_growth_normalized|weighted_fraud_rate |weighted_take_rate|weighted_revenue   |weighted_rg         |weighted_customers   |weighted_segment     |D_positiv

##### 3.1 - Find top 5 merchants for each industry segment

In [40]:
window_spec = Window.partitionBy("industry_segment").orderBy(F.desc("closeness"))  # Replace 'closeness' with your ranking metric

ranked_df = df_topsis \
    .withColumn("rank", F.row_number().over(window_spec))

top_merchants = ranked_df.filter(F.col("rank") <= 5)

top_merchants.select("merchant_abn", "industry_segment", "closeness", "rank").show(truncate=False)


                                                                                

+------------+---------------------------+-------------------+----+
|merchant_abn|industry_segment           |closeness          |rank|
+------------+---------------------------+-------------------+----+
|86145109204 |health + beauty            |0.45121464877292555|1   |
|67797658023 |health + beauty            |0.42105292513016324|2   |
|87566366459 |health + beauty            |0.4161619181556025 |3   |
|74019238521 |health + beauty            |0.3983365904607799 |4   |
|77343745836 |health + beauty            |0.3938940482065893 |5   |
|43083074133 |luxury products + services |0.7761815493767446 |1   |
|98202651678 |luxury products + services |0.45572699132198985|2   |
|27851049264 |luxury products + services |0.45182055696578916|3   |
|72870379863 |luxury products + services |0.4474521321613469 |4   |
|59589609053 |luxury products + services |0.43368223679866297|5   |
|54702673300 |rental services            |0.42485612113624593|1   |
|96834893748 |rental services            |0.4203

### 4 - Save the data

In [41]:
df_topsis.write.mode('overwrite').parquet('../data/curated/final_rankings.parquet')

                                                                                