# Manual segment ranking

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd

In [4]:
# Build (or reuse) the Spark session used by the rest of this notebook.
# NOTE(review): if a session already exists, Spark warns that only runtime SQL
# configurations take effect, so options such as spark.driver.memory are
# presumably ignored unless the kernel is restarted first -- confirm.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.driver.memory": "9g",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.network.timeout": "600s",
}

session_builder = SparkSession.builder.appName("Merchant Segment Ranking")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

24/10/05 04:53:16 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
# Input locations (relative paths).
MERCHANT_PARQUET = "../data/curated/part_1/clean_merchant.parquet"
MERCHANT_FRAUD_CSV = "../data/tables/part_1/merchant_fraud_probability.csv"

# Merchant information, read through Spark.
merchant = spark.read.parquet(MERCHANT_PARQUET)

# Merchant fraud probabilities, read straight into pandas.
merchant_fraud_prob = pd.read_csv(MERCHANT_FRAUD_CSV)

In [6]:
# Convert the Spark merchant table to pandas; the goods-cleaning steps below use pandas operations.
merchant_df = merchant.toPandas()

In [7]:
# How often each raw goods description occurs.
merchant_df["goods"].value_counts()

goods
digital goods: books, movies, music                                                190
artist supply and craft shops                                                      187
computer programming , data processing, and integrated systems design services     182
shoe shops                                                                         182
gift, card, novelty, and souvenir shops                                            178
                                                                                  ... 
digital goods: books,  movies, music                                                 1
computer programming , data processing,  and integrated systems design services      1
motor vehicle  supplies and new parts                                                1
computers, computer peripheral equipment, and  software                              1
lawn and garden supply outlets,  including nurseries                                 1
Name: count, Length: 126, dtype: int64

The counts above show near-duplicate categories that differ only in spacing, so let's redo the preprocessing of the `goods` column.

In [8]:
import re

In [9]:
def clean_text(text):
    """Return `text` lower-cased with punctuation removed.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted. Whitespace is left untouched, so
    removing a symbol that sat between two spaces leaves both spaces behind.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)

# Words removed from every goods description before descriptions are compared.
stop_words = {'is', 'a', 'the', 'for', 'and', 'to', 'of', 'this', 'except'}

# Normalised text -> whitespace-separated tokens -> tokens without stop words.
merchant_df['str_goods'] = merchant_df['goods'].apply(clean_text)
merchant_df['tokens'] = merchant_df['str_goods'].apply(str.split)
merchant_df['clean_goods'] = merchant_df['tokens'].apply(
    lambda words: [token for token in words if token not in stop_words]
)

# Single-space-joined form of the cleaned tokens (a plain string per merchant).
merchant_df['clean_goods_str'] = merchant_df['clean_goods'].apply(' '.join)

In [10]:
# How often each cleaned token list occurs after preprocessing.
merchant_df["clean_goods"].value_counts()

clean_goods
[digital, goods, books, movies, music]                                              195
[artist, supply, craft, shops]                                                      193
[computer, programming, data, processing, integrated, systems, design, services]    191
[shoe, shops]                                                                       185
[furniture, home, furnishings, equipment, shops, manufacturers, appliances]         182
[gift, card, novelty, souvenir, shops]                                              182
[computers, computer, peripheral, equipment, software]                              181
[florists, supplies, nursery, stock, flowers]                                       180
[tent, awning, shops]                                                               178
[cable, satellite, other, pay, television, radio, services]                         175
[watch, clock, jewelry, repair, shops]                                              170
[bicycle, shops, sal

- This is how we manually categorize all merchants' goods into 5 segments.

In [11]:
# Manually assigned segment labels, one per distinct cleaned goods description.
# The labels are positional: entry i is paired with the i-th description in
# value_counts() order when the mapping dictionary is built.
# Trailing comments name the description shown at that position in the mapping
# output displayed below; positions without a comment are cut off in that output.
segments = [
    "Entertainment & Media",   # 0: digital goods books movies music
    "Entertainment & Media",   # 1: artist supply craft shops
    "Technology",              # 2: computer programming data processing ...
    "Beauty",                  # 3: shoe shops
    "Office & Home Supplies",  # 4: furniture home furnishings equipment ...
    "Miscellaneous",           # 5: gift card novelty souvenir shops
    "Technology",              # 6: computers computer peripheral equipment software
    "Entertainment & Media",   # 7: florists supplies nursery stock flowers
    "Office & Home Supplies",  # 8: tent awning shops
    "Technology",              # 9: cable satellite other pay television radio services
    "Beauty",                  # 10: watch clock jewelry repair shops
    "Miscellaneous",           # 11: bicycle shops sales service
    "Entertainment & Media",   # 12: music shops musical instruments pianos sheet music
    "Entertainment & Media",   # 13: books periodicals newspapers
    "Beauty",                  # 14: health beauty spas
    "Office & Home Supplies",  # 15: stationery office supplies printing writing paper
    "Office & Home Supplies",  # 16
    "Miscellaneous",           # 17
    "Miscellaneous",           # 18
    "Entertainment & Media",   # 19
    "Office & Home Supplies",  # 20
    "Office & Home Supplies",  # 21
    "Technology",              # 22
    "Entertainment & Media",   # 23
    "Beauty",                  # 24
]


In [12]:
# Pair every distinct cleaned goods description with its manually chosen segment.
# The pairing is positional: the i-th description in value_counts() order gets
# segments[i], so `segments` must hold exactly one label per distinct
# description. The frequency-ordered index is computed once (not once per key),
# and the lengths are checked up front so that a stale or rebound `segments`
# list fails loudly instead of producing a partial or misaligned mapping.
goods_by_frequency = merchant_df.clean_goods_str.value_counts().index.tolist()
if len(goods_by_frequency) != len(segments):
    raise ValueError(
        "Expected one segment label per distinct goods description: "
        f"{len(goods_by_frequency)} descriptions but {len(segments)} labels."
    )
segments_dict = dict(zip(goods_by_frequency, segments))

In [13]:
# Display the description -> segment mapping so the manual assignment can be checked by eye.
segments_dict

{'digital goods books movies music': 'Entertainment & Media',
 'artist supply craft shops': 'Entertainment & Media',
 'computer programming data processing integrated systems design services': 'Technology',
 'shoe shops': 'Beauty',
 'furniture home furnishings equipment shops manufacturers appliances': 'Office & Home Supplies',
 'gift card novelty souvenir shops': 'Miscellaneous',
 'computers computer peripheral equipment software': 'Technology',
 'florists supplies nursery stock flowers': 'Entertainment & Media',
 'tent awning shops': 'Office & Home Supplies',
 'cable satellite other pay television radio services': 'Technology',
 'watch clock jewelry repair shops': 'Beauty',
 'bicycle shops sales service': 'Miscellaneous',
 'music shops musical instruments pianos sheet music': 'Entertainment & Media',
 'books periodicals newspapers': 'Entertainment & Media',
 'health beauty spas': 'Beauty',
 'stationery office supplies printing writing paper': 'Office & Home Supplies',
 'lawn garden s

In [14]:
# Look up each merchant's segment from its cleaned goods description.
# Direct dictionary indexing is used, so an unmapped description raises KeyError.
merchant_df['segment'] = merchant_df['clean_goods_str'].map(segments_dict.__getitem__)

In [15]:
# Number of merchants that landed in each segment.
merchant_df["segment"].value_counts()

segment
Entertainment & Media     1153
Office & Home Supplies     937
Technology                 672
Miscellaneous              654
Beauty                     610
Name: count, dtype: int64

In [16]:
# Persist the merchant -> segment assignment (with the raw goods text) for the ranking step.
segment_columns = ['merchant_abn', 'segment', 'goods']
merchant_df[segment_columns].to_parquet("../data/curated/merchant_segment.parquet")

## Ranking with respect to each segment

In [17]:
# Load the existing merchant ranking and the segment assignment written above.
initial_ranking, merchant_segment = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        "../data/curated/merchant_ranking",
        "../data/curated/merchant_segment.parquet",
    )
)

In [18]:
# Attach segment information to every ranked merchant. This is a left join on
# merchant_abn, so ranked merchants without a segment row are kept.
merchant_segment_ranking = initial_ranking.join(
    merchant_segment,
    on='merchant_abn',
    how='left',
)

In [19]:
# Segment names, in the order their top-10 tables are printed.
# NOTE(review): this rebinds `segments`, which earlier in the notebook holds the
# per-description labels used to build `segments_dict`; re-running that mapping
# cell after this one would pair descriptions with the wrong list.
segments = [
    "Entertainment & Media", "Office & Home Supplies", "Miscellaneous",
    "Beauty", "Technology",
]

# Show the 10 highest final_score merchants within each segment.
for segment in segments:
    print(segment)
    segment_ranking = merchant_segment_ranking.where(F.col('segment') == segment)
    segment_ranking.sort(F.desc('final_score')).show(10)

Entertainment & Media
+------------+------------------+--------------------+--------------------+
|merchant_abn|       final_score|             segment|               goods|
+------------+------------------+--------------------+--------------------+
| 63290521567|0.6749088816613451|Entertainment & M...|artist supply and...|
| 95824231566|0.6706274916780242|Entertainment & M...|art dealers and g...|
| 72472909171|0.6622214852629547|Entertainment & M...|digital goods: bo...|
| 64403598239|0.6365628297962749|Entertainment & M...|music shops - mus...|
| 49505931725|0.6343411880850518|Entertainment & M...|digital goods: bo...|
| 47086412084|0.6124809716947015|Entertainment & M...|digital goods: bo...|
| 98973094975|0.6085352007931438|Entertainment & M...|hobby, toy and ga...|
| 24852446429| 0.588872596759068|Entertainment & M...|florists supplies...|
| 43186523025|0.5884372854313616|Entertainment & M...|florists supplies...|
| 75034515922| 0.578029584534356|Entertainment & M...|digital good