# Manual segment ranking

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd
import requests

In [None]:
# Create a spark session
# All session options live in one mapping so they can be reviewed at a glance;
# they are applied to the builder in the order listed.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.driver.memory": "9g",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.network.timeout": "600s",
}

session_builder = SparkSession.builder.appName("Data Joining")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()

In [3]:
# Information on merchants (read as a Spark DataFrame)
merchant_path = "../data/curated/part_1/clean_merchant.parquet"
merchant = spark.read.parquet(merchant_path)

# Information on merchant's fraud probability (read as a pandas DataFrame)
merchant_fraud_prob_path = "../data/tables/part_1/merchant_fraud_probability.csv"
merchant_fraud_prob = pd.read_csv(merchant_fraud_prob_path)

In [4]:
# Bring the merchant table into pandas: the text cleaning and manual segment
# mapping below are done with pandas `apply`/`map` rather than Spark.
merchant_df = merchant.toPandas()

In [None]:
# Frequency of each raw goods description, most common first.
merchant_df["goods"].value_counts()

Let's preprocess the `goods` column again: lower-case the text, strip punctuation, tokenise it, and drop stop words.

In [6]:
import re

In [7]:
def clean_text(text: str) -> str:
    """Normalise a goods description for matching.

    The text is lower-cased and every character that is neither a word
    character (letter, digit, underscore) nor whitespace is removed.
    Whitespace itself is left untouched.

    Args:
        text: Raw description string.

    Returns:
        The lower-cased string with punctuation stripped.
    """
    return re.sub(r'[^\w\s]', '', text.lower())

# Filler words that are dropped from the tokenised descriptions.
stop_words = {'is', 'a', 'the', 'for', 'and', 'to', 'of', 'this', 'except'}


def _without_stop_words(tokens):
    """Return the tokens that are not in `stop_words`, keeping their order."""
    return [token for token in tokens if token not in stop_words]


# Lower-cased, punctuation-free copy of the raw description.
merchant_df['str_goods'] = merchant_df['goods'].apply(clean_text)
# Whitespace-separated tokens of the cleaned description.
merchant_df['tokens'] = merchant_df['str_goods'].apply(str.split)
# Token lists with the stop words filtered out.
merchant_df['clean_goods'] = merchant_df['tokens'].apply(_without_stop_words)
# Filtered tokens joined back into a single space-separated string.
merchant_df['clean_goods_str'] = merchant_df['clean_goods'].apply(' '.join)

In [None]:
# Count the distinct cleaned descriptions using the string column.
# `clean_goods` holds Python lists, which are unhashable, so `value_counts()`
# cannot tally them; `clean_goods_str` carries the same information as a string.
# This frequency ordering is also the one the manual segment labels are matched to.
merchant_df['clean_goods_str'].value_counts()

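- This is how we manually categorize all merchants' goods into 5 segments. The labels below are positional: the i-th label is assigned to the i-th entry of `clean_goods_str.value_counts()` (most frequent description first).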

In [19]:
# Hand-assigned segment labels, one per distinct cleaned goods description.
# The position in this list (shown in the trailing comment) is what ties a
# label to a description, so the order must not be changed.
segments = [
    "Entertainment & Media",   # 0
    "Entertainment & Media",   # 1
    "Technology",              # 2
    "Beauty",                  # 3
    "Office & Home Supplies",  # 4
    "Miscellaneous",           # 5
    "Technology",              # 6
    "Entertainment & Media",   # 7
    "Office & Home Supplies",  # 8
    "Technology",              # 9
    "Beauty",                  # 10
    "Miscellaneous",           # 11
    "Entertainment & Media",   # 12
    "Entertainment & Media",   # 13
    "Beauty",                  # 14
    "Office & Home Supplies",  # 15
    "Office & Home Supplies",  # 16
    "Miscellaneous",           # 17
    "Miscellaneous",           # 18
    "Entertainment & Media",   # 19
    "Office & Home Supplies",  # 20
    "Office & Home Supplies",  # 21
    "Technology",              # 22
    "Entertainment & Media",   # 23
    "Beauty",                  # 24
]


In [20]:
# Pair each distinct cleaned goods description with its manually assigned segment.
# The labels in `segments` are positional (label i belongs to the i-th most
# frequent description), so the frequency-ordered index is computed once here
# instead of being rebuilt for every entry.
goods_by_frequency = merchant_df['clean_goods_str'].value_counts().index.tolist()

# Fail loudly if the hand-written labels no longer line up with the data
# (e.g. the input changed, or `segments` was rebound by another cell); a
# mismatch would otherwise produce a partial or shifted mapping.
if len(goods_by_frequency) != len(segments):
    raise ValueError(
        "Expected one segment label per distinct goods description, got "
        f"{len(segments)} labels for {len(goods_by_frequency)} descriptions."
    )

# NOTE(review): the pairing relies on the value_counts() ordering of this exact
# dataset — inspect `segments_dict` after any data change.
segments_dict = dict(zip(goods_by_frequency, segments))

In [None]:
# Display the mapping so each goods description can be checked against its segment.
segments_dict

In [22]:
# Look up every merchant's segment from its cleaned goods description.
# Direct dictionary indexing is used on purpose: a description missing from
# `segments_dict` raises KeyError instead of silently becoming NaN.
merchant_df['segment'] = merchant_df['clean_goods_str'].map(segments_dict.__getitem__)

In [None]:
# Number of merchants that ended up in each segment.
merchant_df['segment'].value_counts()

In [31]:
# Persist the merchant-to-segment lookup: ABN, assigned segment and raw goods text.
segment_columns = ['merchant_abn', 'segment', 'goods']
merchant_df[segment_columns].to_parquet("../data/curated/merchant_segment.parquet")

## Ranking with respect to each segment

In [32]:
# Previously computed merchant ranking.
initial_ranking_path = "../data/curated/merchant_ranking"
initial_ranking = spark.read.parquet(initial_ranking_path)

# Segment lookup written earlier in this notebook, re-read as a Spark DataFrame.
merchant_segment_path = "../data/curated/merchant_segment.parquet"
merchant_segment = spark.read.parquet(merchant_segment_path)

In [33]:
# Attach the segment to every merchant in the initial ranking.
# A left join keeps all ranked merchants, including any without a segment row.
merchant_segment_ranking = initial_ranking.join(
    merchant_segment,
    on='merchant_abn',
    how='left',
)

In [None]:
# NOTE(review): this rebinds `segments` from the per-description label list
# defined earlier to the five distinct segment names; re-running the mapping
# cell after this one would therefore see this shorter list.
segments = [
    "Entertainment & Media",
    "Office & Home Supplies",
    "Miscellaneous",
    "Beauty",
    "Technology"
]

# Print the ten highest-scoring merchants of each segment, one table per segment.
for segment in segments:
    print(segment)
    in_segment = F.col('segment') == segment
    segment_ranking = merchant_segment_ranking.filter(in_segment)
    segment_ranking.orderBy('final_score', ascending=False).show(10)