In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd
import requests

In [2]:
# Create (or reuse, via getOrCreate) the Spark session that backs every
# `spark.read` call and DataFrame join in this notebook.
spark = (
    SparkSession.builder.appName("Data Joining")
    # Evaluate and render a DataFrame when it is the last expression of a cell.
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    # NOTE(review): driver memory presumably only takes effect when this call
    # starts a new JVM, not when an existing session is reused; confirm.
    .config("spark.driver.memory", "9g") 
    # Pin the session time zone to UTC.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.network.timeout", "600s")
    .getOrCreate()
)

24/09/29 21:15:32 WARN Utils: Your hostname, LAPTOP-406UJ3L3 resolves to a loopback address: 127.0.1.1; using 172.21.136.1 instead (on interface eth0)
24/09/29 21:15:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/29 21:15:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/29 21:15:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/09/29 21:15:34 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/09/29 21:15:34 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [3]:
# Information on merchants (curated parquet, loaded as a Spark DataFrame)
merchant = spark.read.parquet("../data/curated/part_1/clean_merchant.parquet")

# Information on merchant's fraud probability (loaded with pandas, not Spark)
# NOTE(review): `merchant_fraud_prob` is not referenced in the cells visible here; confirm it is still needed.
merchant_fraud_prob = pd.read_csv("../data/tables/part_1/merchant_fraud_probability.csv")

                                                                                

In [4]:
# Collect the Spark DataFrame into a pandas DataFrame; the text cleaning and
# segment mapping below are done with pandas.
merchant_df = merchant.toPandas()

In [5]:
# How often each raw `goods` description occurs.
merchant_df['goods'].value_counts()

goods
digital goods: books, movies, music                                                190
artist supply and craft shops                                                      187
computer programming , data processing, and integrated systems design services     182
shoe shops                                                                         182
gift, card, novelty, and souvenir shops                                            178
                                                                                  ... 
digital goods: books,  movies, music                                                 1
computer programming , data processing,  and integrated systems design services      1
motor vehicle  supplies and new parts                                                1
computers, computer peripheral equipment, and  software                              1
lawn and garden supply outlets,  including nurseries                                 1
Name: count, Length: 126, dtype: int6

The counts above show near-duplicate categories that differ only in spacing, so let's preprocess the `goods` column again.

In [6]:
import re

In [7]:
def clean_text(text):
    """Return ``text`` lower-cased with every character that is neither a
    word character nor whitespace removed.

    Whitespace is left untouched, so repeated spaces survive this step.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)

# Normalise the raw description, then split it on whitespace; splitting
# collapses runs of spaces.
merchant_df['str_goods'] = merchant_df['goods'].apply(clean_text)
merchant_df['tokens'] = merchant_df['str_goods'].apply(str.split)

# Words dropped from every token list before the categories are compared.
stop_words = {'is', 'a', 'the', 'for', 'and', 'to', 'of', 'this', 'except'}
merchant_df['clean_goods'] = merchant_df['tokens'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

# Space-joined form of the filtered tokens, used as the category key later on.
merchant_df['clean_goods_str'] = merchant_df['clean_goods'].apply(' '.join)

In [8]:
# How often each cleaned token list occurs.
merchant_df['clean_goods'].value_counts()

clean_goods
[digital, goods, books, movies, music]                                              195
[artist, supply, craft, shops]                                                      193
[computer, programming, data, processing, integrated, systems, design, services]    191
[shoe, shops]                                                                       185
[furniture, home, furnishings, equipment, shops, manufacturers, appliances]         182
[gift, card, novelty, souvenir, shops]                                              182
[computers, computer, peripheral, equipment, software]                              181
[florists, supplies, nursery, stock, flowers]                                       180
[tent, awning, shops]                                                               178
[cable, satellite, other, pay, television, radio, services]                         175
[watch, clock, jewelry, repair, shops]                                              170
[bicycle, shops, sal

- This is how we manually categorize all merchants' goods into 5 segments: hobby, technology, beauty, furniture, others.

In [9]:
# Manually assigned segment labels. They are positional: the i-th label is
# paired with the i-th entry of the cleaned-category frequency listing.
segments = [
    "hobby", "hobby", "technology", "beauty", "furniture",
    "others", "technology", "hobby", "furniture", "technology",
    "beauty", "others", "hobby", "hobby", "beauty",
    "others", "furniture", "others", "others", "hobby",
    "furniture", "furniture", "technology", "hobby", "beauty",
]


In [10]:
# Build the category -> segment lookup by pairing each distinct cleaned
# category, in the order value_counts() returns them, with the label at the
# same position in `segments`.
# The category listing is computed once instead of once per label.
# NOTE(review): the pairing is purely positional, so it depends on the
# frequency ordering of the current data; re-check `segments` if the data changes.
goods_by_frequency = merchant_df['clean_goods_str'].value_counts().index.tolist()
if len(goods_by_frequency) != len(segments):
    # Fail fast with a clear message instead of an IndexError here (too many
    # labels) or a KeyError when the mapping is applied (too few labels).
    raise ValueError(
        f"{len(segments)} segment labels were given for "
        f"{len(goods_by_frequency)} distinct goods categories"
    )
res = dict(zip(goods_by_frequency, segments))

In [11]:
# Display the category -> segment mapping so the pairing can be checked by eye.
res

{'digital goods books movies music': 'hobby',
 'artist supply craft shops': 'hobby',
 'computer programming data processing integrated systems design services': 'technology',
 'shoe shops': 'beauty',
 'furniture home furnishings equipment shops manufacturers appliances': 'furniture',
 'gift card novelty souvenir shops': 'others',
 'computers computer peripheral equipment software': 'technology',
 'florists supplies nursery stock flowers': 'hobby',
 'tent awning shops': 'furniture',
 'cable satellite other pay television radio services': 'technology',
 'watch clock jewelry repair shops': 'beauty',
 'bicycle shops sales service': 'others',
 'music shops musical instruments pianos sheet music': 'hobby',
 'books periodicals newspapers': 'hobby',
 'health beauty spas': 'beauty',
 'stationery office supplies printing writing paper': 'others',
 'lawn garden supply outlets including nurseries': 'furniture',
 'motor vehicle supplies new parts': 'others',
 'opticians optical goods eyeglasses': '

In [14]:
# Look up each merchant's segment from its cleaned category string; a category
# missing from `res` raises KeyError rather than producing a missing value.
merchant_df['segment'] = merchant_df['clean_goods_str'].map(res.__getitem__)

In [16]:
# Number of merchants assigned to each segment.
merchant_df['segment'].value_counts()

segment
hobby         1153
others         815
furniture      776
technology     672
beauty         610
Name: count, dtype: int64

In [25]:
# Persist only the merchant identifier and its segment label.
merchant_df.loc[:, ['merchant_abn', 'segment']].to_parquet(
    "../data/curated/merchant_segment.parquet"
)

## Ranking with respect to each segment

In [26]:
# Merchant ranking; the outputs below show one `final_score` per `merchant_abn`.
initial_ranking = spark.read.parquet("../data/curated/merchant_ranking")
# Segment labels, re-read as a Spark DataFrame from the parquet file written earlier in this notebook.
merchant_segment = spark.read.parquet("../data/curated/merchant_segment.parquet")

In [30]:
# Attach a segment label to every ranked merchant. The left join keeps ranked
# merchants that have no row in `merchant_segment`.
merchant_segment_ranking = initial_ranking.join(
    merchant_segment,
    on='merchant_abn',
    how='left',
)

### Furniture ranking

In [42]:
# Restrict the joined ranking to the furniture segment, then show its ten highest final scores.
furniture_ranking = merchant_segment_ranking.where(F.col('segment') == 'furniture')
furniture_ranking.orderBy(F.desc('final_score')).limit(10)

merchant_abn,final_score,segment
13467303030,0.3281230182131976,furniture
64203420245,0.326409616515149,furniture
49891706470,0.3263890377098256,furniture
89726005175,0.3261686352587278,furniture
91923722701,0.2937247609301727,furniture
38090089066,0.2916727664199687,furniture
96152467973,0.2827674246801805,furniture
19933438190,0.2819137313056254,furniture
98269572896,0.2806839775247866,furniture
57900494384,0.2805736754622056,furniture


### Hobby ranking

In [43]:
# Restrict the joined ranking to the hobby segment, then show its ten highest final scores.
hobby_ranking = merchant_segment_ranking.where(F.col('segment') == 'hobby')
hobby_ranking.orderBy(F.desc('final_score')).limit(10)

merchant_abn,final_score,segment
24852446429,0.4110443467625267,hobby
43186523025,0.4106041323976168,hobby
64403598239,0.3420927049596617,hobby
95824231566,0.3293725083219758,hobby
63290521567,0.3230966721419867,hobby
72472909171,0.3066058849859542,hobby
49212265466,0.3032468438815935,hobby
21772962346,0.2984426515506033,hobby
76314317957,0.2966010302358934,hobby
47797405944,0.2843235374538184,hobby


### Technology ranking

In [44]:
# Restrict the joined ranking to the technology segment, then show its ten highest final scores.
tech_ranking = merchant_segment_ranking.where(F.col('segment') == 'technology')
tech_ranking.orderBy(F.desc('final_score')).limit(10)

merchant_abn,final_score,segment
68216911708,0.3247475833723059,technology
21439773999,0.3137262497809363,technology
62694031334,0.2693580010545016,technology
84703983173,0.2370334038443454,technology
94690988633,0.2273246461468774,technology
35909341340,0.2220438771039997,technology
67400260923,0.2206914879348324,technology
56946407125,0.2096748271795812,technology
17488304283,0.2075670234998554,technology
29521780474,0.2068175880456991,technology


### Beauty ranking

In [45]:
# Restrict the joined ranking to the beauty segment, then show its ten highest final scores.
beauty_ranking = merchant_segment_ranking.where(F.col('segment') == 'beauty')
beauty_ranking.orderBy(F.desc('final_score')).limit(10)

merchant_abn,final_score,segment
86578477987,0.376599777618594,beauty
49322182190,0.3071275062334952,beauty
71528203369,0.2858212325465878,beauty
23338656015,0.2777591471090938,beauty
80779820715,0.2701832189517648,beauty
81761494572,0.2684067763293901,beauty
29616684420,0.2631044164983196,beauty
91720867026,0.2437628800645514,beauty
34920839316,0.2319530570909032,beauty
35733444320,0.230424293449239,beauty


### Others ranking

In [46]:
# Restrict the joined ranking to the "others" segment, then show its ten highest final scores.
other_ranking = merchant_segment_ranking.where(F.col('segment') == 'others')
other_ranking.orderBy(F.desc('final_score')).limit(10)

merchant_abn,final_score,segment
45629217853,0.3262225286311107,others
80324045558,0.3257948358836089,others
46804135891,0.3220495013488866,others
94493496784,0.3103409695966315,others
79417999332,0.3096272770478116,others
60956456424,0.304325908719992,others
32361057556,0.3013897190374569,others
48534649627,0.2758585621963855,others
81219314324,0.2535747278549654,others
11566786699,0.2360601261472915,others
