# Market segmentation: manual ranking
In this notebook, we manually identify market areas using the tags associated with merchants. A top 10 merchants ranking is produced for each segment identified.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd
import requests

In [2]:
# Create a spark session
spark = (
    SparkSession.builder.appName("Data Joining")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.driver.memory", "9g") 
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.network.timeout", "600s")
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble
24/10/18 23:04:20 WARN Utils: Your hostname, DESKTOP-702MS12 resolves to a loopback address: 127.0.1.1; using 172.17.210.38 instead (on interface eth0)
24/10/18 23:04:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/18 23:04:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/18 23:04:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Information on merchants
merchant = spark.read.parquet("../data/curated/part_1/clean_merchant.parquet")

# Information on merchant's fraud probability
merchant_fraud_prob = pd.read_csv("../data/tables/part_1/merchant_fraud_probability.csv")

                                                                                

In [4]:
merchant_df = merchant.toPandas()

                                                                                

In [5]:
merchant_df.goods.value_counts()

goods
digital goods: books, movies, music                                                190
artist supply and craft shops                                                      187
computer programming , data processing, and integrated systems design services     182
shoe shops                                                                         182
gift, card, novelty, and souvenir shops                                            178
                                                                                  ... 
digital goods: books,  movies, music                                                 1
computer programming , data processing,  and integrated systems design services      1
motor vehicle  supplies and new parts                                                1
computers, computer peripheral equipment, and  software                              1
lawn and garden supply outlets,  including nurseries                                 1
Name: count, Length: 126, dtype: int6

Let's do the preprocessing to the `goods` column again.

In [6]:
import re

In [7]:
def clean_text(text):
    text = text.lower()  
    text = re.sub(r'[^\w\s]', '', text) 
    return text

merchant_df['str_goods'] = merchant_df['goods'].apply(clean_text)
merchant_df['tokens'] = merchant_df['str_goods'].apply(lambda x: x.split())
stop_words = set(['is', 'a', 'the', 'for', 'and', 'to', 'of', 'this', 'except']) 
merchant_df['clean_goods'] = merchant_df['tokens'].apply(lambda x: [word for word in x if word not in stop_words])


merchant_df['clean_goods_str'] = merchant_df['clean_goods'].apply(lambda x: ' '.join(x))

In [8]:
merchant_df.clean_goods.value_counts()

clean_goods
[digital, goods, books, movies, music]                                              195
[artist, supply, craft, shops]                                                      193
[computer, programming, data, processing, integrated, systems, design, services]    191
[shoe, shops]                                                                       185
[furniture, home, furnishings, equipment, shops, manufacturers, appliances]         182
[gift, card, novelty, souvenir, shops]                                              182
[computers, computer, peripheral, equipment, software]                              181
[florists, supplies, nursery, stock, flowers]                                       180
[tent, awning, shops]                                                               178
[cable, satellite, other, pay, television, radio, services]                         175
[watch, clock, jewelry, repair, shops]                                              170
[bicycle, shops, sal

- This is how we manully categorize all merchant's goods into 5 segments.

In [9]:
segments = [
    "Entertainment & Media", "Entertainment & Media", "Technology", "Beauty", 
    "Office & Home Supplies", "Miscellaneous", "Technology", 
    "Entertainment & Media", "Office & Home Supplies", "Technology", 
    "Beauty", "Miscellaneous", "Entertainment & Media", 
    "Entertainment & Media", "Beauty", "Office & Home Supplies", 
    "Office & Home Supplies", "Miscellaneous", "Miscellaneous", 
    "Entertainment & Media", "Office & Home Supplies", 
    "Office & Home Supplies", "Technology", "Entertainment & Media", "Beauty"
]


In [10]:
segments_dict = {list(merchant_df.clean_goods_str.value_counts().index)[i]: segments[i] for i in range(len(segments))}

In [11]:
segments_dict

{'digital goods books movies music': 'Entertainment & Media',
 'artist supply craft shops': 'Entertainment & Media',
 'computer programming data processing integrated systems design services': 'Technology',
 'shoe shops': 'Beauty',
 'furniture home furnishings equipment shops manufacturers appliances': 'Office & Home Supplies',
 'gift card novelty souvenir shops': 'Miscellaneous',
 'computers computer peripheral equipment software': 'Technology',
 'florists supplies nursery stock flowers': 'Entertainment & Media',
 'tent awning shops': 'Office & Home Supplies',
 'cable satellite other pay television radio services': 'Technology',
 'watch clock jewelry repair shops': 'Beauty',
 'bicycle shops sales service': 'Miscellaneous',
 'music shops musical instruments pianos sheet music': 'Entertainment & Media',
 'books periodicals newspapers': 'Entertainment & Media',
 'health beauty spas': 'Beauty',
 'stationery office supplies printing writing paper': 'Office & Home Supplies',
 'lawn garden s

In [12]:
merchant_df['segment']  = merchant_df['clean_goods_str'].map(lambda x: segments_dict[x])

In [13]:
merchant_df.segment.value_counts()

segment
Entertainment & Media     1153
Office & Home Supplies     937
Technology                 672
Miscellaneous              654
Beauty                     610
Name: count, dtype: int64

In [14]:
merchant_df[['merchant_abn', 'segment', 'goods']].to_parquet("../data/curated/merchant_segment.parquet")

**Ranking with respect to each segment**

In [15]:
initial_ranking = spark.read.parquet("../data/curated/merchant_ranking")
merchant_segment = spark.read.parquet("../data/curated/merchant_segment.parquet")

In [16]:
# Join each merchant in initial ranking by segment
merchant_segment_ranking = initial_ranking.join(merchant_segment, how='left', on='merchant_abn')

In [17]:
segments = [
    "Entertainment & Media",
    "Office & Home Supplies",
    "Miscellaneous",
    "Beauty",
    "Technology"
]

for segment in segments:
    print(segment)
    segment_ranking = merchant_segment_ranking.filter(F.col('segment') == segment)
    segment_ranking.orderBy(F.col('final_score').desc()).show(10)

Entertainment & Media
+------------+------------------+--------------------+--------------------+
|merchant_abn|       final_score|             segment|               goods|
+------------+------------------+--------------------+--------------------+
| 63290521567|0.6726340419120908|Entertainment & M...|artist supply and...|
| 95824231566|0.6717981482748292|Entertainment & M...|art dealers and g...|
| 72472909171|0.6629478835183298|Entertainment & M...|digital goods: bo...|
| 64403598239|0.6603104474705002|Entertainment & M...|music shops - mus...|
| 49505931725|0.6350675863404268|Entertainment & M...|digital goods: bo...|
| 47086412084|0.6131917566949479|Entertainment & M...|digital goods: bo...|
| 98973094975|0.6083415618974026|Entertainment & M...|hobby, toy and ga...|
| 24852446429|0.6065823800542874|Entertainment & M...|florists supplies...|
| 43186523025|0.6061470687265811|Entertainment & M...|florists supplies...|
| 75034515922|0.5787559827897312|Entertainment & M...|digital good