In [15]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
import string
from kmodes.kmodes import KModes
%matplotlib inline

In [13]:
data = pd.read_parquet("../data/tables/tbl_merchants.parquet")

# Remove revenue level and take rate
data['tags']= data['tags'].str[0:-25]

Unnamed: 0_level_0,name,tags
merchant_abn,Unnamed: 1_level_1,Unnamed: 2_level_1
10023283211,Felis Limited,"((furniture, home furnishings and equipment sh..."
10142254217,Arcu Ac Orci Corporation,"([cable, satellite, and otHer pay television a..."
10165489824,Nunc Sed Company,"([jewelry, watch, clock, and silverware shops]"
10187291046,Ultricies Dignissim Lacus Foundation,"([wAtch, clock, and jewelry repair shops]"
10192359162,Enim Condimentum PC,"([music shops - musical instruments, pianos, a..."
...,...,...
99938978285,Elit Dictum Eu Ltd,"[(opticians, optical goods, and eyeglasses)"
99974311662,Mollis LLP,"((books, periodicals, and newspapers)"
99976658299,Sociosqu Corp.,((shoe shops)
99987905597,Commodo Hendrerit LLC,[[motor vehicle Supplies and new parts]


In [3]:
STOPWORDS = set(stopwords.words('english'))

# Function for preprocessing tags text
def preprocess_text(x):
    
    # Lower case and remove all punctutations
    x = x.lower()
    x = x.translate(str.maketrans('', '', string.punctuation))

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    x = " ".join(lemmatizer.lemmatize(word) for word in x.split())

    # Remove stop words
    x =  ' '.join([word for word in x.split() if word not in STOPWORDS])
    return x

In [14]:
data['tags_clean'] = data['tags'].map(preprocess_text)
data

Unnamed: 0_level_0,name,tags,tags_clean
merchant_abn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10023283211,Felis Limited,"((furniture, home furnishings and equipment sh...",furniture home furnishing equipment shop manuf...
10142254217,Arcu Ac Orci Corporation,"([cable, satellite, and otHer pay television a...",cable satellite pay television radio service
10165489824,Nunc Sed Company,"([jewelry, watch, clock, and silverware shops]",jewelry watch clock silverware shop
10187291046,Ultricies Dignissim Lacus Foundation,"([wAtch, clock, and jewelry repair shops]",watch clock jewelry repair shop
10192359162,Enim Condimentum PC,"([music shops - musical instruments, pianos, a...",music shop musical instrument piano sheet music
...,...,...,...
99938978285,Elit Dictum Eu Ltd,"[(opticians, optical goods, and eyeglasses)",optician optical good eyeglass
99974311662,Mollis LLP,"((books, periodicals, and newspapers)",book periodical newspaper
99976658299,Sociosqu Corp.,((shoe shops),shoe shop
99987905597,Commodo Hendrerit LLC,[[motor vehicle Supplies and new parts],motor vehicle supply new part


In [12]:
# One Hot Encode tags
count_vectorizer = CountVectorizer(binary=True)
datavec  = count_vectorizer.fit_transform(data['tags_clean'])
count_array = datavec.toarray()
df = pd.DataFrame(data=count_array,columns = count_vectorizer.get_feature_names())

# sns.pairplot(df)
name = count_vectorizer.get_feature_names()



In [22]:
kmode = KModes(n_clusters=5, init = "random", n_init = 5, verbose=1)
kmode.fit_predict(data)
kmode_clusters = kmode.labels_

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 10989.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 20, cost: 11367.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 0, cost: 11193.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 0, cost: 11122.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 42, cost: 11110.0
Best run was number 1


In [6]:
df

Unnamed: 0,al,antique,appliance,art,artist,awning,beauty,bicycle,book,cable,...,supply,system,telecom,television,tent,tool,toy,vehicle,watch,writing
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4021,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4022,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4023,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4024,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [7]:
# list of tags
km = KMeans(n_clusters=5, random_state=0)
df_clust = km.fit_transform(df)
clusters = km.fit_predict(df)

In [8]:
n_clusters = [3,4,5]

for n in n_clusters:
    # fit K-Means clustering with different number of clusters
    km = KMeans(n_clusters=n, random_state=0)
    km.fit_predict(df)
    
    # evaluate silhouette score of each model
    score = silhouette_score(df, km.labels_, metric='euclidean')
    print(f'Silhouette score for {n} clusters: {score:.3f}')

Silhouette score for 3 clusters: 0.172
Silhouette score for 4 clusters: 0.216
Silhouette score for 5 clusters: 0.262


In [30]:
data['clusters'] = km.labels_
# print(data[data['clusters'] == 1])

data['kmode_clusters'] = kmode.labels_
print(data[data['kmode_clusters'] == 3])

                                    name  \
merchant_abn                               
10945019164               Nam Associates   
11121775571      Egestas Nunc Associates   
11285988014       Tincidunt Pede Company   
11981237946   Sed Molestie Sed Institute   
13136513766        Pede Nec Incorporated   
...                                  ...   
97353800246                  Enim Mi LLC   
98722839745     Interdum Ligula Eu Corp.   
99115883676               Non Enim Corp.   
99217762645                Non Ante Inc.   
99904689266                Dictum Eu Ltd   

                                                tags  \
merchant_abn                                           
10945019164   [(digital goods: books, movies, music)   
11121775571   [[digital goods: books, movies, music]   
11285988014   [[digital goods: books, moviEs, music]   
11981237946   [[digital goods: books, moVies, music]   
13136513766   [(digital goods: books, movies, music)   
...                                

In [10]:
cluster_mapping = {0: 'Gardening', 1: 'Appliances', 2: 'Furnitures', 3: 'IT service', 4: 'Accessories'}
data['merchant_segments'] = data['clusters'].map(cluster_mapping)



In [24]:
data

Unnamed: 0_level_0,name,tags,tags_clean,clusters,kmode_clusters
merchant_abn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10023283211,Felis Limited,"((furniture, home furnishings and equipment sh...",furniture home furnishing equipment shop manuf...,2,0
10142254217,Arcu Ac Orci Corporation,"([cable, satellite, and otHer pay television a...",cable satellite pay television radio service,1,0
10165489824,Nunc Sed Company,"([jewelry, watch, clock, and silverware shops]",jewelry watch clock silverware shop,4,0
10187291046,Ultricies Dignissim Lacus Foundation,"([wAtch, clock, and jewelry repair shops]",watch clock jewelry repair shop,4,0
10192359162,Enim Condimentum PC,"([music shops - musical instruments, pianos, a...",music shop musical instrument piano sheet music,4,0
...,...,...,...,...,...
99938978285,Elit Dictum Eu Ltd,"[(opticians, optical goods, and eyeglasses)",optician optical good eyeglass,1,0
99974311662,Mollis LLP,"((books, periodicals, and newspapers)",book periodical newspaper,1,0
99976658299,Sociosqu Corp.,((shoe shops),shoe shop,4,0
99987905597,Commodo Hendrerit LLC,[[motor vehicle Supplies and new parts],motor vehicle supply new part,1,0
