In [1]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

In [2]:
# Load the merchant table and drop the last 25 characters of every tag string,
# which (per the original note) hold the revenue level and take rate.
data = (
    pd.read_parquet("../data/tables/tbl_merchants.parquet")
    .assign(tags=lambda merchants: merchants['tags'].str.slice(stop=-25))
)

In [3]:
STOPWORDS = set(stopwords.words('english'))

# Built once and shared by every preprocess_text call instead of being
# re-created for each row.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(x):
    """Normalise a raw tag string into space-separated, lemmatized keywords.

    Steps: lower-case, strip ASCII punctuation, drop English stop words and
    lemmatize the remaining tokens.

    A token is dropped when either the token itself or its lemma is a stop
    word. Checking only the lemma lets some stop words through, because the
    WordNet lemmatizer can shorten them (e.g. "was" becomes "wa") so they no
    longer match STOPWORDS.

    Parameters
    ----------
    x : str
        Raw tag text. Any non-string value (e.g. None or NaN for a missing
        tag) is treated as empty text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; "" when nothing is left.
    """
    # Missing tags arrive as None/NaN; treat them as empty text rather than
    # failing on .lower().
    if not isinstance(x, str):
        return ""

    # Lower case and remove all punctuation
    tokens = x.lower().translate(_PUNCT_TABLE).split()

    kept = []
    for token in tokens:
        # Remove stop words before lemmatizing ...
        if token in STOPWORDS:
            continue
        # ... and again afterwards, in case the lemma itself is a stop word.
        lemma = _LEMMATIZER.lemmatize(token)
        if lemma not in STOPWORDS:
            kept.append(lemma)

    return " ".join(kept)

In [4]:
# Run the text clean-up on every merchant's tag string.
data['tags_clean'] = data['tags'].apply(preprocess_text)

In [5]:
# One-hot (binary bag-of-words) encode the cleaned tags: one 0/1 column per
# token, one row per merchant.
count_vectorizer = CountVectorizer(binary=True)
datavec = count_vectorizer.fit_transform(data['tags_clean'])
count_array = datavec.toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
df = pd.DataFrame(data=count_array, columns=count_vectorizer.get_feature_names_out())
df



Unnamed: 0,al,antique,appliance,art,artist,awning,beauty,bicycle,book,cable,...,supply,system,telecom,television,tent,tool,toy,vehicle,watch,writing
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4021,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4022,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4023,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4024,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [6]:
# n_init is pinned to 10 (the default before scikit-learn 1.4, which switched
# to 'auto') so the clustering does not change with the installed version.
km = KMeans(n_clusters=3, n_init=10, random_state=0)
# fit_transform fits the model and returns every row's distance to each of the
# three cluster centres.
df_clust = km.fit_transform(df)

In [7]:
# Cluster index assigned by the fitted KMeans model to each row of df.
km.labels_

array([2, 1, 0, ..., 0, 1, 1], dtype=int32)