In [17]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import plotly.express as px
import collections

#Libraries for preprocessing
from gensim.parsing.preprocessing import remove_stopwords
import string
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import webcolors

#Download once if using NLTK for preprocessing
import nltk
nltk.download('punkt')

#Libraries for vectorisation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from fuzzywuzzy import fuzz

#Libraries for clustering
from sklearn.cluster import KMeans

#Load data set
df = pd.read_csv('Products.csv', encoding= 'ANSI')
text1 = df.PRODUCTTYPE.unique()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fejzullin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
def stemSentence(sentence):
    porter = PorterStemmer()
    token_words = word_tokenize(sentence)
    stem_sentence = [porter.stem(word) for word in token_words]
    return ' '.join(stem_sentence)

text3 = pd.Series([stemSentence(x.replace('/', ' ')) for x in text1])

In [19]:
text3

0                  крем гель для душа
1            парфюмерная вода женская
2      парфюмированный скраб для тела
3       парфюмированный крем для тела
4          парфюмированный дезодорант
                    ...              
528                   масло для ванны
529      молочко для интимной гигиены
530                    маска защитная
531                  краска для волос
532            губка для душа и ванны
Length: 533, dtype: object

In [31]:
stop_w = ["для", "на", "и", "средство", "набор", "после"]

In [32]:
# vectorizer_ntf = TfidfVectorizer(analyzer='word',ngram_range=(1,2), stop_words = stop_w)
# X_ntf = vectorizer_ntf.fit_transform(text3)
vectorizer_hash = HashingVectorizer(analyzer='word',ngram_range=(1,2), stop_words = stop_w)
X_ntf = vectorizer_hash.fit_transform(text3)

In [33]:
# sse={}
# for k in np.arange(100,900,100):
#     kmeans = KMeans(n_clusters=k, verbose = True).fit(X_ntf)
#     sse[k] = kmeans.inertia_
# plt.plot(list(sse.keys()),list(sse.values()))
# plt.xlabel('Values for K')
# plt.ylabel('SSE')
# plt.show();

In [34]:
# plt.plot(list(sse.keys()),list(sse.values()))
# plt.xlabel('Values for K')
# plt.ylabel('SSE')
# plt.show();

In [35]:
kmeans = KMeans(n_clusters= 50, verbose = True)
kmeans.fit(X_ntf)


Initialization complete
Iteration 0, inertia 624.612869954549
Iteration 1, inertia 352.2707568638691
Iteration 2, inertia 347.7690801085768
Iteration 3, inertia 341.3389533352066
Iteration 4, inertia 339.4347232891601
Iteration 5, inertia 339.17382152314514
Converged at iteration 5: strict convergence.
Initialization complete
Iteration 0, inertia 622.4733982250544
Iteration 1, inertia 346.84931639838265
Iteration 2, inertia 339.52022739294426
Iteration 3, inertia 338.6765275305651
Iteration 4, inertia 338.36520928512465
Iteration 5, inertia 338.06898674203495
Converged at iteration 5: strict convergence.
Initialization complete
Iteration 0, inertia 622.6553732227023
Iteration 1, inertia 356.13624384488645
Iteration 2, inertia 345.00281309343734
Iteration 3, inertia 343.1942997968373
Iteration 4, inertia 342.5055664435472
Iteration 5, inertia 341.8343781342126
Iteration 6, inertia 341.4695991861369
Converged at iteration 6: strict convergence.
Initialization complete
Iteration 0, inerti

KMeans(n_clusters=50, verbose=True)

In [36]:
result = pd.DataFrame(text3)
result['cluster'] = kmeans.predict(X_ntf)

In [37]:
result = result.rename(columns={0:'PRODUCTTYPE'})
display(result.sort_values('cluster'))

Unnamed: 0,PRODUCTTYPE,cluster
140,кольцо для смешивания,0
220,набор подарочный,0
221,средство для шерсти,0
404,пакет,0
88,скребок для языка,0
...,...,...
496,глина для душа,48
332,парфюмированное молочко для душа,48
532,губка для душа и ванны,48
472,праймер для ресниц,49


In [38]:
result = result.sort_values('cluster', ascending=False).drop_duplicates('PRODUCTTYPE').sort_index()

In [39]:
result[['cluster', 'PRODUCTTYPE']].sort_values('cluster').to_excel('NewClusteringProducts.xlsx', index = False) 