In [1]:
import pandas as pd
import numpy as np
from classes.Breakdown import df_transform
from classes.UniVectorizer import Vectorizer
from classes.MvideoPreprocessor import TextPreprocessor
# from Preprocessing import df_transform, EM_Pomoshnik_TextPreprocessor, TextPreprocessor

# Препроцессинг

In [None]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

# NLTK stop-word lists are addressed by lowercase file id; 'Russian' only
# resolves on case-insensitive filesystems.
sw = stopwords.words('russian')
# Extra stop words appended to the NLTK list for this dataset.
# NOTE(review): 'こんにちは4464' and '⅞4467' look like garbled entries -- confirm the intended words.
sw += ['ответ', 'подходит', 'спасибо', 'здравствуйте', 'добрый', 'день', 'こんにちは4464', 'мочь', 'подсказать', 'привет', '⅞4467', 'заказ',
       'товар', 'клиент', 'жанар', 'жанна', 'жарков', 'жать', 'могу', 'создать', 'делать', 'сделать', 'который', 'вопрос']

# Load the raw export and reshape it with the project helper; the result is
# expected to expose at least the 'line' and 'sender' columns used below.
df = df_transform(pd.read_excel('data/empom_autofaq.xlsx'))
tp = TextPreprocessor(method='lemma', stop_words=sw)
df['line'] = tp.fit_transform(df['line'])

# TF-IDF over unigrams and bigrams, fitted on every message.
vect = Vectorizer('tfidf', max_df=0.9, min_df=0.01, ngram_range=(1, 2))
X = vect.fit_transform(df['line'])

# Select rows by position with a plain boolean array so the selection does not
# depend on the DataFrame index or on how the matrix type treats a Series mask.
is_user = (df['sender'] == 'user').to_numpy()
user_messages = X[is_user]

In [None]:
# Preview the first five preprocessed messages.
df['line'].head(5)

In [None]:
!pip install kneed

# Сколько кластеров?
(для оценки использовал инерцию от KMeans)

In [5]:
!pip install tqdm



In [None]:
from sklearn.cluster import KMeans
from kneed import KneeLocator
import matplotlib.pyplot as plt
from tqdm import tqdm

# Candidate cluster counts and the KMeans inertia obtained for each of them.
K = range(1, 50)
inertia = []
for n_clusters in tqdm(K):
    # Fixed seed so the inertia curve (and the knee derived from it) is reproducible.
    km = KMeans(n_clusters=n_clusters, random_state=42)
    km.fit(user_messages)
    inertia.append(km.inertia_)

# Locate the knee against the real cluster counts (1..49), not 0-based positions,
# so that knee.knee can be used directly as a number of clusters.
knee = KneeLocator(list(K), inertia, S=1, curve='convex', direction='decreasing', interp_method='polynomial')

# plot_knee opens its own figure, so the size is passed to it instead of
# creating a separate (and otherwise empty) figure beforehand.
knee.plot_knee(figsize=(5, 5))
plt.title('Elbow Method')
plt.xlabel('cluster numbers')
plt.ylabel("Inertia")
plt.show()

print(f"Optimal number of clusters: {knee.knee}")

# Function for topic-definition (LDA)

In [7]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.metrics import silhouette_score

def get_topics(data, labels, terms, n_top_words=15):
    """Print the highest-weighted terms of every cluster.

    A single-component LDA model is fitted on the rows of each cluster and the
    terms with the largest component weights are printed, most important first.

    Parameters
    ----------
    data : matrix of shape (n_samples, n_features)
        Feature matrix; must support row selection with a boolean mask.
    labels : array-like of int, length n_samples
        Cluster id of every row. Negative ids (for example the -1 used for
        unassigned points) are skipped.
    terms : sequence
        Maps a feature (column) index to its term.
    n_top_words : int, default 15
        Number of terms printed per cluster.

    Returns
    -------
    None
        The topics are only printed.
    """
    labels = np.asarray(labels)
    lda = LDA(n_components=1)
    # Iterate over the ids that actually occur: an id without members would
    # make LDA fit on an empty matrix.
    for c in np.unique(labels[labels >= 0]):
        lda.fit(data[labels == c])

        # argsort is ascending, so reverse it before cutting; this keeps the
        # single highest-weighted term, which a [-16:-1] slice would drop.
        top_indices = lda.components_[0].argsort()[::-1][:n_top_words]
        print(f"Topic {c}:")
        print([terms[j] for j in top_indices])

# Number of clusters passed as n_clusters to the clustering cells below,
# taken from the knee located on the KMeans inertia curve.
k = knee.knee

# MiniBatch KMeans

In [None]:
from sklearn.cluster import MiniBatchKMeans

# Fixed seed: mini-batch sampling and centroid initialisation are random, so
# without it the silhouette and topics change on every run.
mb_kmeans = MiniBatchKMeans(n_clusters=k, random_state=42)
mb_kmeans.fit(user_messages)

print(f"Silhouette for MiniBatch KMeans: {silhouette_score(user_messages, mb_kmeans.labels_)}")
get_topics(user_messages, mb_kmeans.labels_, vect.get_feature_names())

# Regular KMeans

In [None]:
from sklearn.cluster import KMeans

# To override the knee-based choice, assign k manually here (e.g. k = 15).
# Fixed seed: centroid initialisation is random, and this model is reused by
# the pyLDAvis cell, so it should be the same on every run.
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(user_messages)

print(f"Silhouette for Regular KMeans: {silhouette_score(user_messages, kmeans.labels_)}")
get_topics(user_messages, kmeans.labels_, vect.get_feature_names())

# DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

# Try eps from 0.70 to 0.80 in steps of 0.01 and print how many points end up
# under each label. After the loop `db` holds the model for the last eps tried.
for eps_pct in range(70, 81):
    eps = eps_pct / 100
    db = DBSCAN(eps=eps).fit(user_messages)
    clusters = {}
    for label in db.labels_:
        clusters[label] = clusters.get(label, 0) + 1
    print(f"Eps: {eps}\t|{clusters}")

In [None]:
# DBSCAN marks unassigned (noise) points with the label -1. They are not a
# cluster, so they are excluded before scoring.
clustered = db.labels_ != -1
n_found = len(set(db.labels_[clustered]))
if n_found >= 2:
    print(f"Silhouette for DBSCAN: {silhouette_score(user_messages[clustered], db.labels_[clustered])}")
else:
    # silhouette_score needs at least two distinct labels and raises otherwise.
    print(f"Silhouette for DBSCAN: undefined ({n_found} cluster(s) found)")
get_topics(user_messages, db.labels_, vect.get_feature_names())

# Agglomerative Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import LatentDirichletAllocation as LDA

# The matrix is converted to a dense array for fitting only; scoring and topic
# extraction keep using the original matrix.
dense_messages = user_messages.toarray()
agg_res = AgglomerativeClustering(n_clusters=k).fit(dense_messages)
agg_labels = agg_res.labels_

agg_silhouette = silhouette_score(user_messages, agg_labels)
print(f"Silhouette score for Agglomerative: {agg_silhouette}")
get_topics(user_messages, agg_labels, vect.get_feature_names())

# BIRCH

In [None]:
from sklearn.cluster import Birch

# Fit BIRCH with the same number of clusters as the other models.
birch = Birch(n_clusters=k).fit(user_messages)
birch_labels = birch.labels_

birch_silhouette = silhouette_score(user_messages, birch_labels)
print(f"Silhouette score for BIRCH: {birch_silhouette}")
get_topics(user_messages, birch_labels, vect.get_feature_names())

# Visualizing clusters with pyLDAvis

In [None]:
!pip install pyLDAvis

In [None]:
import pyLDAvis
from kmeans_to_pyLDAvis.kmeans_to_pyLDAvis import kmeans_to_prepared_data

# Convert the fitted regular-KMeans model into pyLDAvis input and render it inline.
feature_names = vect.get_feature_names()
prep = kmeans_to_prepared_data(
    user_messages,
    feature_names,
    kmeans.cluster_centers_,
    kmeans.labels_,
    embedding_method='tsne',
)

pyLDAvis.display(prep)

In [17]:
# Write the visualisation as a standalone HTML page. The encoding is explicit
# so the output does not depend on the platform's default text encoding.
with open("kmeans_vis.html", "w", encoding="utf-8") as f:
    pyLDAvis.save_html(prep, f)