In [1]:
import string
import collections
import re
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD, IncrementalPCA
from sklearn.cluster import KMeans

from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [2]:
all_df = pd.read_csv('dataset.csv', index_col=0)

In [3]:
all_df.head()

Unnamed: 0,Review Text
0,Не скачивается стала обновлять и зависло прило...
1,Приложение не открывается после обновления Уже...
2,Добрый день после последнего обновления пропал...
3,Не могу скачать так как требует подключения Wifi
4,Почему то не могу оплатить при минусе на балан...


In [4]:
def process_text(text, stem=True):
    text = text.translate(str.maketrans('','',string.punctuation))
    tokens = word_tokenize(text)
 
    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens]
 
    return tokens

def cluster_texts(texts, clusters=3):
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stopwords.words('russian'),
                                 max_df=0.5,
                                 min_df=0.1,
                                 lowercase=True)
 
    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=clusters)
    km_model.fit(tfidf_model)
 
    clustering = collections.defaultdict(list)
 
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
 
    return clustering

In [None]:
clusters = cluster_texts(all_df['Review Text'].to_numpy(), 6)
clusters.keys()

In [None]:
for index, entity in enumerate(all_df.values):
    for cluster in clusters.keys():
        if index in clusters[cluster]:
            print('{0},{1},{2}'.format(index, entity[0], cluster))

In [5]:
vec = TfidfVectorizer()
vec.fit(all_df['Review Text'].values)
features = vec.transform(all_df['Review Text'].values)

In [6]:
cls = MiniBatchKMeans(n_clusters=6, random_state=0)
cls.fit(features)

MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=6, n_init=3, random_state=0, reassignment_ratio=0.01,
                tol=0.0, verbose=0)

In [7]:
cls.predict(features)

array([2, 2, 2, ..., 1, 2, 2])

In [12]:
ipca = IncrementalPCA(n_components=5,copy=False,
    batch_size=(all_df.shape[0] // 100))
reduced_features = ipca.fit_transform(features.toarray())

MemoryError: 

In [None]:
reduced_cluster_centers = ipca.transform(cls.cluster_centers_)

In [None]:
plt.scatter(reduced_features[:,0], reduced_features[:,1], c=cls.predict(features))
plt.scatter(reduced_cluster_centers[:, 0], reduced_cluster_centers[:,1], marker='x', s=150, c='b')