In [0]:
from google.colab import drive 
drive.mount('/gdrive')
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [0]:
import pandas as pd
from sklearn.cluster import AffinityPropagation, AgglomerativeClustering, DBSCAN, \
                            KMeans, MiniBatchKMeans, Birch, MeanShift, SpectralClustering
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, \
                            silhouette_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [0]:
data = pd.read_csv('/gdrive/My Drive/clustering/data.csv')

In [0]:
data = data[['category_name', 'title']]

In [0]:
data.head(5)

Unnamed: 0,category_name,title
0,Детская одежда и обувь,Платье праздничное для девочки
1,Мебель и интерьер,Кровать двухспальная
2,Телефоны,Samsung S3 mini gt-i8190
3,"Одежда, обувь, аксессуары","Красивое платье на одно плечо, в идеале размер м"
4,Детская одежда и обувь,Стильная панама


### Задание 1
На нескольких алгоритмах кластеризации, умеющих работать с sparse матрицами, проверьте, что работает лучше Count_Vectorizer или TfidfVectorizer (попробуйте выжать максимум из каждого - попробуйте нграммы, символьные нграммы, разные значения max_features и min_df) 

### KMeans
CountVectorizer

In [0]:
from sklearn.cluster import KMeans

In [0]:
sample = data.sample(frac=0.1)

In [0]:
cv = CountVectorizer(max_features=300, ngram_range=(1, 5), max_df=0.2)
svd = TruncatedSVD(50)
X = cv.fit_transform(sample['title'])
X_svd = svd.fit_transform(X)

y = sample['category_name']

In [0]:
cluster = KMeans(n_clusters=47)
cluster.fit(X_svd)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=47, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [0]:
labels = cluster.labels_

In [0]:
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X[:10000], labels[:10000]))

Silhouette Coefficient: 0.360


In [0]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels)) 
print("Completeness: %0.3f" % metrics.completeness_score(y, labels)) 
print("V-measure: %0.3f" % metrics.v_measure_score(y, labels)) 

print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(y, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(y, labels))

Homogeneity: 0.275
Completeness: 0.364
V-measure: 0.313
Adjusted Rand Index: -0.012
Adjusted Mutual Information: 0.302


TfidfVectorizer

In [0]:
tfidf = TfidfVectorizer(max_features=300, ngram_range=(1, 5), max_df=0.1)
svd = TruncatedSVD(50)
X = tfidf.fit_transform(sample['title'])
X_svd = svd.fit_transform(X)

y = sample['category_name']

In [0]:
cluster = KMeans(n_clusters=47)
cluster.fit(X_svd)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=47, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [0]:
labels = cluster.labels_

In [0]:
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X[:10000], labels[:10000]))

Silhouette Coefficient: 0.406


In [0]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels)) 
print("Completeness: %0.3f" % metrics.completeness_score(y, labels)) 
print("V-measure: %0.3f" % metrics.v_measure_score(y, labels))  

print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(y, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(y, labels))

Homogeneity: 0.309
Completeness: 0.372
V-measure: 0.337
Adjusted Rand Index: -0.002
Adjusted Mutual Information: 0.327


В Kmeans при одних и тех же подобранных оптимальных параметрах tfidfvectorizer показывает себя лучше, чем countvectorizer. V-measure при tfidf векторизаторе больше, также как и  Silhouette Coefficient. 

### MiniBatchKMeans

CountVectorizer


In [0]:
cv = CountVectorizer(max_features=500, ngram_range=(1, 5), analyzer='word', max_df=0.2)
svd = TruncatedSVD(200)
X = cv.fit_transform(data['title'])
X_svd = svd.fit_transform(X)

y = data['category_name']

In [0]:
cluster = MiniBatchKMeans(n_clusters=1000, init_size=5000, verbose=1, max_iter=5000, 
                          max_no_improvement=100, reassignment_ratio=0.3)
cluster.fit(X_svd)

In [0]:
labels = cluster.labels_

In [0]:
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X[:10000], labels[:10000]))

Silhouette Coefficient: 0.590


In [0]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels)) 
print("Completeness: %0.3f" % metrics.completeness_score(y, labels)) 
print("V-measure: %0.3f" % metrics.v_measure_score(y, labels)) 

print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(y, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(y, labels))

Homogeneity: 0.486
Completeness: 0.304
V-measure: 0.374
Adjusted Rand Index: -0.011
Adjusted Mutual Information: 0.362


TfidfVectorizer

In [0]:
tfidf = TfidfVectorizer(max_features=500, ngram_range=(1, 5), analyzer='word', max_df=0.2)
svd = TruncatedSVD(200)
X = tfidf.fit_transform(data['title'])
X_svd = svd.fit_transform(X)

y = data['category_name']

In [0]:
cluster = MiniBatchKMeans(n_clusters=1000, init_size=5000, verbose=1, max_iter=5000, 
                          max_no_improvement=100, reassignment_ratio=0.3)
cluster.fit(X_svd)

In [0]:
labels = cluster.labels_

In [0]:
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X[:10000], labels[:10000]))

Silhouette Coefficient: 0.356


In [0]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels)) 
print("Completeness: %0.3f" % metrics.completeness_score(y, labels)) 
print("V-measure: %0.3f" % metrics.v_measure_score(y, labels)) 

print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(y, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(y, labels))

Homogeneity: 0.509
Completeness: 0.310
V-measure: 0.385
Adjusted Rand Index: -0.012
Adjusted Mutual Information: 0.373


В данном случае tfidfvectorizer показал себя снова немного лучше. 

### Задание 2 

На нескольких алгоритмах кластеризации проверьте, какое матричное разложение (TruncatedSVD или NMF) работает лучше для кластеризации.

TruncatedSVD 

1. KMeans

In [0]:
sample = data.sample(frac=0.1)
cv = CountVectorizer(max_features=300, ngram_range=(1, 3), max_df=0.2)
svd = TruncatedSVD(50)
X = cv.fit_transform(sample['title'])
X_svd = svd.fit_transform(X)

y = sample['category_name']

In [0]:
cluster = KMeans(n_clusters=47)
cluster.fit(X_svd)
labels = cluster.labels_

In [0]:
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X[:10000], labels[:10000]))

Silhouette Coefficient: 0.357


In [0]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels)) 
print("Completeness: %0.3f" % metrics.completeness_score(y, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(y, labels)) 
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(y, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(y, labels))

Homogeneity: 0.279
Completeness: 0.370
V-measure: 0.318
Adjusted Rand Index: -0.012
Adjusted Mutual Information: 0.307


2. MiniBatchKMeans

In [0]:
sample = data.sample(frac=0.1)
cv = CountVectorizer(max_features=500, ngram_range=(1, 3))
svd = TruncatedSVD(50)
X = cv.fit_transform(sample['title'])
X_svd = svd.fit_transform(X)

y = sample['category_name']

In [0]:
cluster = MiniBatchKMeans(n_clusters=1000, init_size=5000, verbose=1, max_iter=5000, 
                          max_no_improvement=100, reassignment_ratio=0.3)
cluster.fit(X_svd)
labels = cluster.labels_

In [0]:
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X[:10000], labels[:10000]))

Silhouette Coefficient: 0.307


In [0]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels)) 
print("Completeness: %0.3f" % metrics.completeness_score(y, labels)) 
print("V-measure: %0.3f" % metrics.v_measure_score(y, labels)) 
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(y, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(y, labels))

Homogeneity: 0.501
Completeness: 0.303
V-measure: 0.377
Adjusted Rand Index: -0.014
Adjusted Mutual Information: 0.302


3. AgglomerativeClustering

In [0]:
sample = data.sample(frac=0.05)
cv = CountVectorizer(max_features=500)
svd = TruncatedSVD(50)
X = cv.fit_transform(sample['title'])
X_svd = svd.fit_transform(X)

y = sample['category_name']

In [0]:
cluster = AgglomerativeClustering(n_clusters=170)
cluster.fit(X_svd)
labels = cluster.labels_

In [0]:
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

Silhouette Coefficient: 0.260


In [0]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels)) 
print("Completeness: %0.3f" % metrics.completeness_score(y, labels)) 
print("V-measure: %0.3f" % metrics.v_measure_score(y, labels)) 
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(y, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(y, labels))

Homogeneity: 0.369
Completeness: 0.358
V-measure: 0.364
Adjusted Rand Index: -0.007
Adjusted Mutual Information: 0.319


NMF

1. KMeans

In [0]:
sample = data.sample(frac=0.1)
cv = CountVectorizer(max_features=300, ngram_range=(1, 3), max_df=0.2)
nmf = NMF(50)
X = cv.fit_transform(sample['title'])
X_nmf = nmf.fit_transform(X)

y = sample['category_name']

In [0]:
cluster = KMeans(n_clusters=47)
cluster.fit(X_nmf)
labels = cluster.labels_

In [0]:
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X[:10000], labels[:10000]))

Silhouette Coefficient: 0.384


In [0]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels)) 
print("Completeness: %0.3f" % metrics.completeness_score(y, labels)) 
print("V-measure: %0.3f" % metrics.v_measure_score(y, labels)) 

print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(y, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(y, labels))

Homogeneity: 0.266
Completeness: 0.351
V-measure: 0.302
Adjusted Rand Index: -0.013
Adjusted Mutual Information: 0.291


2. MiniBatchKMeans

In [0]:
sample = data.sample(frac=0.05)
cv = CountVectorizer(max_features=500)
nmf = NMF(50)
X = cv.fit_transform(sample['title'])
X_nmf = nmf.fit_transform(X)

y = sample['category_name']

In [0]:
cluster = MiniBatchKMeans(n_clusters=1000, init_size=5000, verbose=1, max_iter=5000, 
                          max_no_improvement=100, reassignment_ratio=0.3)
cluster.fit(X_nmf)
labels = cluster.labels_

In [0]:
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X[:10000], labels[:10000]))

Silhouette Coefficient: 0.506


In [0]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels)) 
print("Completeness: %0.3f" % metrics.completeness_score(y, labels)) 
print("V-measure: %0.3f" % metrics.v_measure_score(y, labels))

print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(y, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(y, labels))

Homogeneity: 0.565
Completeness: 0.317
V-measure: 0.406
Adjusted Rand Index: -0.007
Adjusted Mutual Information: 0.282


3. AgglomerativeClustering

In [0]:
sample = data.sample(frac=0.05)
cv = CountVectorizer(max_features=500, ngram_range=(1,3))
nmf = NMF(50)
X = cv.fit_transform(sample['title'])
X_nmf = nmf.fit_transform(X)

y = sample['category_name']

In [0]:
cluster = AgglomerativeClustering(n_clusters=170)
cluster.fit(X_nmf)
labels = cluster.labels_

In [0]:
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

Silhouette Coefficient: 0.323


In [0]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels)) 
print("Completeness: %0.3f" % metrics.completeness_score(y, labels)) 
print("V-measure: %0.3f" % metrics.v_measure_score(y, labels)) 

print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(y, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(y, labels))

Homogeneity: 0.316
Completeness: 0.348
V-measure: 0.331
Adjusted Rand Index: -0.018
Adjusted Mutual Information: 0.285


Вывод: в целом TruncatedSVD и NMF показывают схожие параметры. В 2 из 3 случаев V-measure у TruncatedSVD был выше, однако NMF показал самый высокий результат в V-measure вообще. 

### Задание 3 

С помощью алгоритмов, умеющих выделять выбросы, попробуйте найти необычные объявления (необычные - это такие, которые непонятно к какой категории можно вообще отнести, что-то с ошибками или вообще какая-то дичь). В этом задании можно использовать любую векторизацию. 

In [0]:
sample = data.sample(frac=0.01)

In [0]:
cv = CountVectorizer(max_features=2000, ngram_range=(1, 3))
svd = TruncatedSVD(50)
X = cv.fit_transform(sample['title'])
X_svd = svd.fit_transform(X)

y = sample['category_name']

In [0]:
import sklearn
sklearn.cluster.estimate_bandwidth(X_svd)

0.7443296061117501

In [0]:
cluster = MeanShift(cluster_all=False, bandwidth=0.7)
cluster.fit(X_svd)

MeanShift(bandwidth=0.7, bin_seeding=False, cluster_all=False, max_iter=300,
          min_bin_freq=1, n_jobs=None, seeds=None)

In [0]:
labels = cluster.labels_

In [0]:
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

Silhouette Coefficient: 0.076


In [0]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels)) 
print("Completeness: %0.3f" % metrics.completeness_score(y, labels)) 
print("V-measure: %0.3f" % metrics.v_measure_score(y, labels)) 

print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(y, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(y, labels))

Homogeneity: 0.321
Completeness: 0.366
V-measure: 0.342
Adjusted Rand Index: -0.011
Adjusted Mutual Information: 0.209


In [0]:
sample['cluster'] = cluster.labels_

In [0]:
sample[sample.cluster==-1].head(30)

Unnamed: 0,category_name,title,cluster
101809,"Одежда, обувь, аксессуары",Юбка из фатина,-1
51965,Детская одежда и обувь,Новая Дубленка,-1
22001,Планшеты и электронные книги,Планшет samsung galaxy Tab 10.1,-1
189191,Товары для детей и игрушки,Новая музыкальная игрушка панда,-1
146176,Оборудование для бизнеса,Витрина из алюминиевого профиля,-1
137221,Детская одежда и обувь,Текстильная обувь (новая),-1
196223,Телефоны,Айфон 6 16 гб в отличном состояние,-1
246379,"Одежда, обувь, аксессуары",Блузка новая bestia,-1
108883,Ремонт и строительство,"Ог 30 гр 1220(19К60) -7,5-0,75-5ду-2300/2300-У",-1
64778,Предложение услуг,Сруб 6х6х2.4м из зимнего леса,-1


В данной кластеризации какие-то необычные выбросы мною были не обнаружены. 

DBSCAN

In [0]:
sample = data.sample(frac=0.05)

In [0]:
cv = CountVectorizer(max_features=2000)
svd = TruncatedSVD(50)
X = cv.fit_transform(sample['title'])
X_svd = svd.fit_transform(X)

y = sample['category_name']

In [0]:
cluster = DBSCAN(min_samples=10, eps=0.3) 
cluster.fit(X_svd)

DBSCAN(algorithm='auto', eps=0.3, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=10, n_jobs=None, p=None)

In [0]:
labels = cluster.labels_

In [0]:
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

Silhouette Coefficient: 0.062


In [0]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels)) 
print("Completeness: %0.3f" % metrics.completeness_score(y, labels)) 
print("V-measure: %0.3f" % metrics.v_measure_score(y, labels)) 

print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(y, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(y, labels))

Homogeneity: 0.298
Completeness: 0.339
V-measure: 0.317
Adjusted Rand Index: -0.005
Adjusted Mutual Information: 0.282


In [0]:
sample['cluster'] = cluster.labels_

In [0]:
len(sample[sample.cluster==-1])

1275

In [0]:
sample[sample.cluster==-1][70:100]

Unnamed: 0,category_name,title,cluster
95350,Детская одежда и обувь,"Костюм зимний на мальчика ""Шалуны"" 92р",-1
168181,Детская одежда и обувь,Новый костюмчик на девочку,-1
208630,Детская одежда и обувь,Летняя обувь для мальчика 22 размера,-1
52836,Детская одежда и обувь,"Кроссовки Nike на девочку размер 22,5",-1
125262,Детская одежда и обувь,Костюм на девочку,-1
111365,Детская одежда и обувь,Комбинезон для мальчиков демисезонный,-1
152606,Детская одежда и обувь,Куртка весна-осень размер 120,-1
208292,Ремонт и строительство,"Ручки для ванны, новые, хром",-1
30771,Товары для детей и игрушки,Детский транспрот для дочки 3/1,-1
44439,"Одежда, обувь, аксессуары",Продаю костюм новый,-1


Выбросы DBSCAN также выглядят обычно. Возможно такой результат связан с необычностью самих данных. Какие-то определенные ошибки или непонятные объявления мною найдены не были. 