# Обучение модели

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
# Never truncate cell contents when a DataFrame is displayed. Current pandas
# spells "no limit" as None and no longer accepts the legacy -1, while very old
# releases only accept an integer, so try the modern form first.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
import numpy as np
np.set_printoptions(precision=2, suppress=True)
from scipy.sparse import csr_matrix, hstack
from scipy.spatial.distance import pdist, squareform
import re

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.models.word2vec import *

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, homogeneity_completeness_v_measure, silhouette_score, silhouette_samples
from sklearn.preprocessing import LabelEncoder

import itertools
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# ``IPython.html.widgets`` is a deprecated compatibility shim (the widgets
# moved to the ipywidgets package); ipywidgets is already required below, so
# take ``widgets`` from there directly.
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML
from ipywidgets import HBox, VBox, Layout, Output

from Preprocessing.text import Text, StopWords
stop_words_file = r"Preprocessing/Stop-words.txt"
stop_words = StopWords(stop_words_file)

from sklearn.cluster import KMeans, DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import tabulate

from Configs.MNIST_Fashion.configs import DATA_DIR, LOG_DIR
from Configs.MNIST_Fashion import mnist_reader
from Configs.MNIST_Fashion.helper import get_sprite_image


ModuleNotFoundError: No module named 'Preprocessing'

In [None]:
# Show the data and log directories imported from Configs.MNIST_Fashion.configs.
print(DATA_DIR)
print(LOG_DIR)

In [None]:
def get_data(kind='t10k', samples_per_class=100):
    """Load Fashion-MNIST and keep a class-balanced subset of it.

    The split named by ``kind`` is read from ``DATA_DIR`` with
    ``mnist_reader.load_mnist``, the samples are ordered by class id and at
    most ``samples_per_class`` samples are kept for every class. As a side
    effect a sprite image of the kept samples is written to
    ``DATA_DIR/mnist-fashion-sprite.png``.

    Args:
        kind: Split name forwarded to ``mnist_reader.load_mnist``.
        samples_per_class: Maximum number of samples kept per class.

    Returns:
        A tuple ``(X, Y, Y_str, df)``: the kept samples as an array, their
        integer class ids, their class names, and a DataFrame with the
        columns ``X``, ``Y`` and ``Y_str`` indexed ``0..n-1`` holding the
        same rows.
    """
    import os  # local import so the cell does not depend on the import cell

    class_names = ['t_shirt_top', 'trouser', 'pullover', 'dress', 'coat',
                   'sandal', 'shirt', 'sneaker', 'bag', 'ankle_boots']

    X, Y = mnist_reader.load_mnist(path=DATA_DIR, kind=kind)

    samples = pd.DataFrame({
        'X': X.tolist(),
        'Y': Y,
        'Y_str': [class_names[j] for j in Y],
    }, columns=['X', 'Y', 'Y_str'])
    samples = samples.sort_values(['Y'], ascending=[True], axis=0)

    # groupby().head() keeps the row order of the sorted frame, so this picks
    # the first rows of every class in one pass instead of growing a
    # DataFrame one row at a time.
    df = (samples.groupby('Y', sort=False)
                 .head(samples_per_class)
                 .reset_index(drop=True))

    # tolist() yields plain Python values, so the arrays get NumPy's default
    # dtypes regardless of the column dtypes.
    X = np.array(df['X'].tolist())
    Y = np.array(df['Y'].tolist())
    Y_str = np.array(df['Y_str'].tolist())

    plt.imsave(os.path.join(DATA_DIR, 'mnist-fashion-sprite.png'),
               get_sprite_image(X), cmap='gray')
    return X, Y, Y_str, df

# Load the data
X, Y, Y_str, df = get_data()
df.head()


In [None]:
Y_str.shape

In [None]:
# Distinct class ids and distinct class names, each in ascending order.
classes = sorted(set(Y))
classes_str = sorted(set(Y_str))

In [None]:
classes_str

# 5. Кластерный анализ

In [None]:
# Feature matrix shared by every clustering step below: the samples X returned
# by get_data(). The commented lines are an earlier variant that concatenated
# two embedding matrices instead.
# cluster_x = np.concatenate((doc2vec, word2vec), axis=1)
# cluster_x.shape

cluster_x = X
cluster_x

In [None]:
# KMeans with one cluster per ground-truth class and a fixed random_state.
kmeans = KMeans(n_clusters=len(classes), random_state=0)
kmeans.fit(cluster_x)
# Cluster id assigned to every sample of the data the model was fitted on.
kmeans_y = kmeans.predict(cluster_x)

In [None]:
kmeans_y.shape

In [None]:
# Copy of df that collects one extra column per clustering method.
cluster_df = df.assign(kmeans=kmeans_y)

In [None]:
# Density-based clustering of the same feature matrix.
# NOTE(review): cluster_x is the X returned by get_data(); if those are
# unscaled pixel intensities, eps=0.5 may leave every sample labelled as
# noise (-1) — verify against the label set shown in the next cell.
db = DBSCAN(eps=0.5, min_samples=10)
db.fit(cluster_x)
labels = db.labels_
# Number of clusters found: every distinct label except the -1 marker.
n_clusters = len(set(labels) - {-1})
n_clusters

In [None]:
set(labels)

In [None]:
cluster_df["dbscan"] = labels
cluster_df.head()

In [None]:
# Pairwise cosine distances between the rows of cluster_x, in the condensed
# form that squareform() expands into a square matrix further below.
metric = "cosine"
dist_matrix = pdist(cluster_x, metric=metric)

In [None]:
dist_matrix

In [None]:
# Hierarchical-clustering linkage built from the precomputed distances.
# NOTE(review): dist_matrix holds cosine distances, while SciPy documents the
# 'ward' method as correctly defined only for Euclidean distances — confirm
# this pairing is intended before trusting the cut thresholds used below.
Z = linkage(dist_matrix, 'ward')
Z

In [None]:
# Draw the untruncated dendrogram on a 30x30 inch canvas; every leaf label is
# replaced with an empty string (one entry per row of Z, plus one).
fig = plt.figure(figsize=(30, 30))
blank_labels = [""] * (len(Z) + 1)
dn = dendrogram(
    Z,
    color_threshold=3,
    labels=blank_labels,
    distance_sort="ascending",
    truncate_mode="none",
)

In [None]:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html
# Flat cluster ids obtained from Z with the "distance" criterion and a
# threshold of 2.4.
hierarch_1 = fcluster(Z, 2.4, criterion="distance")
hierarch_1

In [None]:
len(set(hierarch_1))

In [None]:
# Flat cluster ids obtained from Z with the "maxclust" criterion and a limit
# of 45.
hierarch_2 = fcluster(Z, 45, criterion="maxclust")
hierarch_2

In [None]:
len(set(hierarch_2))

In [None]:
# Store both hierarchical assignments in columns named after the fcluster
# criterion that produced them.
cluster_df["distance"] = hierarch_1
cluster_df["maxclust"] = hierarch_2

In [None]:
cluster_df.head()

In [None]:
# df[cluster_df[cluster_type.value] == cluster_name]
print(df.shape)
print(cluster_df.shape)
cluster_df

In [None]:
# Column holding the ground-truth category of every sample; it is read by the
# cluster browser and by the label encoding for the metrics below.
# column_name = 'cservice'
column_name = 'Y_str'

In [None]:
# Names of the cluster_df columns that hold the label assignments.
cluster_types = ["kmeans", "dbscan", "distance", "maxclust"]


def select_cluster_type(cluster_type):
    """Switch the cluster browser to another clustering method.

    Clears the shared output area, offers the distinct labels found in
    ``cluster_df[cluster_type]`` in the cluster selector, and shows the
    summary for the smallest of those labels.
    """
    with out:
        clear_output()

    available = list(set(cluster_df[cluster_type]))
    available.sort()

    clusters_list.options = available
    select_cluster(available[0])

    
def select_cluster(cluster_name):
    """Render the category breakdown of one cluster into ``out``.

    The clustering method is taken from the current value of the
    ``cluster_type`` selector. For every category present in the cluster the
    table shows how many of its samples fall into the cluster, which share of
    the cluster they make up, and which share of the whole category they
    represent.

    Args:
        cluster_name: Label of the cluster inside the column of
            ``cluster_df`` named by ``cluster_type.value``.
    """
    with out:
        clear_output()

        in_cluster = cluster_df[cluster_type.value] == cluster_name
        value_counts = df[in_cluster][column_name].value_counts()
        cluster_size = value_counts.sum()
        # Size of every category over the whole data set, computed once
        # instead of re-filtering df for each table row.
        category_totals = df[column_name].value_counts()

        headers = ["Наименование категории", "Количество", "Доля в текущем кластере", "Доля от всех заявок данной категории"]
        # The "%" presentation type scales the fraction by 100, so a share of
        # 0.5 is shown as 50.000% rather than 0.500%. Series.items() replaces
        # iteritems(), which newer pandas no longer provides.
        table = [
            [
                category,
                count,
                "{0:.3%}".format(count / cluster_size),
                "{0:.3%}".format(count / category_totals.loc[category]),
            ]
            for category, count in value_counts.items()
        ]
        display(HTML(tabulate.tabulate(table, headers=headers, tablefmt='html')))
        

# Output area both callbacks render into. It is created before the callbacks
# are wired up, because depending on the ipywidgets version interactive() may
# invoke a callback as soon as it is constructed.
out = Output()

cluster_type = widgets.Select(options=cluster_types, description="Тип", rows=10)
clusters_list = widgets.Select(options=[], description="Кластер", rows=10)

# Re-run the matching callback whenever the selection in a list changes.
i = widgets.interactive(select_cluster_type, cluster_type=cluster_type)
j = widgets.interactive(select_cluster, cluster_name=clusters_list)

# Both selectors side by side, with the summary table underneath.
VBox([HBox([i, j]), out])

In [None]:
# Encode the ground-truth category names as integer ids for the metrics below.
le = LabelEncoder()
y_true = le.fit_transform(cluster_df[column_name])
y_true

In [None]:
# k_means
# Expand the condensed distance vector into a square matrix so that
# silhouette_score can take it with metric="precomputed"; the later metric
# cells reuse dist_squarematrix.
dist_squarematrix = squareform(dist_matrix)
# Agreement of the KMeans labels with the ground-truth categories.
homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(y_true, kmeans_y)
# NOTE(review): the silhouette is measured on the cosine distances of
# dist_matrix, whereas KMeans was fitted on cluster_x directly — confirm the
# mismatch of distance measures is intended.
silhouette = silhouette_score(dist_squarematrix, kmeans_y, metric="precomputed")

In [None]:
homogeneity, completeness, v_measure, silhouette

In [None]:
# fcluster, criterion="distance"
# Same metrics as for KMeans; relies on dist_squarematrix from the k_means cell.
homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(y_true, hierarch_1)
silhouette = silhouette_score(dist_squarematrix, hierarch_1, metric="precomputed")

In [None]:
homogeneity, completeness, v_measure, silhouette

In [None]:
# fcluster, criterion="maxclust"
# Same metrics as for KMeans; relies on dist_squarematrix from the k_means cell.
homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(y_true, hierarch_2)
silhouette = silhouette_score(dist_squarematrix, hierarch_2, metric="precomputed")

In [None]:
homogeneity, completeness, v_measure, silhouette

In [None]:
range_n_clusters = [2, 5, 10, 20]
X = cluster_x
y = y_true

# One silhouette diagram per candidate number of KMeans clusters.
for n_clusters in range_n_clusters:
    # Fixed random_state so that repeated runs give the same partition.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # Mean silhouette over all samples, followed by the per-sample values
    # that make up the individual bands of the diagram.
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    fig, ax = plt.subplots()
    fig.set_size_inches(18, 7)

    # x axis: only the [-0.1, 1] part of the silhouette range is shown.
    # y axis: one row per sample plus a 10-row gap around every cluster band.
    ax.set_xlim([-0.1, 1])
    ax.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    y_lower = 10
    for i in range(n_clusters):
        # Silhouette values of the samples in cluster i, in ascending order.
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == i])
        size_cluster_i = len(ith_cluster_silhouette_values)
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Cluster number, written at mid-height of its band.
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Leave a 10-row gap before the next band.
        y_lower = y_upper + 10

    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")

    # Dashed red line marks the average silhouette of all samples.
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")

    # No y ticks: the bands are labelled by the text drawn above.
    ax.set_yticks([])
    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.suptitle(
        "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d" % n_clusters,
        fontsize=14,
        fontweight='bold',
    )

plt.show()

In [None]:
range_t = [1.5, 2, 2.5, 3, 3.5]
X = cluster_x
y = y_true

# One silhouette diagram per distance threshold used to cut the hierarchy Z.
for t in range_t:
    # Flat cluster ids obtained by cutting Z at threshold t.
    cluster_labels = fcluster(Z, t, criterion="distance")
    # fcluster numbers its clusters from 1, not 0, so iterate over the ids
    # that actually occur instead of assuming range(n_clusters).
    cluster_ids = np.unique(cluster_labels)
    n_clusters = len(cluster_ids)

    # silhouette_score needs between 2 and n_samples - 1 clusters; report and
    # skip thresholds that merge everything or isolate every sample.
    if n_clusters < 2 or n_clusters > len(X) - 1:
        print("For t =", t, "n_clusters =", n_clusters,
              "- silhouette is undefined, skipping")
        continue

    fig, ax = plt.subplots()
    fig.set_size_inches(18, 7)

    # x axis: only the [-0.1, 1] part of the silhouette range is shown.
    ax.set_xlim([-0.1, 1])
    # y axis: one row per sample plus a 10-row gap around every cluster band.
    ax.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # NOTE(review): the silhouette is computed on X with the default metric,
    # while Z was built from cosine distances — confirm this is intended.
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Silhouette value of every individual sample.
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for position, cluster_id in enumerate(cluster_ids):
        # Silhouette values of the samples in this cluster, ascending.
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == cluster_id])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(position) / n_clusters)
        ax.fill_betweenx(np.arange(y_lower, y_upper),
                         0, ith_cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.7)

        # Cluster id, written at mid-height of its band.
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(cluster_id))

        # Leave a 10-row gap before the next band.
        y_lower = y_upper + 10

    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")

    # Dashed red line marks the average silhouette of all samples.
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax.set_yticks([])  # Clear the yaxis labels / ticks
    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.suptitle(("Silhouette analysis for hierarchical clustering on sample data "
                  "(t = %s) with n_clusters = %d" % (t, n_clusters)),
                 fontsize=14, fontweight='bold')

plt.show()