In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster import hierarchy
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [None]:
# Render matplotlib figures inline in the notebook output area.
%matplotlib inline


In [None]:
# Global plotting defaults: seaborn's default theme plus a 9x5 figure size.
sns.set_theme()
plt.rcParams.update({'figure.figsize': (9, 5)})

In [None]:
# Read the semicolon-separated beverage file, discard the 'numb.obs' column
# and rename SEVENUP to 7UP, then preview the first rows.
df: pd.DataFrame = (
    pd.read_csv('data/beverage_r.csv', sep=';')
    .drop(columns=['numb.obs'])
    .rename(columns={'SEVENUP': '7UP'})
)
df.head()

In [None]:
# Long format (one row per drink/answer pair) so that all drink columns can
# be stacked in a single histogram.
answers_long = df.melt(var_name='drink', value_name='no/yes')
sns.histplot(data=answers_long, x='no/yes', hue='drink', multiple='stack');

In [None]:
# Agglomerative clustering of the rows: Ward linkage on Euclidean distances.
link = hierarchy.linkage(df, method='ward', metric='euclidean')

In [None]:
# Horizontal dendrogram of the linkage; leaves are labelled with the row index.
_, ax = plt.subplots(figsize=(9, 9))
dn = hierarchy.dendrogram(link, orientation='right', labels=df.index, ax=ax)

ax.set_title('Hierarchical Clustering Dendrogram')
ax.set_ylabel('index')
ax.set_xlabel('distance (Ward)');

In [None]:
# Merge heights (third column of the linkage matrix) in reverse order:
# position k (0-based) is the merge that leaves k + 1 clusters, so the
# plotted step number equals the number of clusters left after that merge.
dist_rev = link[:, 2][::-1]

n_marked = 3  # cluster count highlighted on the plot
# Height of the merge that leaves `n_marked` clusters. The index is
# n_marked - 1; the previous dist_rev[3] was the 4-cluster level although
# it was labelled '3 clusters'.
cut_height = dist_rev[n_marked - 1]
# Small vertical offset so that the caption sits just above the dashed line.
label_offset = (dist_rev.max() - dist_rev.min()) / len(dist_rev)

plt.plot(np.arange(1, len(dist_rev) + 1), dist_rev, marker='o')
plt.title('Distance between merged clusters')
plt.xlabel('step')
plt.ylabel('distance (Ward)')
plt.axhline(cut_height, c='g', linestyle='dashed')
plt.text(
    len(link) - 3,
    cut_height + label_offset,
    f'{n_marked} clusters'
);

In [None]:
# Cut the tree into at most 3 flat clusters and store each row's label.
df['hierarchy_cluster'] = hierarchy.fcluster(link, t=3, criterion='maxclust')

In [None]:
# Number of rows that fell into each hierarchical cluster.
df.groupby(by='hierarchy_cluster').size()

In [None]:
# Column sums per hierarchical cluster, drawn as one bar chart per cluster.
stat = df.groupby('hierarchy_cluster').sum()
palette = sns.color_palette()

fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(10, 12))
fig.subplots_adjust(hspace=0.5)
# Cluster labels are looked up starting from 1, hence start=1.
for cluster_id, panel in enumerate(axs, start=1):
    panel.set_title(f'Cluster {cluster_id}', size=18)
    stat.loc[cluster_id].plot.bar(ax=panel, rot=30, color=palette[cluster_id])
    panel.yaxis.set_visible(False)

In [None]:
# Feature matrix for k-means: every column except the hierarchical labels.
X = df.drop(columns='hierarchy_cluster')

In [None]:
# Elbow method: within-cluster sum of squares (inertia) for k = 1..10.
n_clusters = np.arange(1, 11, dtype=int)
wcss = [
    KMeans(n_clusters=k, random_state=0).fit(X).inertia_
    for k in n_clusters
]

sns.lineplot(x=n_clusters, y=wcss)
# Highlight the third candidate, i.e. k = 3.
marked_k = n_clusters[2]
plt.axvline(marked_k, c='g', linestyle='dashed')
plt.text(marked_k + 0.3, np.max(wcss), '3 clusters');

In [None]:
# Fit k-means with 3 clusters and keep the per-row cluster labels.
k_means = KMeans(n_clusters=3, random_state=0).fit(X)
y_means = k_means.labels_

In [None]:
# Attach the k-means labels to the frame as a new column.
df = df.assign(kmeans_cluster=y_means)

In [None]:
# Number of rows in each k-means cluster.
df.groupby(by='kmeans_cluster').size()

In [None]:
# Column sums per k-means cluster (hierarchical labels excluded), drawn as
# one bar chart per cluster.
features_and_labels = df.drop(columns='hierarchy_cluster')
stat = features_and_labels.groupby('kmeans_cluster').sum()
palette = sns.color_palette()

fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(10, 12))
fig.subplots_adjust(hspace=0.5)
for cluster_id, panel in enumerate(axs):
    panel.set_title(f'Cluster {cluster_id}', size=18)
    stat.loc[cluster_id].plot.bar(ax=panel, rot=30, color=palette[cluster_id])
    panel.yaxis.set_visible(False)

In [None]:
# Candidate cluster counts: 2, 3, 4 and 5.
param_grid = [{'n_clusters': np.arange(2, 6, dtype=int)}]

# Clustering quality metrics wrapped as scorers.
scoring = {
    'silhouette_score': metrics.make_scorer(metrics.silhouette_score),
    'calinski_harabasz_score': metrics.make_scorer(metrics.calinski_harabasz_score),
}

search = GridSearchCV(
    KMeans(random_state=0),
    param_grid,
    scoring=scoring,
    refit='silhouette_score',
    # A single "split" whose train and test parts are both the full data,
    # i.e. cross-validation is switched off.
    cv=[(slice(None), slice(None))],
    n_jobs=4,
)

In [None]:
# X is passed as the target as well — presumably so the scorers receive the
# feature matrix alongside the predicted labels (NOTE(review): confirm
# against make_scorer's calling convention).
search.fit(X, y=X);

In [None]:
# Column names of the two quality metrics inside cv_results_.
score_1, score_2 = (
    'mean_test_calinski_harabasz_score',
    'mean_test_silhouette_score',
)

# Compact report: one row per candidate n_clusters with scores and timings.
report_columns = ['param_n_clusters', score_1, score_2, 'mean_fit_time', 'mean_score_time']
report = pd.DataFrame(search.cv_results_).loc[:, report_columns]
report

In [None]:
# Refit k-means with 2 clusters and keep the per-row cluster labels.
k_means = KMeans(n_clusters=2, random_state=0).fit(X)
y_means = k_means.labels_

In [None]:
# Attach the 2-cluster k-means labels to the frame as a new column.
df = df.assign(kmeans_optimum_cluster=y_means)

In [None]:
# Number of rows in each of the two k-means clusters.
df.groupby(by='kmeans_optimum_cluster').size()

In [None]:
# Column sums per cluster of the 2-cluster solution (earlier label columns
# excluded), drawn as one bar chart per cluster.
features_and_labels = df.drop(columns=['hierarchy_cluster', 'kmeans_cluster'])
stat = features_and_labels.groupby('kmeans_optimum_cluster').sum()
palette = sns.color_palette()

fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))
fig.subplots_adjust(hspace=0.5)
for cluster_id, panel in enumerate(axs):
    panel.set_title(f'Cluster {cluster_id}', size=18)
    stat.loc[cluster_id].plot.bar(ax=panel, rot=30, color=palette[cluster_id])
    panel.yaxis.set_visible(False)

In [None]:
# Load the tab-separated assessment file and discard the 'NR' column.
df: pd.DataFrame = pd.read_csv('data/assess.dat', sep='\t')
df = df.drop(columns=['NR'])
# Feature matrix: columns T1..T10. An explicit copy is taken because X gets
# extra columns assigned later on; without it pandas may treat those writes
# as writes to a slice of df and emit SettingWithCopyWarning.
X = df[[f'T{i}' for i in range(1, 11)]].copy()
X.head()

In [None]:
# Display captions for columns T1..T10, listed in column order; used to
# relabel the per-cluster bar charts.
test_captions = [
    'память на числа',
    'умение решать математические задачи',
    'находчивость на прямом диалоге',
    'умение составлять алгоритмы',
    'уверенность во время выступления',
    'командный дух',
    'находчивость',
    'сотрудничество',
    'признание в коллективе',
    'сила убеждения',
]
cols_map = {
    f'T{number}': caption
    for number, caption in enumerate(test_captions, start=1)
}

In [None]:
# Agglomerative clustering of the rows: Ward linkage on Euclidean distances.
link = hierarchy.linkage(X, method='ward', metric='euclidean')

In [None]:
# Horizontal dendrogram of the linkage; leaves are labelled with the row index.
_, ax = plt.subplots(figsize=(9, 9))
dn = hierarchy.dendrogram(link, orientation='right', labels=X.index, ax=ax)

ax.set_title('Hierarchical Clustering Dendrogram')
ax.set_ylabel('index')
ax.set_xlabel('distance (Ward)');

In [None]:
# Merge heights (third column of the linkage matrix) in reverse order, so
# the last merge comes first.
dist_rev = link[:, 2][::-1]
steps = np.arange(1, len(dist_rev) + 1)

plt.plot(steps, dist_rev, marker='o')
plt.title('Distance between merged clusters')
plt.xlabel('step')
plt.ylabel('distance (Ward)')

# Two candidate cut levels, each with a caption slightly above its line.
caption_x = len(link) - 3
caption_dy = (dist_rev.max() - dist_rev.min()) / len(dist_rev)
for position, colour, caption in ((2, 'orange', '3 clusters'), (3, 'g', '4 clusters')):
    plt.axhline(dist_rev[position], c=colour, linestyle='dashed')
    plt.text(caption_x, dist_rev[position] + caption_dy, caption)

In [None]:
# Cut the tree into at most 3 flat clusters and store each row's label.
X['hierarchy_cluster_3'] = hierarchy.fcluster(link, t=3, criterion='maxclust')

In [None]:
# Number of rows that fell into each hierarchical cluster.
X.groupby(by='hierarchy_cluster_3').size()

In [None]:
# Column sums per hierarchical cluster, with T-columns relabelled through
# cols_map, drawn as one bar chart per cluster.
cluster_sums = X.groupby('hierarchy_cluster_3').sum()
stat = cluster_sums.rename(columns=cols_map)
palette = sns.color_palette()

fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(10, 18))
fig.subplots_adjust(hspace=0.7)
# Cluster labels are looked up starting from 1, hence start=1.
for cluster_id, panel in enumerate(axs, start=1):
    panel.set_title(f'Cluster {cluster_id}', size=18)
    stat.loc[cluster_id].plot.bar(ax=panel, rot=30, color=palette[cluster_id])
    panel.yaxis.set_visible(False)

In [None]:
# Remove the hierarchical labels so that X holds only the feature columns again.
X = X.drop(columns='hierarchy_cluster_3')

In [None]:
# Elbow method: within-cluster sum of squares (inertia) for k = 1..10.
n_clusters = np.arange(1, 11, dtype=int)
wcss = [
    KMeans(n_clusters=k, random_state=0).fit(X).inertia_
    for k in n_clusters
]

sns.lineplot(x=n_clusters, y=wcss)
# Highlight the fourth candidate, i.e. k = 4.
marked_k = n_clusters[3]
plt.axvline(marked_k, c='g', linestyle='dashed')
plt.text(marked_k + 0.3, np.max(wcss), '4 clusters');

In [None]:
# Fit k-means with 4 clusters and keep the per-row cluster labels.
k_means = KMeans(n_clusters=4, random_state=0).fit(X)
y_means = k_means.labels_

In [None]:
# Attach the k-means labels to the feature frame as a new column.
X = X.assign(kmeans_cluster_4=y_means)

In [None]:
# Number of rows in each k-means cluster.
X.groupby(by='kmeans_cluster_4').size()

In [None]:
# Column sums per k-means cluster, with T-columns relabelled through
# cols_map, drawn as one bar chart per cluster.
cluster_sums = X.groupby('kmeans_cluster_4').sum()
stat = cluster_sums.rename(columns=cols_map)
palette = sns.color_palette()

fig, axs = plt.subplots(nrows=4, ncols=1, figsize=(10, 18))
fig.subplots_adjust(hspace=0.7)
for cluster_id, panel in enumerate(axs):
    panel.set_title(f'Cluster {cluster_id}', size=18)
    stat.loc[cluster_id].plot.bar(ax=panel, rot=30, color=palette[cluster_id])
    panel.yaxis.set_visible(False)

In [None]:
# Only the two candidate cluster counts are compared here.
param_grid = [{'n_clusters': [3, 4]}]

# Clustering quality metrics wrapped as scorers.
scoring = {
    'silhouette_score': metrics.make_scorer(metrics.silhouette_score),
    'calinski_harabasz_score': metrics.make_scorer(metrics.calinski_harabasz_score),
}

search = GridSearchCV(
    KMeans(random_state=0),
    param_grid,
    scoring=scoring,
    refit='silhouette_score',
    # A single "split" whose train and test parts are both the full data,
    # i.e. cross-validation is switched off.
    cv=[(slice(None), slice(None))],
    n_jobs=4,
)

In [None]:
# Remove the k-means labels so that the grid search sees only feature columns.
X = X.drop(columns='kmeans_cluster_4')

In [None]:
# X is passed as the target as well — presumably so the scorers receive the
# feature matrix alongside the predicted labels (NOTE(review): confirm
# against make_scorer's calling convention).
search.fit(X, y=X);

In [None]:
# Column names of the two quality metrics inside cv_results_.
score_1, score_2 = (
    'mean_test_calinski_harabasz_score',
    'mean_test_silhouette_score',
)

In [None]:
# Compact report: one row per candidate n_clusters with scores and timings.
report_columns = ['param_n_clusters', score_1, score_2, 'mean_fit_time', 'mean_score_time']
report = pd.DataFrame(search.cv_results_).loc[:, report_columns]
report