In [None]:
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd
import cv2

from timeit import default_timer
from tqdm import tqdm
import os

# Directory that receives every exported figure and CSV table.
OUT_DIR = 'out'
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the creation, and the cell can be re-run safely.
os.makedirs(OUT_DIR, exist_ok=True)

def savefig(fname):
    """Write the current pyplot figure into OUT_DIR twice, then show it.

    The figure is saved once under the bare name ``fname`` (no explicit
    format is passed to matplotlib) and once as ``fname.svg``.

    Parameters
    ----------
    fname : str
        File name, relative to OUT_DIR, without the ``.svg`` suffix.
    """
    base_path = f'{OUT_DIR}/{fname}'
    for target in (base_path, f'{base_path}.svg'):
        plt.savefig(target)
    plt.show()

In [None]:
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score

def plot_elbow(data, out_fname, n_clusters, n_rounds=10, n_init=10):
    """Plot K-means quality indices and timing for 2..n_clusters clusters.

    For each cluster count, K-means is fitted ``n_rounds`` times and the
    Calinski-Harabasz and Davies-Bouldin indices are averaged over the rounds.
    Three charts are produced through ``savefig``: one per index and one with
    the mean wall-clock time per round (fitting plus scoring).

    Parameters
    ----------
    data : array-like
        Samples passed to ``KMeans.fit`` and to both score functions.
    out_fname : str
        Suffix appended to the three output file names.
    n_clusters : int
        Largest cluster count to evaluate (inclusive).
    n_rounds : int
        Number of independent K-means fits per cluster count.
    n_init : int
        Forwarded to ``KMeans(n_init=...)``.
    """
    cluster_counts = np.arange(2, n_clusters + 1)

    def _plot_and_save(values, ylabel, chart_name, title=None):
        # One chart per call; the timing chart is the only one without a title.
        plt.plot(cluster_counts, values, marker='o')
        if title is not None:
            plt.title(title)
        plt.xlabel('cluster amount')
        plt.ylabel(ylabel)
        savefig(f'{chart_name}_{out_fname}')

    ch_means, db_means, mean_times = [], [], []

    for k in tqdm(range(2, n_clusters + 1)):
        started = default_timer()
        ch_rounds, db_rounds = [], []

        for _ in range(n_rounds):
            model = KMeans(n_clusters=k, n_init=n_init).fit(data)
            ch_rounds.append(calinski_harabasz_score(data, model.labels_))
            db_rounds.append(davies_bouldin_score(data, model.labels_))

        ch_means.append(np.mean(ch_rounds))
        db_means.append(np.mean(db_rounds))
        mean_times.append((default_timer() - started) / n_rounds)

    _plot_and_save(ch_means, 'index value', 'elbow_chart_1',
                   title='Calinski-Harabasz Index (higher is better)')
    _plot_and_save(db_means, 'index value', 'elbow_chart_2',
                   title='Davies-Bouldin Index (lower is better)')
    _plot_and_save(mean_times, 'computation time [s]', 'elbow_times')


#### Dummy data

In [None]:
def read_2d_input(filename):
    """Convert the black pixels of an image into an array of 2D points.

    The image is read as grayscale; every pixel whose value is exactly 0
    becomes one ``(x, y)`` point. ``x`` is the column index and ``y`` is the
    row index counted from the bottom of the image, so the points appear in a
    scatter plot with the same orientation as the picture.

    Parameters
    ----------
    filename : str
        Path of the image to read.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_points, 2)`` in row-major pixel order;
        shape ``(0, 2)`` when the image has no black pixel.

    Raises
    ------
    FileNotFoundError
        If the image cannot be read.
    """
    img = cv2.imread(filename, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f'cannot read image: {filename}')

    # np.nonzero walks the mask in row-major order, like ravel() would.
    rows, cols = np.nonzero(img == 0)
    # Flip the vertical axis: row 0 (top of the image) gets the largest y.
    ys = img.shape[0] - 1 - rows
    return np.column_stack((cols, ys))


# Load the toy point cloud and take a quick look at it before clustering.
data_simple = read_2d_input('data_simple.png')
plt.scatter(data_simple[:, 0], data_simple[:, 1])
plt.show()

#### Dummy data clustering

In [None]:
# Elbow charts for the toy data, evaluating 2..20 clusters.
plot_elbow(data_simple, out_fname='simple', n_clusters=20);

In [None]:
# Compare the K-means partitions of the toy data for 10..13 clusters,
# one panel per cluster count on a 2x2 grid.
fig, ax = plt.subplots(2, 2)
for i, ax0 in enumerate(ax.flat):  # ax.flat iterates row by row
    cl = 10 + i
    kmeans = KMeans(n_clusters=cl, n_init=10)
    kmeans.fit(data_simple)
    cluster_labels = kmeans.labels_
    ax0.set_title(f'n_clusters = {cl}')
    # Pixel coordinates carry no meaning here, so hide the ticks.
    ax0.set_xticks([])
    ax0.set_yticks([])
    ax0.scatter(*data_simple.T, c=cluster_labels, cmap='Paired', s=5)
    ax0.scatter(*kmeans.cluster_centers_.T, marker='h', c='black', s=30)
# tight_layout only accounts for artists that exist when it is called, so it
# has to run after the titles are set, not right after subplots().
fig.tight_layout()
savefig('clusters_dummy_data')

### Loading dataset

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Raw table; kept around because later cells read its 'long_name' and
# 'overall' columns.
data = pd.read_csv('players_22.csv')

# Feature matrix for clustering, built inside-out:
#   1. keep only the integer/float columns,
#   2. replace NaNs by the column mean,
#   3. standardize every column.
data_cleaned = StandardScaler().fit_transform(
    SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(
        data.select_dtypes(
            include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        )
    )
)

In [None]:
# Elbow charts for the full dataset, evaluating 2..30 clusters
# (empty suffix: output files end in a bare underscore).
plot_elbow(data_cleaned, out_fname='', n_clusters=30);

In [None]:
from scipy.spatial.distance import cdist


def find_centers(model):
    """Name the sample lying closest to a center inside each cluster.

    For every row of the global ``data_cleaned`` the Euclidean distance to
    its nearest center in ``model.cluster_centers_`` is computed; within each
    group of ``model.labels_`` the row with the smallest such distance is
    selected.

    Parameters
    ----------
    model : object
        Fitted clustering model exposing ``cluster_centers_`` and ``labels_``
        (one label per row of ``data_cleaned``).

    Returns
    -------
    list
        ``data['long_name']`` of the selected row of each cluster, ordered by
        ascending cluster label.
    """
    nearest_dist = cdist(data_cleaned, model.cluster_centers_, 'euclidean').min(axis=1)
    # The Series keeps a default 0..n-1 index, so idxmin yields row positions.
    # The result stays one-dimensional even with a single cluster; the former
    # .squeeze() collapsed that case to a 0-d array that cannot be iterated.
    closest_rows = pd.Series(nearest_dist).groupby(np.asarray(model.labels_)).idxmin()
    # Positional lookup: rows of data_cleaned line up with rows of data.
    return data['long_name'].iloc[closest_rows.to_numpy()].tolist()

In [None]:
def clusters_overall(labels, fname):
    """Summarize the 'overall' column of the global ``data`` per cluster.

    Rows of ``data`` are grouped by ``labels``; ``describe()`` statistics of
    the 'overall' column are rounded to two decimals, the 'count' column is
    cast to int32, and the table is written to ``OUT_DIR/<fname>.csv`` with
    the index column named 'cluster'.

    Parameters
    ----------
    labels : array-like
        Cluster label of every row of ``data``.
    fname : str
        CSV file name without extension.

    Returns
    -------
    pandas.DataFrame
        The per-cluster summary table that was written to disk.
    """
    summary = (
        pd.DataFrame(data)
        .groupby(labels)['overall']
        .describe()
        .round(2)
        .astype({'count': 'int32'})
    )
    summary.to_csv(f'{OUT_DIR}/{fname}.csv', index_label='cluster')
    return summary

In [None]:
# Final K-means model on the standardized dataset: 11 clusters, best of 100
# random initializations.
kmeans = KMeans(n_clusters=11, n_init=100).fit(data_cleaned)

# Representative player of every cluster, then the per-cluster 'overall' table.
print(find_centers(kmeans))
print(clusters_overall(kmeans.labels_, 'overall_1'))

### Adaptive clustering

In [None]:
from scipy.stats import shapiro


def adaptive_clustering(data, n_init=10, min_cluster_size=5, alpha=0.05):
    """Cluster ``data`` by recursive two-way K-means splits.

    Every node is split in two with K-means. The samples are projected onto
    the line joining the two centers and the scalar projection coordinate is
    tested for normality with Shapiro-Wilk:

    * ``p > alpha`` (normality not rejected): the two halves become final
      clusters and recursion stops;
    * otherwise each half is split again recursively.

    A node becomes a single final cluster without further splitting when it
    has fewer than ``min_cluster_size`` samples (and never fewer than 3, the
    minimum Shapiro-Wilk accepts) or when K-means returns two identical
    centers.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Samples to cluster.
    n_init : int
        Forwarded to ``KMeans(n_init=...)``.
    min_cluster_size : int
        Nodes smaller than this are not split any further.
    alpha : float
        Significance level of the normality test.

    Returns
    -------
    list of int
        One label per sample, numbered from 0 in order of first appearance.
    """
    data = np.asarray(data)
    # Sentinel label; every sample is overwritten by the recursion below.
    data_labels = ['-2137'] * len(data)
    # shapiro() needs at least 3 observations, whatever the caller asked for.
    min_size = max(min_cluster_size, 3)

    def _mark_leaf(indexes, label):
        # Give all samples of a node the same final label.
        for idx in indexes:
            data_labels[idx] = label

    def _adaptive_clustering(cluster, indexes, label_prefix):
        if len(cluster) < min_size:
            _mark_leaf(indexes, f'{label_prefix}0')
            return

        kmeans = KMeans(n_clusters=2, n_init=n_init)
        kmeans.fit(cluster)
        split = np.asarray(kmeans.labels_)

        p1, p2 = kmeans.cluster_centers_[0], kmeans.cluster_centers_[1]
        direction = p2 - p1
        l2 = np.sum(direction ** 2)
        if l2 == 0:
            # Degenerate split (coincident centers): keep the node as one
            # cluster. Returning without labelling would leave these samples
            # on the sentinel and merge unrelated nodes into a single label.
            _mark_leaf(indexes, f'{label_prefix}0')
            return

        # Scalar coordinate of every sample along the center-to-center line
        # (0 at p1, 1 at p2). The test must see this 1-D coordinate: passing
        # the projected d-dimensional points makes shapiro() flatten them and
        # mix values from different axes.
        projection = (cluster - p1) @ direction / l2

        stat, p = shapiro(projection)
        if p > alpha:
            for idx, label in zip(indexes, split):
                data_labels[idx] = f'{label_prefix}{label}'
            return

        first_half = split == 0
        _adaptive_clustering(cluster[first_half], indexes[first_half], f'{label_prefix}0')
        _adaptive_clustering(cluster[~first_half], indexes[~first_half], f'{label_prefix}1')

    _adaptive_clustering(data, np.arange(len(data)), '')

    # Map the path-like string labels to consecutive integers, numbered in
    # order of first appearance.
    label_dict = {}
    for label in data_labels:
        label_dict.setdefault(label, len(label_dict))

    return [label_dict[label] for label in data_labels]


In [None]:
# Adaptive clustering of the toy data; both knobs are echoed in the plot title.
min_cluster_size = 55
alpha = 1e-7
labels = adaptive_clustering(
    data_simple,
    min_cluster_size=min_cluster_size,
    alpha=alpha,
)
# Number of clusters that were found.
print(np.unique(labels).size)
plt.scatter(data_simple[:, 0], data_simple[:, 1], c=labels, cmap='Paired')
plt.title(f'min_cluster_size={min_cluster_size}, alpha={alpha}')
savefig('adaptive_clustering')

In [None]:
# Adaptive clustering of the full standardized dataset.
labels = adaptive_clustering(
    data_cleaned,
    n_init=100,
    min_cluster_size=1000,
    alpha=1e-7,
)

In [None]:
# Per-cluster 'overall' statistics for the adaptive clustering labels.
print(clusters_overall(labels=labels, fname='overall_2'))

In [None]:
# Calinski-Harabasz and Davies-Bouldin indices, printed one per line.
# NOTE(review): these score `kmeans.labels_` (the K-means fit), not the
# adaptive-clustering `labels` computed just above — confirm this is intended.
score1 = calinski_harabasz_score(data_cleaned, kmeans.labels_)
score2 = davies_bouldin_score(data_cleaned, kmeans.labels_)
print(f'{score1:.2f}', f'{score2:.2f}', sep='\n')