In [None]:
import os
import numpy as np
import pandas as pd
from sklearn import cluster
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter("ignore", UserWarning)

# Collect the path of every file below the dataset directory.
# Files are sorted by name within each directory that os.walk visits.
csv_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('../input/clustering-exercises')
    for name in sorted(names)
]

# Utils

In [None]:
def invert_rgb(h):
    """Return the channel-wise inverse of a ``#RRGGBB`` hex colour.

    Each of the three hex channels is replaced by ``255 - value`` and the
    result is returned as an upper-case ``#RRGGBB`` string.
    """
    channels = (h[1:3], h[3:5], h[5:])
    inverted = [255 - int(channel, 16) for channel in channels]
    return '#' + ''.join(f'{value:02x}' for value in inverted).upper()

In [None]:
import matplotlib as mpl

def color_gradient(c1, c2, mix_rate=0.0):
    """Linearly blend two matplotlib colours and return the mix as a hex string.

    Args:
        c1: colour used when ``mix_rate`` is 0 (anything ``to_rgb`` accepts).
        c2: colour used when ``mix_rate`` is 1.
        mix_rate: weight of ``c2`` in the blend; ``c1`` gets ``1 - mix_rate``.

    Returns:
        The blended colour converted with ``mpl.colors.to_hex``.
    """
    start_rgb, end_rgb = (np.array(mpl.colors.to_rgb(color)) for color in (c1, c2))
    blended = (1 - mix_rate) * start_rgb + mix_rate * end_rgb
    return mpl.colors.to_hex(blended)

In [None]:
def create_n_colors(n):
    """Build ``n`` colours spread along a red -> blue -> green -> red cycle.

    Position ``i / n`` on the cycle selects one of three equal segments and
    the colour is interpolated within that segment via ``color_gradient``.
    Returns a list of ``n`` colours (empty when ``n`` is 0).
    """
    def color_at(ratio):
        # First third: red to blue.
        if ratio < 1 / 3:
            return color_gradient('red', 'blue', ratio * 3)
        # Second third: blue to green.
        if ratio < 2 / 3:
            return color_gradient('blue', 'green', (ratio - 1/3) * 3)
        # Last third: green back towards red.
        return color_gradient('green', 'red', (ratio - 2/3) * 3)

    return [color_at(index / n) for index in range(n)]

In [None]:
def plot_df(df, colors, **kwargs):
    """Scatter-plot the ``x``/``y`` columns of ``df`` with one colour per label.

    Args:
        df: DataFrame providing the ``x`` and ``y`` columns.
        colors: iterable with one label per row of ``df``. Labels only need to
            be hashable and mutually sortable; they do not have to be the
            contiguous integers ``0..k-1``.
        **kwargs: forwarded to ``sns.scatterplot`` (for example ``ax`` or ``s``).

    Returns:
        Whatever ``sns.scatterplot`` returns.
    """
    labels = list(colors)
    # Assign colours by rank of the distinct labels rather than using the
    # label value as a list index: indexing breaks (IndexError/TypeError) for
    # labels that are non-contiguous, 1-based, or not integers. For labels
    # that are exactly 0..k-1 the resulting colours are unchanged.
    distinct = sorted(set(labels))
    palette = dict(zip(distinct, create_n_colors(len(distinct))))
    point_colors = [palette[label] for label in labels]
    return sns.scatterplot(data=df, x='x', y='y', c=point_colors, **kwargs)

In [None]:
def sec(start_time, end_time):
    """Format the time elapsed between two datetimes as a seconds string.

    Args:
        start_time: datetime at which the measurement started.
        end_time: datetime at which the measurement ended.

    Returns:
        The elapsed seconds as text with two decimals (e.g. ``'3.07'``). A
        non-zero duration too short to show at two decimals is returned with
        microsecond precision instead (e.g. ``'0.002500'``), so the value only
        parses to zero when the duration really is zero.
    """
    # total_seconds() includes whole days and keeps the microseconds correctly
    # scaled; slicing str(microseconds) turned 0.05 s into ".50".
    total = (end_time - start_time).total_seconds()
    text = f'{total:.2f}'
    if total != 0 and float(text) == 0:
        text = f'{total:.6f}'
    return text

In [None]:
import math
from datetime import datetime

def benchmark(files, suptitle, callback, figsize=(5, 5), **kwargs):
    """Run ``callback`` on every CSV in ``files`` and draw the results on one grid.

    Args:
        files: paths of the CSV files to process; each gets one subplot.
        suptitle: figure title, also used as the prefix of the printed summary.
        callback: called as ``callback(dataframe, ax=axis)``. It must return
            ``(items, tm)`` where ``items`` is the number of processed rows and
            ``tm`` is either ``None`` (the call is timed here) or a
            ``(start, end)`` pair of datetimes measured by the callback.
        figsize: size passed to ``plt.subplots``.
        **kwargs: forwarded to ``Figure.subplots_adjust`` (``wspace``, ``top`` ...).

    Prints the mean throughput (items per second) over all files and shows
    the figure. Reading the CSV is not part of the measured time.
    """
    COLS = 6
    # At least one row so an empty file list still yields a valid figure;
    # squeeze=False keeps `axes` two-dimensional even when there is one row.
    ROWS = max(1, math.ceil(len(files) / COLS))
    fig, axes = plt.subplots(ROWS, COLS, figsize=figsize, squeeze=False)
    fig.suptitle(suptitle, fontsize=20)
    fig.tight_layout()
    fig.subplots_adjust(**kwargs)
    throughputs = []
    for i, path in enumerate(files):
        ax = axes[i // COLS][i % COLS]
        # Load before starting the clock so file I/O does not skew the timing.
        data = pd.read_csv(path)
        ts = datetime.now()
        items, tm = callback(data, ax=ax)
        if tm is None:
            tm = sec(ts, datetime.now())
        else:
            tm = sec(tm[0], tm[1])
        title = ax.get_title()
        ax.set_title(title+'\n'+f'{items} items ({tm}sec)')
        elapsed = float(tm)
        # A duration that formats to zero would raise ZeroDivisionError;
        # leave such runs out of the average instead.
        if elapsed > 0:
            throughputs.append(items / elapsed)
    average = np.mean(throughputs) if throughputs else float('nan')
    print(suptitle, 'process average {:.2f} item(s)/sec'.format(average))
    for ax in axes.flat:
        ax.axis('off')
    fig.show()

# Preview a sample

In [None]:
# Load the first collected file as the sample dataset for the preview cell.
df = pd.read_csv(csv_files[0])

In [None]:
# Fit KMeans on the sample for several cluster counts, one panel per setting.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
for panel, n_cluster in zip(axes.flat, (2, 3, 4, 5)):
    kmeans = cluster.KMeans(n_clusters=n_cluster).fit(df[['x', 'y']])
    panel.set_title(f'KMeans(n_clusters={n_cluster})')
    plot_df(df, kmeans.labels_, ax=panel)
fig.show()

# Labels on Dataset

In [None]:
def label_answer(df, ax):
    """Plot a dataset coloured by its own ``color`` column.

    The subplot title reports how many distinct values that column holds.
    Returns ``(len(df), None)``; the ``None`` leaves the timing to ``benchmark``.
    """
    truth = df['color']
    plot_df(df, truth, ax=ax, s=5)
    ax.set_title(f'cluster: {len(set(truth))}')
    return len(df), None

benchmark(csv_files, 'Answer Labels', label_answer, figsize=(14, 16), wspace=0.2, hspace=0.4, top=0.9)

# AffinityPropagation

AffinityPropagation takes too long to run on these datasets, so it has been disabled in this kernel; the code is kept below, commented out, for reference.

In [None]:
# %%time

# def affinity(df, ax):
# #     df = df.sample(frac=0.5)
#     answer = len(df['color'].unique())
#     labels = cluster.AffinityPropagation().fit_predict(df[['x', 'y']])
#     label_count = len(set(labels))
#     plot_df(df, labels, ax=ax, s=5)
#     info = f'cluster: {label_count}'
#     ax.set_title(info)
#     return len(df), None

# benchmark(csv_files, 'Affinity Propagation', affinity, figsize=(14, 16), wspace=0.2, hspace=0.4, top=0.9)

# K-Means

In [None]:
%%time

def kmeans(df, ax):
    """Cluster ``x``/``y`` with KMeans and plot the result on ``ax``.

    The number of clusters is the number of distinct values in ``df['color']``.
    Returns ``(len(df), None)``; the ``None`` leaves the timing to ``benchmark``.
    """
    n_clusters = len(df['color'].unique())
    model = cluster.KMeans(n_clusters=n_clusters)
    labels = model.fit_predict(df[['x', 'y']])
    plot_df(df, labels, ax=ax, s=5)
    ax.set_title(f'cluster: {len(set(labels))}')
    return len(df), None

benchmark(csv_files, 'K Means', kmeans, figsize=(14, 16), wspace=0.2, hspace=0.4, top=0.9)

# DBSCAN (Density-Based Spatial Clustering of Applications with Noise)

In [None]:
%%time

def dbscan(df, ax):
    """Cluster ``x``/``y`` with DBSCAN, bisecting ``eps`` towards the true cluster count.

    The target is the number of distinct values in ``df['color']``. ``eps`` is
    bisected between 1e-6 and 1e2: too few clusters shrinks the upper bound,
    otherwise the lower bound grows. The fit whose cluster count is closest to
    the target is the one plotted and reported.

    Returns:
        ``(len(df), (start, end))`` where ``start``/``end`` are the datetimes
        around the single ``fit_predict`` call of the reported fit.
    """
    answer = len(df['color'].unique())
    features = df[['x', 'y']]
    lo, hi = 1e-6, 1e2
    best = None  # (gap to answer, eps, labels, cluster count, start, end)
    while hi - lo > 1e-6:
        eps = (lo + hi) / 2
        start = datetime.now()
        labels = cluster.DBSCAN(eps=eps, min_samples=2, leaf_size=30).fit_predict(features)
        end = datetime.now()
        # DBSCAN marks noise points with -1; that is not a cluster, so it must
        # not be counted when comparing against the expected cluster count.
        label_count = len(set(labels) - {-1})
        gap = abs(label_count - answer)
        # Keep the closest fit seen so far (ties favour the later, more
        # refined eps) instead of blindly using the last bisection step.
        if best is None or gap <= best[0]:
            best = (gap, eps, labels, label_count, start, end)
        if label_count < answer:
            hi = eps - 1e-6
        else:
            lo = eps + 1e-6
    _, eps, labels, label_count, start, end = best
    plot_df(df, labels, ax=ax, s=5)
    info = f'cluster: {label_count}\n' + \
           'eps: {:.1f}'.format(eps)
    ax.set_title(info)
    return len(df), (start, end)


benchmark(csv_files, 'DBSCAN', dbscan, figsize=(14, 16), wspace=0.2, hspace=0.5, top=0.9)

# Mean Shift

In [None]:
%%time

def meanshift(df, ax):
    """Cluster ``x``/``y`` with MeanShift and plot the result on ``ax``.

    The bandwidth is the mean of ``estimate_bandwidth`` evaluated separately
    on the points of each distinct ``color`` value.
    Returns ``(len(df), None)``; the ``None`` leaves the timing to ``benchmark``.
    """
    true_colors = df['color'].unique()
    per_group_bandwidth = [
        cluster.estimate_bandwidth(df.loc[df.color == color, ['x', 'y']])
        for color in true_colors
    ]
    bw = np.array(per_group_bandwidth).mean()
    labels = cluster.MeanShift(bandwidth=bw).fit_predict(df[['x', 'y']])
    plot_df(df, labels, ax=ax, s=5)
    ax.set_title(f'cluster: {len(set(labels))}\nbandwidth={bw:.1f}')
    return len(df), None

benchmark(csv_files, 'Mean Shift', meanshift, figsize=(14, 16), wspace=0.2, hspace=0.5, top=0.9)

# Spectral Clustering

In [None]:
%%time

def spectral(df, ax):
    """Cluster ``x``/``y`` with SpectralClustering and plot the result on ``ax``.

    Uses a nearest-neighbours affinity and as many clusters as there are
    distinct values in ``df['color']``; the title shows both the found and
    the expected count.
    Returns ``(len(df), None)``; the ``None`` leaves the timing to ``benchmark``.
    """
    answer = len(df['color'].unique())
    model = cluster.SpectralClustering(n_clusters=answer, affinity='nearest_neighbors', n_init=10)
    labels = model.fit_predict(df[['x', 'y']])
    plot_df(df, labels, ax=ax, s=5)
    ax.set_title(f'cluster: {len(set(labels))}\nanswer: {answer}')
    return len(df), None

# NOTE(review): the figure title says "discretize", but `assign_labels` is not
# passed above, so the estimator's default label assignment is used - confirm
# which of the two was intended.
benchmark(csv_files, 'Spectral Clustering (discretize)', spectral, figsize=(14, 16), wspace=0.2, hspace=0.3, top=0.9)

# Gaussian Mixture

In [None]:
%%time

from sklearn import mixture

def gaumix(df, ax):
    """Cluster ``x``/``y`` with a Gaussian mixture model and plot the result on ``ax``.

    One full-covariance component is fitted per distinct value in ``df['color']``.
    Returns ``(len(df), None)``; the ``None`` leaves the timing to ``benchmark``.
    """
    n_components = len(df['color'].unique())
    model = mixture.GaussianMixture(
        n_components=n_components,
        max_iter=300,
        covariance_type='full',
    )
    labels = model.fit_predict(df[['x', 'y']])
    plot_df(df, labels, ax=ax, s=5)
    ax.set_title(f'cluster: {len(set(labels))}')
    return len(df), None

benchmark(csv_files, 'Gaussian Mixture', gaumix, figsize=(14, 16), wspace=0.2, hspace=0.4, top=0.9)

# Agglomerative Clustering

In [None]:
%%time

def agglomerative(df, ax):
    """Cluster ``x``/``y`` with ward-linkage agglomerative clustering and plot on ``ax``.

    The number of clusters is the number of distinct values in ``df['color']``,
    matching how the other benchmarks in this notebook pick their cluster count.
    Returns ``(len(df), None)``; the ``None`` leaves the timing to ``benchmark``.
    """
    answer = len(df['color'].unique())
    # n_clusters was hard-coded to 5, ignoring the per-dataset count computed
    # above. The explicit affinity='euclidean' is dropped: euclidean is already
    # the default distance and the keyword is deprecated in recent scikit-learn.
    labels = cluster.AgglomerativeClustering(compute_full_tree='auto', \
                                             linkage='ward', n_clusters=answer) \
                    .fit_predict(df[['x', 'y']])
    label_count = len(set(labels))
    plot_df(df, labels, ax=ax, s=5)
    info = f'cluster: {label_count}'
    ax.set_title(info)
    return len(df), None

benchmark(csv_files, 'Agglomerative Clustering', agglomerative, figsize=(14, 16), wspace=0.2, hspace=0.4, top=0.9)

# OPTICS

In [None]:
%%time

def optics(df, ax):
    """Cluster ``x``/``y`` with OPTICS and plot clusters and noise on ``ax``.

    Clustered points are drawn coloured by label through ``plot_df``; points
    labelled -1 (noise) are drawn as faint black crosses. The title reports
    the number of clusters found, noise excluded. The input frame is not
    modified.
    Returns ``(len(df), None)``; the ``None`` leaves the timing to ``benchmark``.
    """
    clust = cluster.OPTICS(min_samples=20, xi=0.05, metric='euclidean') \
                   .fit(df[['x', 'y']])
    # labels_ is already aligned with the rows of df. Re-indexing it with
    # ordering_ (cluster order) before attaching it to the rows assigned
    # labels to the wrong points.
    labels = clust.labels_
    # -1 marks noise and is not a cluster.
    label_count = len(set(labels) - {-1})
    # assign() returns a new frame, so the caller's DataFrame gains no column.
    labelled = df[['x', 'y']].assign(label=labels)
    is_noise = labelled['label'] == -1
    df_found = labelled.loc[~is_noise]
    df_notfound = labelled.loc[is_noise]
    # Skip the coloured scatter when every point was classified as noise.
    if not df_found.empty:
        plot_df(df_found, df_found['label'], ax=ax, s=5)
    ax.plot(df_notfound['x'], df_notfound['y'], 'k+', alpha=0.1, markersize=3)
    info = f'cluster: {label_count}'
    ax.set_title(info)
    return len(df), None

benchmark(csv_files, 'OPTICS', optics, figsize=(14, 16), wspace=0.2, hspace=0.5, top=0.9)

# BIRCH

In [None]:
%%time

def birch(df, ax):
    """Cluster ``x``/``y`` with BIRCH and plot the result on ``ax``.

    The number of clusters is the number of distinct values in ``df['color']``.
    Returns ``(len(df), None)``; the ``None`` leaves the timing to ``benchmark``.
    """
    n_clusters = len(df['color'].unique())
    model = cluster.Birch(branching_factor=200, threshold=1, n_clusters=n_clusters)
    labels = model.fit_predict(df[['x', 'y']])
    plot_df(df, labels, ax=ax, s=5)
    ax.set_title(f'cluster: {len(set(labels))}')
    return len(df), None

benchmark(csv_files, 'BIRCH', birch, figsize=(14, 16), wspace=0.2, hspace=0.3, top=0.9)