In [None]:
%pip install rich

In [1]:
import math
import pandas as pd
from sklearn.cluster import KMeans, AffinityPropagation, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from rich.jupyter import print
from rich.table import Table
from rich.console import Console

# Shared rich console; the results table is rendered through it further down.
console = Console()

In [2]:
def euclidean_dist(x, y, attrs):
    """Return the Euclidean distance between two records.

    Args:
        x: First record, indexable by every name in ``attrs``.
        y: Second record, indexable by every name in ``attrs``.
        attrs: Iterable of the attribute names that take part in the distance.

    Returns:
        The square root of the summed squared per-attribute differences.
    """
    squared_diffs = (math.pow(x[name] - y[name], 2) for name in attrs)
    return math.sqrt(sum(squared_diffs))


def build_clusters(sklearn_labels, n_clusters):
    """Group sample positions by the cluster label assigned to them.

    Args:
        sklearn_labels: Sequence with one integer cluster label per sample.
        n_clusters: Number of cluster buckets to create.

    Returns:
        A list of ``n_clusters`` lists; the list at index ``k`` holds the
        positions (0-based) of the samples whose label is ``k``.
    """
    members_by_label = [[] for _ in range(n_clusters)]
    for position, label in enumerate(sklearn_labels):
        members_by_label[label].append(position)
    return members_by_label


def silhouette(clusters, df, dist):
    """Compute the silhouette score of a clustering.

    For each point the coefficient is ``s = (b - a) / max(a, b)`` where ``a``
    is the mean distance to the *other* members of the point's own cluster and
    ``b`` is the smallest mean distance to the members of any other cluster.

    Args:
        clusters: List of clusters; each cluster is a list of index labels
            that can be looked up with ``df.loc``.
        df: DataFrame with one row per point. Every column is passed to
            ``dist`` as an attribute.
        dist: Callable ``dist(row_a, row_b, attrs)`` returning the distance
            between two rows of ``df``.

    Returns:
        The sum of the per-point coefficients divided by ``len(df)``.

    Raises:
        ValueError: If a non-empty cluster has no other non-empty cluster to
            be compared against.
    """
    attrs = df.columns.values
    # Look every row up once; repeating df.loc for each pair is needlessly slow.
    rows = {point: df.loc[point] for members in clusters for point in members}

    def mean_distance(row, others):
        return sum(dist(row, rows[other], attrs) for other in others) / len(others)

    total = 0.0
    for cluster_pos, members in enumerate(clusters):
        # Empty clusters carry no information and would divide by zero.
        rivals = [
            other
            for other_pos, other in enumerate(clusters)
            if other_pos != cluster_pos and other
        ]
        if members and not rivals:
            raise ValueError("silhouette needs at least two non-empty clusters")

        for member_pos, point in enumerate(members):
            siblings = members[:member_pos] + members[member_pos + 1:]
            if not siblings:
                # By convention a point alone in its cluster scores 0.
                continue

            row = rows[point]
            # a: mean over the other members only, hence len(members) - 1 terms.
            point_a = mean_distance(row, siblings)
            # b: mean distance to the nearest neighbouring cluster.
            point_b = min(mean_distance(row, rival) for rival in rivals)

            denominator = max(point_a, point_b)
            # All relevant points coincide: treat the coefficient as 0.
            if denominator > 0:
                total += (point_b - point_a) / denominator

    return total / len(df)

In [3]:
# Experiments on the glass dataset: KMeans plus one agglomerative run per
# linkage strategy, all with 7 clusters.
glass_executions = [
    {
        "dataset": "glass.csv",
        "algorithm": KMeans,
        "params": {"n_clusters": 7},
    },
    *[
        {
            "dataset": "glass.csv",
            "algorithm": AgglomerativeClustering,
            "params": {"n_clusters": 7, "linkage": linkage},
        }
        for linkage in ("single", "average", "complete", "ward")
    ],
    # TODO
    # {
    #     "dataset": "glass.csv",
    #     "algorithm": AffinityPropagation,
    #     "params": {},
    # },
]

# Experiments on the user-knowledge dataset: KMeans plus one agglomerative run
# per linkage strategy, all with 5 clusters.
user_knowledge_executions = [
    {
        # Was "glass.csv" (copy-paste slip); every entry here targets user-knowledge.
        "dataset": "user-knowledge.csv",
        "algorithm": KMeans,
        "params": {"n_clusters": 5},
    },
    {
        "dataset": "user-knowledge.csv",
        "algorithm": AgglomerativeClustering,
        "params": {"n_clusters": 5, "linkage": "single"}
    },
    {
        "dataset": "user-knowledge.csv",
        "algorithm": AgglomerativeClustering,
        "params": {"n_clusters": 5, "linkage": "average"}
    },
    {
        "dataset": "user-knowledge.csv",
        "algorithm": AgglomerativeClustering,
        "params": {"n_clusters": 5, "linkage": "complete"}
    },
    {
        "dataset": "user-knowledge.csv",
        "algorithm": AgglomerativeClustering,
        "params": {"n_clusters": 5, "linkage": "ward"}
    },
]

# Only the glass experiments run by default; switch to
# `glass_executions + user_knowledge_executions` to evaluate both datasets.
executions = glass_executions

table = Table()
table.add_column("Algorithm", style="bold", no_wrap=True)
table.add_column("Dataset", style="bold")
table.add_column("Parameters", style="bold", no_wrap=True)
table.add_column("Silhouette score (scikit-learn)", style="bold")
table.add_column("Silhouette score", style="bold")

for execution in executions:
    data = pd.read_csv(execution["dataset"])
    x = data.copy()
    # Remove the ground-truth label so only the features are clustered.
    y = x.pop("class")

    params = execution["params"]
    algorithm = execution["algorithm"]
    model = algorithm(**params)
    # Fit exactly once and score that single labelling with both
    # implementations; fitting twice may yield different labels for
    # randomly initialised algorithms such as KMeans.
    labels = model.fit_predict(x)
    # Size the cluster list from the labels actually produced, so algorithms
    # without an `n_clusters` parameter are handled as well.
    n_clusters = int(labels.max()) + 1
    clusters = build_clusters(labels, n_clusters)
    silhouette_score_sklearn = silhouette_score(x, labels)
    silhouette_score_custom = silhouette(clusters, x, euclidean_dist)

    table.add_row(
        algorithm.__name__,
        execution["dataset"],
        ", ".join([f"{k}={v}" for k, v in params.items()]),
        "{:.3f}".format(silhouette_score_sklearn),
        "{:.3f}".format(silhouette_score_custom),
    )

console.print(table)
console.print()