In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import openml
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import f1_score, adjusted_rand_score, silhouette_score
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, OPTICS, AffinityPropagation, MeanShift, SpectralClustering, Birch
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import ParameterGrid
from joblib import Parallel, delayed
from time import time
import csv

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

from sklearn.metrics import f1_score, adjusted_rand_score, silhouette_score
from sklearn.model_selection import ParameterGrid
from time import time
import pandas as pd
from joblib import Parallel, delayed


In [2]:
# Define the algorithms and their parameter grids
algorithms = [
    {'name': 'KMeans', 'class': KMeans, 'param_grid': {'n_clusters': [2, 3, 4], 'init': ['k-means++', 'random'], 'max_iter': [100, 300, 500]}},
    {'name': 'AgglomerativeClustering', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean', 'manhattan', 'cosine'], 'linkage': ['complete', 'average']}},
    {'name': 'DBSCAN', 'class': DBSCAN, 'param_grid': {'eps': [0.5, 1.0, 1.5], 'min_samples': [3, 5, 7]}},
    {'name': 'OPTICS', 'class': OPTICS, 'param_grid': {'min_samples': [3, 5, 7], 'xi': [0.01, 0.05, 0.1]}},
    {'name': 'GaussianMixture', 'class': GaussianMixture, 'param_grid': {'n_components': [2, 3, 4], 'covariance_type': ['full', 'tied', 'diag', 'spherical'], 'max_iter': [100, 300, 500]}},
    {'name': 'MeanShift', 'class': MeanShift, 'param_grid': {'bandwidth': [0.5, 1.0, 1.5], 'cluster_all': [True, False]}},
    {'name': 'SpectralClustering', 'class': SpectralClustering, 'param_grid': {'n_clusters': [2, 3, 4], 'affinity': ['nearest_neighbors', 'rbf'], 'n_neighbors': [5, 10, 15]}},
    {'name': 'Birch', 'class': Birch, 'param_grid': {'n_clusters': [2, 3, 4], 'threshold': [0.1, 0.5, 1.0], 'branching_factor': [20, 50, 100]}},
    {'name': 'Ward', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean'], 'linkage': ['ward']}},
    {'name': 'AffinityPropagation', 'class': AffinityPropagation, 'param_grid': {'damping': [0.5, 0.7, 0.9], 'preference': [-50, -10, -1]}}

]

In [3]:
# Define the function to run a clustering algorithm with each parameter combination
def run_algorithm(params):
    algorithm_name = params['name']
    algorithm_class = params['class']
    param_grid = params['param_grid']
    results = []
    for param_comb in ParameterGrid(param_grid):
        algorithm = algorithm_class(**param_comb)
        start_time = time()
        algorithm.fit(X)
        exec_time = time() - start_time
        if algorithm_name == 'GaussianMixture':
            labels = algorithm.predict(X)
        else:
            labels = algorithm.labels_
        if len(set(labels)) > 1: # check if there are at least 2 clusters found
            f1 = f1_score(y, labels, average='weighted')
            adj_rand = adjusted_rand_score(y, labels)
            silhouette = silhouette_score(X, labels)
            result = {'Algorithm': algorithm_name, 'Parameters': param_comb, 'F1 Score': f1, 'Adjusted Rand Score': adj_rand, 'Silhouette Score': silhouette, 'Execution Time': exec_time}
            results.append(result)
    return results


In [4]:
# Define the function to parallelize the algorithm runs
def run_parallel(algorithms):
    results = Parallel(n_jobs=-1)(delayed(run_algorithm)(params) for params in algorithms)
    results = [item for sublist in results for item in sublist]
    return results

In [5]:
# Run all the algorithms in parallel and save the results to a CSV file
#results = run_parallel(algorithms)
#results_df = pd.DataFrame(results)
#results_df.to_csv('clustering_results_iris.csv', index=False) 

In [5]:
le = LabelEncoder()
scaler = MinMaxScaler()


IRIS

In [11]:
iris = openml.datasets.get_dataset(61)
X, y, categorical_indicator, attribute_names = iris.get_data(target=iris.default_target_attribute)

In [12]:
# Convert the categorical target variable to numerical using LabelEncoder

y = le.fit_transform(y)

#MinMax scaling
X = scaler.fit_transform(X)

In [14]:

# Define the algorithms and their parameter grids
algorithms = [
    {'name': 'KMeans', 'class': KMeans, 'param_grid': {'n_clusters': [2, 3, 4], 'init': ['k-means++', 'random'], 'max_iter': [100, 300, 500]}},
    {'name': 'AgglomerativeClustering', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean', 'manhattan', 'cosine'], 'linkage': ['complete', 'average']}},
    {'name': 'DBSCAN', 'class': DBSCAN, 'param_grid': {'eps': [0.5, 1.0, 1.5], 'min_samples': [3, 5, 7]}},
    {'name': 'OPTICS', 'class': OPTICS, 'param_grid': {'min_samples': [3, 5, 7], 'xi': [0.01, 0.05, 0.1]}},
    {'name': 'GaussianMixture', 'class': GaussianMixture, 'param_grid': {'n_components': [2, 3, 4], 'covariance_type': ['full', 'tied', 'diag', 'spherical'], 'max_iter': [100, 300, 500]}},
    {'name': 'MeanShift', 'class': MeanShift, 'param_grid': {'bandwidth': [0.5, 1.0, 1.5], 'cluster_all': [True, False]}},
    {'name': 'SpectralClustering', 'class': SpectralClustering, 'param_grid': {'n_clusters': [2, 3, 4], 'affinity': ['nearest_neighbors', 'rbf'], 'n_neighbors': [5, 10, 15]}},
    {'name': 'Birch', 'class': Birch, 'param_grid': {'n_clusters': [2, 3, 4], 'threshold': [0.1, 0.5, 1.0], 'branching_factor': [20, 50, 100]}},
    {'name': 'Ward', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean'], 'linkage': ['ward']}},
    {'name': 'AffinityPropagation', 'class': AffinityPropagation, 'param_grid': {'damping': [0.5, 0.7, 0.9], 'preference': [-50, -10, -1]}},


]
# Define the function to run a clustering algorithm with each parameter combination
def run_algorithm(params):
    algorithm_name = params['name']
    algorithm_class = params['class']
    param_grid = params['param_grid']
    results = []
    for param_comb in ParameterGrid(param_grid):
        algorithm = algorithm_class(**param_comb)
        start_time = time()
        algorithm.fit(X)
        exec_time = time() - start_time
        if algorithm_name == 'GaussianMixture':
            labels = algorithm.predict(X)
        else:
            labels = algorithm.labels_
        if len(set(labels)) > 1: # check if there are at least 2 clusters found
            f1 = f1_score(y, labels, average='weighted')
            adj_rand = adjusted_rand_score(y, labels)
            silhouette = silhouette_score(X, labels)
            result = {'Algorithm': algorithm_name, 'Parameters': param_comb, 'F1 Score': f1, 'Adjusted Rand Score': adj_rand, 'Silhouette Score': silhouette, 'Execution Time': exec_time}
            results.append(result)
    return results

# Define the function to parallelize the algorithm runs
def run_parallel(algorithms):
    results = Parallel(n_jobs=-1)(delayed(run_algorithm)(params) for params in algorithms)
    results = [item for sublist in results for item in sublist]
    return results
# Run all the algorithms in parallel and save the results to a CSV file
results = run_parallel(algorithms)
results_df = pd.DataFrame(results)
results_df.to_csv('clustering_results_iris.csv', index=False) 


Glass

In [19]:
glass = openml.datasets.get_dataset(41)
X, y, categorical_indicator, attribute_names = glass.get_data(target=glass.default_target_attribute)

In [20]:
X = scaler.fit_transform(X)
y = le.fit_transform(y)


In [18]:
run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results = run_parallel(algorithms)
results_df = pd.DataFrame(results)
results_df.to_csv('clustering_results_glass.csv', index=False) 
results_df

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.258429,0.209236,0.520131,0.342974
1,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.205367,0.241745,0.525244,0.355728
2,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.257615,0.184371,0.360597,0.330861
3,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.258063,0.202337,0.524074,0.296937
4,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.284988,0.233520,0.526382,0.294612
...,...,...,...,...,...,...
139,AffinityPropagation,"{'damping': 0.9, 'preference': -1}",0.197355,0.184051,0.376010,0.176306
140,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.194517,0.198746,0.521353,0.015621
141,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.219883,0.219659,0.526722,0.015627
142,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.179583,0.163355,0.304983,0.007999


Wine

In [6]:
wine = openml.datasets.get_dataset(187)
X, y, categorical_indicator, attribute_names = wine.get_data(target=wine.default_target_attribute)

In [7]:
# Convert the categorical target variable to numerical using LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

#MinMax scaling
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [23]:
run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results = run_parallel(algorithms)
results_df = pd.DataFrame(results)
results_df.to_csv('clustering_results_wine.csv', index=False) 
results_df

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.358678,0.370227,0.298722,0.363267
1,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.044320,0.853660,0.300894,0.329254
2,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.000000,0.750917,0.259937,0.336168
3,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.358678,0.370227,0.298722,0.318572
4,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.364228,0.868543,0.301346,0.287296
...,...,...,...,...,...,...
134,AffinityPropagation,"{'damping': 0.9, 'preference': -1}",0.119694,0.323418,0.112200,0.103678
135,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.222343,0.484739,0.296668,0.015625
136,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.023385,0.931000,0.294829,0.015626
137,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.300010,0.769743,0.248703,0.015627


Haberman

In [24]:
haberman = openml.datasets.get_dataset(43)
X, y, categorical_indicator, attribute_names = haberman.get_data(target=haberman.default_target_attribute)

In [25]:
X = scaler.fit_transform(X)
y = le.fit_transform(y)


In [39]:
# Define the algorithms and their parameter grids
algorithms = [
    {'name': 'KMeans', 'class': KMeans, 'param_grid': {'n_clusters': [2, 3, 4], 'init': ['k-means++', 'random'], 'max_iter': [100, 300, 500]}},
    {'name': 'AgglomerativeClustering', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean', 'manhattan', 'cosine'], 'linkage': ['complete', 'average']}},
    {'name': 'DBSCAN', 'class': DBSCAN, 'param_grid': {'eps': [0.5, 1.0, 1.5], 'min_samples': [3, 5, 7]}},
    {'name': 'OPTICS', 'class': OPTICS, 'param_grid': {'min_samples': [3, 5, 7], 'xi': [0.01, 0.05, 0.1]}},
    {'name': 'GaussianMixture', 'class': GaussianMixture, 'param_grid': {'n_components': [2, 3, 4], 'covariance_type': ['full', 'tied', 'diag', 'spherical'], 'max_iter': [100, 300, 500]}},
    {'name': 'MeanShift', 'class': MeanShift, 'param_grid': {'bandwidth': [0.5, 1.0, 1.5], 'cluster_all': [True, False]}},
    {'name': 'SpectralClustering', 'class': SpectralClustering, 'param_grid': {'n_clusters': [2, 3, 4], 'affinity': ['nearest_neighbors', 'rbf'], 'n_neighbors': [5, 10, 15]}},
    {'name': 'Birch', 'class': Birch, 'param_grid': {'n_clusters': [2, 3, 4], 'threshold': [0.1, 0.5, 1.0], 'branching_factor': [20, 50, 100]}},
    {'name': 'AffinityPropagation', 'class': AffinityPropagation, 'param_grid': {'damping': [0.5, 0.7, 0.9], 'max_iter': [100, 200, 300]}},
    {'name': 'Ward', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean'], 'linkage': ['ward']}}

]
# Define the function to run a clustering algorithm with each parameter combination
def run_algorithm(params):
    algorithm_name = params['name']
    algorithm_class = params['class']
    param_grid = params['param_grid']
    results = []
    for param_comb in ParameterGrid(param_grid):
        algorithm = algorithm_class(**param_comb)
        start_time = time()
        algorithm.fit(X)
        exec_time = time() - start_time
        if algorithm_name == 'GaussianMixture':
            labels = algorithm.predict(X)
        elif algorithm_name == 'AffinityPropogation':
            labels = algorithm.fit_predict(X)
        else:
            labels = algorithm.labels_
        if len(set(labels)) > 1: # check if there are at least 2 clusters found
            f1 = f1_score(y, labels, average='weighted')
            adj_rand = adjusted_rand_score(y, labels)
            silhouette = silhouette_score(X, labels)
            result = {'Algorithm': algorithm_name, 'Parameters': param_comb, 'F1 Score': f1, 'Adjusted Rand Score': adj_rand, 'Silhouette Score': silhouette, 'Execution Time': exec_time}
            results.append(result)
    return results

# Define the function to parallelize the algorithm runs
def run_parallel(algorithms):
    results = Parallel(n_jobs=-1)(delayed(run_algorithm)(params) for params in algorithms)
    results = [item for sublist in results for item in sublist]
    return results

In [40]:
run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results = run_parallel(algorithms)
results_df = pd.DataFrame(results)
results_df.to_csv('clustering_results_haberman.csv', index=False) 
results_df

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.535175,-0.003979,0.386881,0.315294
1,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.497758,-0.000244,0.334772,0.312081
2,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.304640,-0.003318,0.292412,0.304084
3,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.521641,-0.003979,0.386881,0.251553
4,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.374317,-0.000244,0.334772,0.232062
...,...,...,...,...,...,...
124,AffinityPropagation,"{'damping': 0.9, 'max_iter': 300}",0.513106,-0.006867,0.380452,0.232149
125,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.581446,-0.002130,0.322948,0.016006
126,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.458133,-0.003618,0.212171,0.008001
127,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.406021,0.004170,0.235082,0.016000


In [26]:
DBScan=[ {'name': 'DBSCAN', 'class': DBSCAN, 'param_grid': {'eps': [0.5, 1.0, 1.5]}} ]

run_parallel(DBScan)

# Run all the algorithms in parallel and save the results to a CSV file
results_df= pd.read_csv("clustering_results_haberman.csv")
resultsDB = run_parallel(DBScan)
#DB_df=pd.DataFrame(resultsDB)
#resultsdf=pd.concat([results_df, DB_df], ignore_index=True)

In [27]:
resultsDB

[]

Libras_Move

In [41]:
libras = openml.datasets.get_dataset(299)
X, y, categorical_indicator, attribute_names = libras.get_data(target=libras.default_target_attribute)

In [45]:
X = scaler.fit_transform(X)
y = le.fit_transform(y)


In [54]:
algorithms = [
    {'name': 'KMeans', 'class': KMeans, 'param_grid': {'n_clusters': [2, 3, 4], 'init': ['k-means++', 'random'], 'max_iter': [100, 300, 500]}},
    {'name': 'AgglomerativeClustering', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean', 'manhattan', 'cosine'], 'linkage': ['complete', 'average']}},
    {'name': 'DBSCAN', 'class': DBSCAN, 'param_grid': {'eps': [0.5, 1.0, 1.5], 'min_samples': [3, 5, 7]}},
    {'name': 'OPTICS', 'class': OPTICS, 'param_grid': {'min_samples': [3, 5, 7], 'xi': [0.01, 0.05, 0.1]}},
    {'name': 'GaussianMixture', 'class': GaussianMixture, 'param_grid': {'n_components': [2, 3, 4], 'covariance_type': ['full', 'tied', 'diag', 'spherical'], 'max_iter': [100, 300, 500]}},
    {'name': 'MeanShift', 'class': MeanShift, 'param_grid': {'bandwidth': [0.5, 1.0, 1.5], 'cluster_all': [True, False]}},
    {'name': 'SpectralClustering', 'class': SpectralClustering, 'param_grid': {'n_clusters': [2, 3, 4], 'affinity': ['nearest_neighbors', 'rbf'], 'n_neighbors': [5, 10, 15]}},
    {'name': 'Birch', 'class': Birch, 'param_grid': {'n_clusters': [2, 3, 4], 'threshold': [0.1, 0.5, 1.0], 'branching_factor': [20, 50, 100]}},
    #{'name': 'AffinityPropagation', 'class': AffinityPropagation, 'param_grid': {'damping': [0.5, 0.7, 0.9], 'max_iter': [100, 200, 300]}},
    {'name': 'Ward', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean'], 'linkage': ['ward']}}

]

run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results = run_parallel(algorithms)
results_df = pd.DataFrame(results)
results_df.to_csv('clustering_results_libras.csv', index=False) 
results_df

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.016749,0.066410,0.215113,0.390684
1,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.018033,0.059373,0.173988,0.389390
2,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.022245,0.123315,0.184316,0.343802
3,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.017034,0.066362,0.212546,0.328545
4,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.025053,0.054719,0.173943,0.312544
...,...,...,...,...,...,...
146,Birch,"{'branching_factor': 100, 'n_clusters': 4, 'th...",0.036311,0.091936,0.144812,0.015626
147,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.015034,0.030265,0.192945,0.015630
148,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.019935,0.098010,0.156322,0.015626
149,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.032099,0.126391,0.168676,0.031254


In [58]:
Affinity=[{'name': 'AffinityPropagation', 'class': AffinityPropagation, 'param_grid': {'damping': [0.5, 0.7, 0.9], 'max_iter': [100, 200, 300]}}]

run_parallel(Affinity)

# Run all the algorithms in parallel and save the results to a CSV file
resultsaffinity = run_parallel(Affinity)
affinity_df=pd.DataFrame(resultsaffinity)
resultsdf=pd.concat([results_df, affinity_df], ignore_index=True)

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.016749,0.066410,0.215113,0.390684
1,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.018033,0.059373,0.173988,0.389390
2,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.022245,0.123315,0.184316,0.343802
3,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.017034,0.066362,0.212546,0.328545
4,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.025053,0.054719,0.173943,0.312544
...,...,...,...,...,...,...
146,Birch,"{'branching_factor': 100, 'n_clusters': 4, 'th...",0.036311,0.091936,0.144812,0.015626
147,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.015034,0.030265,0.192945,0.015630
148,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.019935,0.098010,0.156322,0.015626
149,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.032099,0.126391,0.168676,0.031254


In [59]:
resultsdf.to_csv('clustering_results_libras.csv', index=False) 
resultsdf

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.016749,0.066410,0.215113,0.390684
1,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.018033,0.059373,0.173988,0.389390
2,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.022245,0.123315,0.184316,0.343802
3,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.017034,0.066362,0.212546,0.328545
4,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.025053,0.054719,0.173943,0.312544
...,...,...,...,...,...,...
155,AffinityPropagation,"{'damping': 0.7, 'max_iter': 200}",0.025766,0.285203,0.244562,0.899287
156,AffinityPropagation,"{'damping': 0.7, 'max_iter': 300}",0.025766,0.286299,0.243310,0.667682
157,AffinityPropagation,"{'damping': 0.9, 'max_iter': 100}",0.027927,0.252278,0.212953,0.861762
158,AffinityPropagation,"{'damping': 0.9, 'max_iter': 200}",0.028117,0.291664,0.245192,1.310911


Satelite_image

In [46]:
satelite = openml.datasets.get_dataset(294)
X, y, categorical_indicator, attribute_names = satelite.get_data(target=satelite.default_target_attribute)
X = scaler.fit_transform(X)
y = le.fit_transform(y)

In [9]:
algorithms = [
    {'name': 'KMeans', 'class': KMeans, 'param_grid': {'n_clusters': [2, 3, 4], 'init': ['k-means++', 'random'], 'max_iter': [100, 300, 500]}},
    {'name': 'AgglomerativeClustering', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean', 'manhattan', 'cosine'], 'linkage': ['complete', 'average']}},
    {'name': 'DBSCAN', 'class': DBSCAN, 'param_grid': {'eps': [0.5, 1.0, 1.5], 'min_samples': [3, 5, 7]}},
    #{'name': 'OPTICS', 'class': OPTICS, 'param_grid': {'min_samples': [3, 5, 7], 'xi': [0.01, 0.05, 0.1]}},
    #{'name': 'GaussianMixture', 'class': GaussianMixture, 'param_grid': {'n_components': [2, 3, 4], 'covariance_type': ['full', 'tied', 'diag', 'spherical'], 'max_iter': [100, 300, 500]}},
    #{'name': 'MeanShift', 'class': MeanShift, 'param_grid': {'bandwidth': [0.5, 1.0, 1.5], 'cluster_all': [True, False]}},
    #{'name': 'SpectralClustering', 'class': SpectralClustering, 'param_grid': {'n_clusters': [2, 3, 4], 'affinity': ['nearest_neighbors', 'rbf'], 'n_neighbors': [5, 10, 15]}},
    #{'name': 'Birch', 'class': Birch, 'param_grid': {'n_clusters': [2, 3, 4], 'threshold': [0.1, 0.5, 1.0], 'branching_factor': [20, 50, 100]}},
    #{'name': 'Ward', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean'], 'linkage': ['ward']}},
    #{'name': 'AffinityPropagation', 'class': AffinityPropagation, 'param_grid': {'damping': [0.5, 0.7, 0.9], 'max_iter': [100, 200, 300]}}
]

run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results = run_parallel(algorithms)
results_df = pd.DataFrame(results)
results_df.to_csv('clustering_results_satelite.csv', index=False) 
results_df

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.136533,0.184602,0.367856,0.388788
1,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.109159,0.281667,0.432119,0.437569
2,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.207356,0.443511,0.402802,0.500073
3,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.056367,0.184602,0.367856,0.374242
4,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.109159,0.281667,0.432119,0.469654
5,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.011304,0.443511,0.402802,0.528537
6,KMeans,"{'init': 'k-means++', 'max_iter': 500, 'n_clus...",0.056367,0.184602,0.367856,0.35943
7,KMeans,"{'init': 'k-means++', 'max_iter': 500, 'n_clus...",0.109159,0.281667,0.432119,0.498281
8,KMeans,"{'init': 'k-means++', 'max_iter': 500, 'n_clus...",0.045925,0.443511,0.402802,0.680606
9,KMeans,"{'init': 'random', 'max_iter': 100, 'n_cluster...",0.056367,0.184602,0.367856,0.328176


In [None]:
algorithms = [
    #{'name': 'OPTICS', 'class': OPTICS, 'param_grid': {'min_samples': [3, 5, 7], 'xi': [0.01, 0.05, 0.1]}},
    #{'name': 'GaussianMixture', 'class': GaussianMixture, 'param_grid': {'n_components': [2, 3, 4], 'covariance_type': ['full', 'tied', 'diag', 'spherical'], 'max_iter': [100, 300, 500]}},
    #{'name': 'MeanShift', 'class': MeanShift, 'param_grid': {'bandwidth': [0.5, 1.0, 1.5]}},
    #{'name': 'SpectralClustering', 'class': SpectralClustering, 'param_grid': {'n_clusters': [2, 3, 4], 'affinity': ['nearest_neighbors', 'rbf'], 'n_neighbors': [5, 10, 15]}},
    #{'name': 'Birch', 'class': Birch, 'param_grid': {'n_clusters': [2, 3, 4], 'threshold': [0.1, 0.5, 1.0], 'branching_factor': [20, 50, 100]}},
    #{'name': 'Ward', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean'], 'linkage': ['ward']}},
    {'name': 'AffinityPropagation', 'class': AffinityPropagation, 'param_grid': {'damping': [0.5, 0.7, 0.9], 'preference': [-50, -10, -1]}}
]

run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results_test = run_parallel(algorithms)
results_df_test = pd.DataFrame(results_test)


In [51]:
results_df_test

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.022107,0.262923,0.310767,6.821466
1,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.146997,0.335557,0.374293,6.741825
2,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.0431,0.486306,0.362385,6.820058
3,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.292215,0.623345,0.335602,6.634841


In [52]:
#results_df=pd.read_csv('clustering_results_satelite.csv')
results_df=pd.concat([results_df, results_df_test], ignore_index=True)
results_df.to_csv('clustering_results_satelite.csv', index=False) 
results_df

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.136533,0.184602,0.367856,0.388788
1,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.109159,0.281667,0.432119,0.437569
2,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.207356,0.443511,0.402802,0.500073
3,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.056367,0.184602,0.367856,0.374242
4,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.109159,0.281667,0.432119,0.469654
...,...,...,...,...,...,...
133,Birch,"{'branching_factor': 100, 'n_clusters': 4, 'th...",0.265058,0.277969,0.291089,0.453190
134,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.022107,0.262923,0.310767,6.821466
135,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.146997,0.335557,0.374293,6.741825
136,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.043100,0.486306,0.362385,6.820058


isolet

In [11]:
isolet = openml.datasets.get_dataset(300)
X, y, categorical_indicator, attribute_names = isolet.get_data(target=isolet.default_target_attribute)

In [12]:
X

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f608,f609,f610,f611,f612,f613,f614,f615,f616,f617
0,-0.4394,-0.0930,0.1718,0.4620,0.6226,0.4704,0.3578,0.0478,-0.1184,-0.2310,...,0.3334,0.4102,0.2052,0.3846,0.3590,0.5898,0.3334,0.6410,0.5898,-0.4872
1,-0.4348,-0.1198,0.2474,0.4036,0.5026,0.6328,0.4948,0.0338,-0.0520,-0.1302,...,0.2272,0.0000,0.2954,0.2046,0.4772,0.0454,0.2046,0.4318,0.4546,-0.0910
2,-0.2330,0.2124,0.5014,0.5222,-0.3422,-0.5840,-0.7168,-0.6342,-0.8614,-0.8318,...,0.0952,-0.1112,-0.0476,-0.1746,0.0318,-0.0476,0.1112,0.2540,0.1588,-0.4762
3,-0.3808,-0.0096,0.2602,0.2554,-0.4290,-0.6746,-0.6868,-0.6650,-0.8410,-0.9614,...,0.0648,-0.0504,-0.0360,-0.1224,0.1366,0.2950,0.0792,-0.0072,0.0936,-0.1510
4,-0.3412,0.0946,0.6082,0.6216,-0.1622,-0.3784,-0.4324,-0.4358,-0.4966,-0.5406,...,0.2812,0.1562,0.3124,0.2500,-0.0938,0.1562,0.3124,0.3124,0.2188,-0.2500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7792,-0.6842,-0.3280,-0.1984,0.2956,0.8786,0.8948,0.3118,0.1822,0.1012,0.1740,...,0.8928,0.7738,0.7738,0.7142,0.6428,0.5952,0.5714,0.3928,0.4286,0.2858
7793,-0.5912,-0.2420,0.8174,1.0000,0.4642,0.6428,0.6944,0.3056,-0.3888,-0.6826,...,0.0192,0.1924,-0.1154,0.0192,0.2116,-0.0384,0.0192,-0.2308,-0.4230,-0.7116
7794,-0.6696,-0.3730,0.1584,0.8910,1.0000,0.9762,0.9762,0.7684,0.4106,0.0154,...,0.0000,0.0910,0.1818,0.2000,0.1454,0.0182,-0.2910,0.0728,0.0728,-0.5818
7795,-0.5764,-0.1764,0.5106,0.3742,-0.1670,-0.5858,-0.7882,-0.7224,-0.6330,-0.8212,...,0.3044,0.4130,0.5870,0.4348,0.5652,0.3478,-0.0434,0.3044,-0.0434,-0.5000


In [None]:
algorithms = [
    {'name': 'KMeans', 'class': KMeans, 'param_grid': {'n_clusters': [2, 3, 4], 'init': ['k-means++', 'random'], 'max_iter': [100, 300, 500]}},
    #{'name': 'AgglomerativeClustering', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean', 'manhattan', 'cosine'], 'linkage': ['complete', 'average']}},
    #{'name': 'DBSCAN', 'class': DBSCAN, 'param_grid': {'eps': [0.5, 1.0, 1.5], 'min_samples': [3, 5, 7]}},
    #{'name': 'OPTICS', 'class': OPTICS, 'param_grid': {'min_samples': [3, 5, 7], 'xi': [0.01, 0.05, 0.1]}},
    #{'name': 'GaussianMixture', 'class': GaussianMixture, 'param_grid': {'n_components': [2, 3, 4], 'covariance_type': ['full', 'tied', 'diag', 'spherical'], 'max_iter': [100, 300, 500]}},
    #{'name': 'MeanShift', 'class': MeanShift, 'param_grid': {'bandwidth': [0.5, 1.0, 1.5], 'cluster_all': [True, False]}},
    #{'name': 'SpectralClustering', 'class': SpectralClustering, 'param_grid': {'n_clusters': [2, 3, 4], 'affinity': ['nearest_neighbors', 'rbf'], 'n_neighbors': [5, 10, 15]}},
    #{'name': 'Birch', 'class': Birch, 'param_grid': {'n_clusters': [2, 3, 4], 'threshold': [0.1, 0.5, 1.0], 'branching_factor': [20, 50, 100]}},
    #{'name': 'Ward', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean'], 'linkage': ['ward']}},
    #{'name': 'AffinityPropagation', 'class': AffinityPropagation, 'param_grid': {'damping': [0.5, 0.7, 0.9], 'max_iter': [100, 200, 300]}}
]

run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results = run_parallel(algorithms)
results_df = pd.DataFrame(results)
results_df.to_csv('clustering_results_isolet.csv', index=False) 
results_df

nursery

In [19]:
nursery = openml.datasets.get_dataset(26)
X, y, categorical_indicator, attribute_names = nursery.get_data(target=nursery.default_target_attribute)


In [16]:
X

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended
1,usual,proper,complete,1,convenient,convenient,nonprob,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority
...,...,...,...,...,...,...,...,...
12955,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,priority
12956,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,not_recom
12957,great_pret,very_crit,foster,more,critical,inconv,problematic,recommended
12958,great_pret,very_crit,foster,more,critical,inconv,problematic,priority


gas-drift-different-concentrations

In [14]:
gasdrift = openml.datasets.get_dataset(1477)
X, y, categorical_indicator, attribute_names = gasdrift.get_data(target=gasdrift.default_target_attribute)

In [12]:
X

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129
0,12285.6582,4.076635,4.842317,7.509393,10.822436,-1.312657,-1.853717,-6.924985,11800.9233,4.483500,...,-3.440387,1784.5324,1.907000,1.729200,4.881194,8.623828,-0.314110,-0.661556,-3.521663,10.0
1,-35.6889,0.993944,0.166099,0.489363,3.484663,-0.130298,-0.528364,-3.735347,266.4145,1.053988,...,-4.482534,904.9898,1.433707,1.068069,2.532958,5.369720,-0.183779,-0.534087,-4.635975,50.0
2,63927.2217,14.956941,19.971376,29.188512,33.291320,-10.433776,-16.062245,-49.490143,57405.8483,15.613843,...,-25.150090,14585.7879,8.189021,6.099452,12.127991,15.709651,-3.887082,-6.731473,-19.326895,250.0
3,2992.9019,1.380553,0.808910,1.288259,4.660135,-0.755903,-1.120470,-4.075213,4301.4033,1.652701,...,-7.215792,6044.5554,3.488295,2.662288,5.938297,8.544508,-1.567322,-2.701235,-6.472439,600.0
4,57524.7812,11.912566,14.631496,19.809240,23.715868,-9.084750,-11.770585,-39.234003,50051.0703,11.732548,...,-10.248794,10580.1006,5.752675,3.880740,8.545897,11.831716,-2.655521,-4.312744,-8.510591,150.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13905,13384.8262,2.820931,4.007378,6.618008,11.386095,-2.142994,-3.110327,-11.296786,19013.4575,4.050907,...,-8.012662,7548.3501,2.377836,3.400734,8.571174,11.555882,-1.617656,-2.457614,-6.226359,10.0
13906,13382.9619,2.825174,4.010915,6.483989,10.356127,-2.156512,-3.179563,-10.184803,19034.2495,4.066463,...,-6.943002,7510.4946,2.364505,3.401381,8.512949,12.149638,-1.613554,-2.493870,-6.859804,10.0
13907,13336.8725,2.822288,3.980818,6.487103,10.936979,-2.146688,-3.273109,-11.067489,18997.7222,4.055524,...,-7.777268,7530.0010,2.369898,3.400592,8.494436,11.839013,-1.612525,-2.504918,-6.263872,10.0
13908,13351.1318,2.824358,3.987819,6.554427,11.331002,-2.143651,-3.257854,-11.795109,19035.9926,4.071607,...,-6.890286,7599.0201,2.391834,3.358804,8.457260,11.297346,-1.606879,-2.438701,-6.044784,10.0


In [15]:
X = scaler.fit_transform(X)
y = le.fit_transform(y)

In [27]:
algorithms = [
    {'name': 'KMeans', 'class': KMeans, 'param_grid': {'n_clusters': [ 3, 4,5,6,7], 'init': ['k-means++', 'random'], 'max_iter': [100, 300, 500]}},
    #{'name': 'AgglomerativeClustering', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean', 'manhattan', 'cosine'], 'linkage': ['complete', 'average']}},
    {'name': 'DBSCAN', 'class': DBSCAN, 'param_grid': {'eps': [0.5, 1.0, 1.5], 'min_samples': [ 5, 7,9,12]}},
    #{'name': 'OPTICS', 'class': OPTICS, 'param_grid': {'min_samples': [3, 5, 7], 'xi': [0.01, 0.05, 0.1]}},
    #{'name': 'GaussianMixture', 'class': GaussianMixture, 'param_grid': {'n_components': [2, 3, 4], 'covariance_type': ['full', 'tied', 'diag', 'spherical'], 'max_iter': [100, 300, 500]}},
    #{'name': 'MeanShift', 'class': MeanShift, 'param_grid': {'bandwidth': [0.5, 1.0, 1.5], 'cluster_all': [True, False]}},
    #{'name': 'SpectralClustering', 'class': SpectralClustering, 'param_grid': {'n_clusters': [2, 3, 4], 'affinity': ['nearest_neighbors', 'rbf'], 'n_neighbors': [5, 10, 15]}},
    #{'name': 'Birch', 'class': Birch, 'param_grid': {'n_clusters': [2, 3, 4], 'threshold': [0.1, 0.5, 1.0], 'branching_factor': [20, 50, 100]}},
    #{'name': 'Ward', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean'], 'linkage': ['ward']}},
    #{'name': 'AffinityPropagation', 'class': AffinityPropagation, 'param_grid': {'damping': [0.5, 0.7, 0.9], 'max_iter': [100, 200, 300]}}
]

run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results = run_parallel(algorithms)
results_df = pd.DataFrame(results)
results_df.to_csv('clustering_results_gas-drift-different-concentrations.csv', index=False) 
results_df

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.054097,0.033598,0.42576,2.826405
1,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.122173,0.060306,0.313437,2.311292
2,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.213043,0.080008,0.338998,3.462056
3,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.09914,0.060097,0.280078,5.479806
4,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.064159,0.067602,0.289693,3.792276
5,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.054097,0.033598,0.42576,2.997726
6,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.090835,0.049558,0.313911,2.235829
7,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.123098,0.080244,0.338877,4.656995
8,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.209457,0.060097,0.280078,5.213085
9,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.191127,0.055554,0.287013,6.725141


In [None]:
algorithms = [
    {'name': 'OPTICS', 'class': OPTICS, 'param_grid': {'min_samples': [5, 7,9,12], 'xi': [0.01, 0.05, 0.1]}},
    {'name': 'AgglomerativeClustering', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [3, 4,5,6,7], 'affinity': ['euclidean', 'manhattan', 'cosine'], 'linkage': ['complete', 'average']}},
    #{'name': 'GaussianMixture', 'class': GaussianMixture, 'param_grid': {'n_components': [2, 3, 4], 'covariance_type': ['full', 'tied', 'diag', 'spherical'], 'max_iter': [100, 300, 500]}},
    {'name': 'MeanShift', 'class': MeanShift, 'param_grid': {'bandwidth': [0.5, 1.0, 1.5]}},
    {'name': 'SpectralClustering', 'class': SpectralClustering, 'param_grid': {'n_clusters': [3, 4,5,6,7], 'affinity': ['nearest_neighbors', 'rbf'], 'n_neighbors': [5, 10, 15]}},
    #{'name': 'Birch', 'class': Birch, 'param_grid': {'n_clusters': [3, 4,5,6,7], 'threshold': [0.1, 0.5, 1.0], 'branching_factor': [20, 50, 100]}},
    #{'name': 'Ward', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [3, 4,5,6,7], 'affinity': ['euclidean'], 'linkage': ['ward']}},
    #{'name': 'AffinityPropagation', 'class': AffinityPropagation, 'param_grid': {'damping': [0.5, 0.7, 0.9], 'max_iter': [100, 200, 300]}}
]

run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results_test = run_parallel(algorithms)
results_df_test = pd.DataFrame(results_test)


In [10]:
#results_df=pd.read_csv('clustering_results_gas-drift-different-concentrations.csv')
results_df=pd.concat([results_df, results_df_test], ignore_index=True)
results_df.to_csv('clustering_results_gas-drift-different-concentrations.csv', index=False) 
results_df

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.054097,0.033598,0.425760,2.826405
1,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.122173,0.060306,0.313437,2.311292
2,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.213043,0.080008,0.338998,3.462056
3,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.099140,0.060097,0.280078,5.479806
4,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.064159,0.067602,0.289693,3.792276
...,...,...,...,...,...,...
123,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.124823,0.013878,0.456771,94.206886
124,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.148242,0.049408,0.246709,97.885261
125,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.158495,0.083232,0.269274,89.278146
126,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.154495,0.070600,0.265695,88.861546


MagicTelescope

In [11]:
magictelescope = openml.datasets.get_dataset(1120)
X, y, categorical_indicator, attribute_names = magictelescope.get_data(target=magictelescope.default_target_attribute)
X = scaler.fit_transform(X)
y = le.fit_transform(y)

In [26]:
algorithms = [
    {'name': 'KMeans', 'class': KMeans, 'param_grid': {'n_clusters': [2, 3, 4], 'init': ['k-means++', 'random'], 'max_iter': [100, 300, 500]}},
    #{'name': 'AgglomerativeClustering', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean', 'manhattan', 'cosine'], 'linkage': ['complete', 'average']}},
    {'name': 'DBSCAN', 'class': DBSCAN, 'param_grid': {'eps': [0.5, 1.0, 1.5], 'min_samples': [3, 5, 7]}},
    #{'name': 'OPTICS', 'class': OPTICS, 'param_grid': {'min_samples': [3, 5, 7], 'xi': [0.01, 0.05, 0.1]}},
    #{'name': 'GaussianMixture', 'class': GaussianMixture, 'param_grid': {'n_components': [2, 3, 4], 'covariance_type': ['full', 'tied', 'diag', 'spherical'], 'max_iter': [100, 300, 500]}},
    #{'name': 'MeanShift', 'class': MeanShift, 'param_grid': {'bandwidth': [0.5, 1.0, 1.5], 'cluster_all': [True, False]}},
    #{'name': 'SpectralClustering', 'class': SpectralClustering, 'param_grid': {'n_clusters': [2, 3, 4], 'affinity': ['nearest_neighbors', 'rbf'], 'n_neighbors': [5, 10, 15]}},
    #{'name': 'Birch', 'class': Birch, 'param_grid': {'n_clusters': [2, 3, 4], 'threshold': [0.1, 0.5, 1.0], 'branching_factor': [20, 50, 100]}},
    #{'name': 'Ward', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean'], 'linkage': ['ward']}},
    #{'name': 'AffinityPropagation', 'class': AffinityPropagation, 'param_grid': {'damping': [0.5, 0.7, 0.9], 'max_iter': [100, 200, 300]}}
]

run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results = run_parallel(algorithms)
results_df = pd.DataFrame(results)
results_df.to_csv('clustering_results_MagicTelescope.csv', index=False) 
results_df

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.419946,0.030641,0.31808,1.203304
1,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.573619,0.08948,0.292159,1.218934
2,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.389995,0.124528,0.29708,0.941666
3,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.419815,0.030702,0.31811,0.515705
4,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.424934,0.08948,0.292159,0.951889
5,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.458175,0.124328,0.297039,0.9716
6,KMeans,"{'init': 'k-means++', 'max_iter': 500, 'n_clus...",0.598888,0.031789,0.318394,0.562585
7,KMeans,"{'init': 'k-means++', 'max_iter': 500, 'n_clus...",0.217491,0.08948,0.292159,1.188869
8,KMeans,"{'init': 'k-means++', 'max_iter': 500, 'n_clus...",0.388878,0.124862,0.29732,0.982883
9,KMeans,"{'init': 'random', 'max_iter': 100, 'n_cluster...",0.418257,0.031576,0.318473,0.790881


In [None]:
algorithms = [
    {'name': 'OPTICS', 'class': OPTICS, 'param_grid': {'min_samples': [3, 5, 7], 'xi': [0.01, 0.05, 0.1]}},
    #{'name': 'AgglomerativeClustering', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean', 'manhattan', 'cosine'], 'linkage': ['complete', 'average']}},
    #{'name': 'GaussianMixture', 'class': GaussianMixture, 'param_grid': {'n_components': [2, 3, 4], 'covariance_type': ['full', 'tied', 'diag', 'spherical'], 'max_iter': [100, 300, 500]}},
    #{'name': 'MeanShift', 'class': MeanShift, 'param_grid': {'bandwidth': [0.5, 1.0, 1.5]}},
    {'name': 'SpectralClustering', 'class': SpectralClustering, 'param_grid': {'n_clusters': [2, 3, 4], 'affinity': ['nearest_neighbors', 'rbf'], 'n_neighbors': [5, 10, 15]}},
    #{'name': 'Birch', 'class': Birch, 'param_grid': {'n_clusters': [2, 3, 4], 'threshold': [0.1, 0.5, 1.0], 'branching_factor': [20, 50, 100]}},
    #{'name': 'Ward', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean'], 'linkage': ['ward']}},
    #{'name': 'AffinityPropagation', 'class': AffinityPropagation, 'param_grid': {'damping': [0.5, 0.7, 0.9], 'max_iter': [100, 200, 300]}}
]

run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results_test = run_parallel(algorithms)
results_df_test = pd.DataFrame(results_test)


In [9]:
results_df_test

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,Birch,"{'branching_factor': 20, 'n_clusters': 2, 'thr...",0.607158,0.085451,0.373668,7.155533
1,Birch,"{'branching_factor': 20, 'n_clusters': 2, 'thr...",0.371201,0.061935,0.329591,1.203655
2,Birch,"{'branching_factor': 20, 'n_clusters': 3, 'thr...",0.602123,0.208577,0.311366,7.989457
3,Birch,"{'branching_factor': 20, 'n_clusters': 3, 'thr...",0.631834,0.061935,0.329591,0.953266
4,Birch,"{'branching_factor': 20, 'n_clusters': 4, 'thr...",0.243713,0.098066,0.233556,6.12692
5,Birch,"{'branching_factor': 20, 'n_clusters': 4, 'thr...",0.631834,0.061935,0.329591,1.080737
6,Birch,"{'branching_factor': 50, 'n_clusters': 2, 'thr...",0.720537,0.212479,0.244196,6.717854
7,Birch,"{'branching_factor': 50, 'n_clusters': 2, 'thr...",0.371201,0.061935,0.329591,2.240638
8,Birch,"{'branching_factor': 50, 'n_clusters': 3, 'thr...",0.156159,0.132017,0.29906,4.500657
9,Birch,"{'branching_factor': 50, 'n_clusters': 3, 'thr...",0.631834,0.061935,0.329591,0.953266


In [10]:
#results_df=pd.read_csv('clustering_results_MagicTelescope.csv')
results_df=pd.concat([results_df, results_df_test], ignore_index=True)
results_df.to_csv('clustering_results_MagicTelescope.csv', index=False) 
results_df

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.419946,0.030641,0.318080,1.203304
1,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.573619,0.089480,0.292159,1.218934
2,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.389995,0.124528,0.297080,0.941666
3,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.419815,0.030702,0.318110,0.515705
4,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.424934,0.089480,0.292159,0.951889
...,...,...,...,...,...,...
74,Birch,"{'branching_factor': 100, 'n_clusters': 4, 'th...",0.631834,0.061935,0.329591,1.101094
75,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.497792,0.000092,0.283314,219.737399
76,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.517578,0.045846,0.259542,196.648028
77,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.230720,0.089493,0.277987,132.059128


Letter

In [7]:
letter = openml.datasets.get_dataset(6)
X, y, categorical_indicator, attribute_names = letter.get_data(target=letter.default_target_attribute)
X = scaler.fit_transform(X)
y = le.fit_transform(y)

In [12]:
y

array([25, 15, 18, ..., 14, 11, 16])

In [13]:
algorithms = [
    {'name': 'KMeans', 'class': KMeans, 'param_grid': {'n_clusters': [5, 10,15,20,25], 'init': ['k-means++', 'random'], 'max_iter': [100, 300, 500]}},
    #{'name': 'AgglomerativeClustering', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean', 'manhattan', 'cosine'], 'linkage': ['complete', 'average']}},
    {'name': 'DBSCAN', 'class': DBSCAN, 'param_grid': {'eps': [0.5, 1.0, 1.5]}},
    #{'name': 'OPTICS', 'class': OPTICS, 'param_grid': {'min_samples': [3, 5, 7], 'xi': [0.01, 0.05, 0.1]}},
    #{'name': 'GaussianMixture', 'class': GaussianMixture, 'param_grid': {'n_components': [2, 3, 4], 'covariance_type': ['full', 'tied', 'diag', 'spherical'], 'max_iter': [100, 300, 500]}},
    #{'name': 'MeanShift', 'class': MeanShift, 'param_grid': {'bandwidth': [0.5, 1.0, 1.5], 'cluster_all': [True, False]}},
    #{'name': 'SpectralClustering', 'class': SpectralClustering, 'param_grid': {'n_clusters': [2, 3, 4], 'affinity': ['nearest_neighbors', 'rbf'], 'n_neighbors': [5, 10, 15]}},
    #{'name': 'Birch', 'class': Birch, 'param_grid': {'n_clusters': [2, 3, 4], 'threshold': [0.1, 0.5, 1.0], 'branching_factor': [20, 50, 100]}},
    #{'name': 'Ward', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean'], 'linkage': ['ward']}},
    #{'name': 'AffinityPropagation', 'class': AffinityPropagation, 'param_grid': {'damping': [0.5, 0.7, 0.9], 'max_iter': [100, 200, 300]}}
]

run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results = run_parallel(algorithms)
results_df = pd.DataFrame(results)
results_df.to_csv('clustering_results_letter.csv', index=False) 
results_df

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.013316,0.042302,0.140649,1.657668
1,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.030363,0.070626,0.152299,3.476587
2,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.029066,0.10188,0.152406,4.464057
3,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.027914,0.113247,0.139732,5.167371
4,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.02755,0.126599,0.149781,6.103306
5,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.011942,0.042291,0.140643,2.63817
6,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.02431,0.070821,0.152425,3.136374
7,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.014743,0.102449,0.147063,4.807903
8,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.035524,0.115299,0.139261,7.192883
9,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.021604,0.132173,0.147661,7.584449


In [10]:
algorithms = [
    #{'name': 'OPTICS', 'class': OPTICS, 'param_grid': {'min_samples': [10, 20, 30,40,50]}},
    #{'name': 'AgglomerativeClustering', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [5, 10, 15, 20, 25, 30], 'affinity': ['euclidean', 'manhattan', 'cosine'], 'linkage': ['complete', 'average']}},
    #{'name': 'GaussianMixture', 'class': GaussianMixture, 'param_grid': {'n_components': [2, 3, 4], 'covariance_type': ['full', 'tied', 'diag', 'spherical'], 'max_iter': [100, 300, 500]}},
    #{'name': 'MeanShift', 'class': MeanShift, 'param_grid': {'bandwidth': [0.5, 1.0, 1.5]}},
    #{'name': 'SpectralClustering', 'class': SpectralClustering, 'param_grid': {'n_clusters': [2, 3, 4,5, 10, 15, 20, 25], 'affinity': ['nearest_neighbors', 'rbf'], 'n_neighbors': [5, 10, 15,20]}},
    #{'name': 'Birch', 'class': Birch, 'param_grid': {'n_clusters': [5, 10,15,20,25], 'threshold': [0.1, 0.5, 1.0], 'branching_factor': [20, 50, 100]}},
    #{'name': 'Ward', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [5, 10,15,20,25], 'affinity': ['euclidean'], 'linkage': ['ward']}},
    #{'name': 'AffinityPropagation', 'class': AffinityPropagation, 'param_grid': {'damping': [0.5, 0.7, 0.9], 'max_iter': [100, 200, 300]}}
]

run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results_test = run_parallel(algorithms)
results_df_test = pd.DataFrame(results_test)


In [11]:
results_df_test

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.010655,0.054671,0.094759,69.190319
1,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.022636,0.087367,0.102931,68.643287
2,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.006177,0.121574,0.107534,69.410315
3,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.020935,0.133515,0.096462,68.067957
4,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.065141,0.143786,0.112374,68.257344


In [12]:
#results_df=pd.read_csv('clustering_results_letter.csv')
results_df=pd.concat([results_df, results_df_test], ignore_index=True)
results_df.to_csv('clustering_results_letter.csv', index=False) 
results_df

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.013316,0.042302,0.140649,1.657668
1,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.030363,0.070626,0.152299,3.476587
2,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.029066,0.101880,0.152406,4.464057
3,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.027914,0.113247,0.139732,5.167371
4,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.027550,0.126599,0.149781,6.103306
...,...,...,...,...,...,...
101,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.010655,0.054671,0.094759,69.190319
102,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.022636,0.087367,0.102931,68.643287
103,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.006177,0.121574,0.107534,69.410315
104,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.020935,0.133515,0.096462,68.067957


Covertype

In [44]:
covertype = openml.datasets.get_dataset(150)
X, y, categorical_indicator, attribute_names = covertype.get_data(target=covertype.default_target_attribute)

In [45]:
X

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,0.368684,0.141667,0.045455,0.184681,0.223514,0.071659,0.870079,0.913386,0.582677,0.875366,...,0,0,0,0,0,0,0,0,0,0
1,0.365683,0.155556,0.030303,0.151754,0.215762,0.054798,0.866142,0.925197,0.594488,0.867838,...,0,0,0,0,0,0,0,0,0,0
2,0.472736,0.386111,0.136364,0.191840,0.307494,0.446817,0.921260,0.937008,0.531496,0.853339,...,0,0,0,0,0,0,0,0,0,0
3,0.463232,0.430556,0.272727,0.173228,0.375969,0.434172,0.937008,0.937008,0.480315,0.865886,...,0,0,0,0,0,0,0,0,0,0
4,0.368184,0.125000,0.030303,0.109520,0.222222,0.054939,0.866142,0.921260,0.590551,0.860449,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,0.268634,0.425000,0.303030,0.060845,0.245478,0.015175,0.944882,0.933071,0.464567,0.116688,...,0,0,0,0,0,0,0,0,0,0
581008,0.266133,0.422222,0.287879,0.047960,0.239018,0.013348,0.944882,0.933071,0.468504,0.117803,...,0,0,0,0,0,0,0,0,0,0
581009,0.263632,0.441667,0.257576,0.042949,0.232558,0.012646,0.929134,0.948819,0.511811,0.119058,...,0,0,0,0,0,0,0,0,0,0
581010,0.262631,0.472222,0.227273,0.042949,0.229974,0.012646,0.905512,0.964567,0.562992,0.120452,...,0,0,0,0,0,0,0,0,0,0
