In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import openml
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import f1_score, adjusted_rand_score, silhouette_score
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, OPTICS, AffinityPropagation, MeanShift, SpectralClustering, Birch
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import ParameterGrid
from joblib import Parallel, delayed
from time import time
import csv

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

from sklearn.metrics import f1_score, adjusted_rand_score, silhouette_score
from sklearn.model_selection import ParameterGrid
from time import time
import pandas as pd
from joblib import Parallel, delayed


In [8]:
# Define the algorithms and their parameter grids
algorithms = [
    {'name': 'KMeans', 'class': KMeans, 'param_grid': {'n_clusters': [2, 3, 4], 'init': ['k-means++', 'random'], 'max_iter': [100, 300, 500]}},
    {'name': 'AgglomerativeClustering', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean', 'manhattan', 'cosine'], 'linkage': ['complete', 'average']}},
    {'name': 'DBSCAN', 'class': DBSCAN, 'param_grid': {'eps': [0.5, 1.0, 1.5], 'min_samples': [3, 5, 7]}},
    {'name': 'OPTICS', 'class': OPTICS, 'param_grid': {'min_samples': [3, 5, 7], 'xi': [0.01, 0.05, 0.1]}},
    {'name': 'GaussianMixture', 'class': GaussianMixture, 'param_grid': {'n_components': [2, 3, 4], 'covariance_type': ['full', 'tied', 'diag', 'spherical'], 'max_iter': [100, 300, 500]}},
    {'name': 'MeanShift', 'class': MeanShift, 'param_grid': {'bandwidth': [0.5, 1.0, 1.5], 'cluster_all': [True, False]}},
    {'name': 'SpectralClustering', 'class': SpectralClustering, 'param_grid': {'n_clusters': [2, 3, 4], 'affinity': ['nearest_neighbors', 'rbf'], 'n_neighbors': [5, 10, 15]}},
    {'name': 'Birch', 'class': Birch, 'param_grid': {'n_clusters': [2, 3, 4], 'threshold': [0.1, 0.5, 1.0], 'branching_factor': [20, 50, 100]}},
    {'name': 'AffinityPropagation', 'class': AffinityPropagation, 'param_grid': {'damping': [0.5, 0.7, 0.9], 'preference': [-50, -10, -1]}},
    {'name': 'Ward', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean'], 'linkage': ['ward']}}

]

In [9]:
# Define the function to run a clustering algorithm with each parameter combination
def run_algorithm(params):
    algorithm_name = params['name']
    algorithm_class = params['class']
    param_grid = params['param_grid']
    results = []
    for param_comb in ParameterGrid(param_grid):
        algorithm = algorithm_class(**param_comb)
        start_time = time()
        algorithm.fit(X)
        exec_time = time() - start_time
        if algorithm_name == 'GaussianMixture':
            labels = algorithm.predict(X)
        else:
            labels = algorithm.labels_
        if len(set(labels)) > 1: # check if there are at least 2 clusters found
            f1 = f1_score(y, labels, average='weighted')
            adj_rand = adjusted_rand_score(y, labels)
            silhouette = silhouette_score(X, labels)
            result = {'Algorithm': algorithm_name, 'Parameters': param_comb, 'F1 Score': f1, 'Adjusted Rand Score': adj_rand, 'Silhouette Score': silhouette, 'Execution Time': exec_time}
            results.append(result)
    return results


In [10]:
# Define the function to parallelize the algorithm runs
def run_parallel(algorithms):
    results = Parallel(n_jobs=-1)(delayed(run_algorithm)(params) for params in algorithms)
    results = [item for sublist in results for item in sublist]
    return results

In [24]:
# Run all the algorithms in parallel and save the results to a CSV file
#results = run_parallel(algorithms)
#results_df = pd.DataFrame(results)
#results_df.to_csv('clustering_results_iris.csv', index=False) 

IRIS

In [11]:
iris = openml.datasets.get_dataset(61)
X, y, categorical_indicator, attribute_names = iris.get_data(target=iris.default_target_attribute)

In [12]:
# Convert the categorical target variable to numerical using LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

#MinMax scaling
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [14]:

# Define the algorithms and their parameter grids
algorithms = [
    {'name': 'KMeans', 'class': KMeans, 'param_grid': {'n_clusters': [2, 3, 4], 'init': ['k-means++', 'random'], 'max_iter': [100, 300, 500]}},
    {'name': 'AgglomerativeClustering', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean', 'manhattan', 'cosine'], 'linkage': ['complete', 'average']}},
    {'name': 'DBSCAN', 'class': DBSCAN, 'param_grid': {'eps': [0.5, 1.0, 1.5], 'min_samples': [3, 5, 7]}},
    {'name': 'OPTICS', 'class': OPTICS, 'param_grid': {'min_samples': [3, 5, 7], 'xi': [0.01, 0.05, 0.1]}},
    {'name': 'GaussianMixture', 'class': GaussianMixture, 'param_grid': {'n_components': [2, 3, 4], 'covariance_type': ['full', 'tied', 'diag', 'spherical'], 'max_iter': [100, 300, 500]}},
    {'name': 'MeanShift', 'class': MeanShift, 'param_grid': {'bandwidth': [0.5, 1.0, 1.5], 'cluster_all': [True, False]}},
    {'name': 'SpectralClustering', 'class': SpectralClustering, 'param_grid': {'n_clusters': [2, 3, 4], 'affinity': ['nearest_neighbors', 'rbf'], 'n_neighbors': [5, 10, 15]}},
    {'name': 'Birch', 'class': Birch, 'param_grid': {'n_clusters': [2, 3, 4], 'threshold': [0.1, 0.5, 1.0], 'branching_factor': [20, 50, 100]}},
    {'name': 'AffinityPropagation', 'class': AffinityPropagation, 'param_grid': {'damping': [0.5, 0.7, 0.9], 'preference': [-50, -10, -1]}},
    {'name': 'Ward', 'class': AgglomerativeClustering, 'param_grid': {'n_clusters': [2, 3, 4, 5], 'affinity': ['euclidean'], 'linkage': ['ward']}}

]
# Define the function to run a clustering algorithm with each parameter combination
def run_algorithm(params):
    algorithm_name = params['name']
    algorithm_class = params['class']
    param_grid = params['param_grid']
    results = []
    for param_comb in ParameterGrid(param_grid):
        algorithm = algorithm_class(**param_comb)
        start_time = time()
        algorithm.fit(X)
        exec_time = time() - start_time
        if algorithm_name == 'GaussianMixture':
            labels = algorithm.predict(X)
        else:
            labels = algorithm.labels_
        if len(set(labels)) > 1: # check if there are at least 2 clusters found
            f1 = f1_score(y, labels, average='weighted')
            adj_rand = adjusted_rand_score(y, labels)
            silhouette = silhouette_score(X, labels)
            result = {'Algorithm': algorithm_name, 'Parameters': param_comb, 'F1 Score': f1, 'Adjusted Rand Score': adj_rand, 'Silhouette Score': silhouette, 'Execution Time': exec_time}
            results.append(result)
    return results

# Define the function to parallelize the algorithm runs
def run_parallel(algorithms):
    results = Parallel(n_jobs=-1)(delayed(run_algorithm)(params) for params in algorithms)
    results = [item for sublist in results for item in sublist]
    return results
# Run all the algorithms in parallel and save the results to a CSV file
results = run_parallel(algorithms)
results_df = pd.DataFrame(results)
results_df.to_csv('clustering_results_iris.csv', index=False) 


Glass

In [19]:
glass = openml.datasets.get_dataset(41)
X, y, categorical_indicator, attribute_names = glass.get_data(target=glass.default_target_attribute)

In [20]:
X = scaler.fit_transform(X)
y = le.fit_transform(y)


In [18]:
run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results = run_parallel(algorithms)
results_df = pd.DataFrame(results)
results_df.to_csv('clustering_results_glass.csv', index=False) 
results_df

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.258429,0.209236,0.520131,0.342974
1,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.205367,0.241745,0.525244,0.355728
2,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.257615,0.184371,0.360597,0.330861
3,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.258063,0.202337,0.524074,0.296937
4,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.284988,0.233520,0.526382,0.294612
...,...,...,...,...,...,...
139,AffinityPropagation,"{'damping': 0.9, 'preference': -1}",0.197355,0.184051,0.376010,0.176306
140,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.194517,0.198746,0.521353,0.015621
141,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.219883,0.219659,0.526722,0.015627
142,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.179583,0.163355,0.304983,0.007999


Wine

In [21]:
wine = openml.datasets.get_dataset(187)
X, y, categorical_indicator, attribute_names = wine.get_data(target=wine.default_target_attribute)

In [22]:
# Convert the categorical target variable to numerical using LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

#MinMax scaling
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [23]:
run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results = run_parallel(algorithms)
results_df = pd.DataFrame(results)
results_df.to_csv('clustering_results_wine.csv', index=False) 
results_df

Unnamed: 0,Algorithm,Parameters,F1 Score,Adjusted Rand Score,Silhouette Score,Execution Time
0,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.358678,0.370227,0.298722,0.363267
1,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.044320,0.853660,0.300894,0.329254
2,KMeans,"{'init': 'k-means++', 'max_iter': 100, 'n_clus...",0.000000,0.750917,0.259937,0.336168
3,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.358678,0.370227,0.298722,0.318572
4,KMeans,"{'init': 'k-means++', 'max_iter': 300, 'n_clus...",0.364228,0.868543,0.301346,0.287296
...,...,...,...,...,...,...
134,AffinityPropagation,"{'damping': 0.9, 'preference': -1}",0.119694,0.323418,0.112200,0.103678
135,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.222343,0.484739,0.296668,0.015625
136,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.023385,0.931000,0.294829,0.015626
137,Ward,"{'affinity': 'euclidean', 'linkage': 'ward', '...",0.300010,0.769743,0.248703,0.015627


Haberman

In [193]:
haberman = openml.datasets.get_dataset(43)
X, y, categorical_indicator, attribute_names = haberman.get_data(target=haberman.default_target_attribute)

In [196]:
X = scaler.fit_transform(X)
y = le.fit_transform(y)


In [197]:
run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results = run_parallel(algorithms)
results_df = pd.DataFrame(results)
results_df.to_csv('clustering_results_wine.csv', index=False) 
results_df

ValueError: Number of labels is 306. Valid values are 2 to n_samples - 1 (inclusive)

Libras_Move

In [229]:
libras = openml.datasets.get_dataset(299)
X, y, categorical_indicator, attribute_names = libras.get_data(target=libras.default_target_attribute)

In [230]:
X

Unnamed: 0,xcoord1,ycoord1,xcoord2,ycoord2,xcoord3,ycoord3,xcoord4,ycoord4,xcoord5,ycoord5,...,xcoord41,ycoord41,xcoord42,ycoord42,xcoord43,ycoord43,xcoord44,ycoord44,xcoord45,ycoord45
0,0.79691,0.38194,0.79691,0.37731,0.79884,0.37731,0.79497,0.37731,0.77563,0.35417,...,0.34043,0.51389,0.39845,0.42593,0.47389,0.36111,0.55899,0.31250,0.63830,0.29398
1,0.67892,0.27315,0.68085,0.27315,0.68085,0.27315,0.68085,0.27315,0.67892,0.26852,...,0.19536,0.57407,0.17795,0.63657,0.17215,0.67361,0.17021,0.69213,0.17215,0.69213
2,0.72147,0.23611,0.72340,0.23611,0.72340,0.23611,0.72340,0.23611,0.72340,0.23611,...,0.52031,0.30556,0.59768,0.25926,0.67118,0.25231,0.73501,0.26620,0.78143,0.27778
3,0.56480,0.32407,0.56286,0.32407,0.56093,0.32407,0.55899,0.32407,0.55899,0.32407,...,0.20503,0.49074,0.26306,0.42361,0.33269,0.34722,0.41006,0.28009,0.49130,0.24306
4,0.67118,0.38426,0.67118,0.38657,0.67311,0.38657,0.67311,0.38426,0.67311,0.37963,...,0.46422,0.76389,0.44101,0.64120,0.45068,0.54167,0.47776,0.44213,0.53191,0.34259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,0.65957,0.79167,0.65764,0.78704,0.65957,0.78935,0.65957,0.78704,0.65764,0.78241,...,0.58221,0.55093,0.57253,0.53935,0.56286,0.53241,0.55126,0.52546,0.54159,0.52083
356,0.64023,0.71991,0.64217,0.71759,0.64217,0.71759,0.64217,0.71759,0.64217,0.71991,...,0.56480,0.50463,0.55513,0.50231,0.54352,0.49769,0.53191,0.49537,0.52031,0.49306
357,0.61122,0.75926,0.61122,0.75694,0.61315,0.75694,0.61122,0.75694,0.61122,0.75926,...,0.49710,0.54167,0.48549,0.53472,0.47195,0.52546,0.45841,0.52083,0.44487,0.51620
358,0.65957,0.79167,0.65764,0.78704,0.65957,0.78935,0.65957,0.78704,0.65764,0.78241,...,0.58221,0.55093,0.57253,0.53935,0.56286,0.53241,0.55126,0.52546,0.54159,0.52083


Satelite_image

In [231]:
satelite = openml.datasets.get_dataset(294)
X, y, categorical_indicator, attribute_names = satelite.get_data(target=satelite.default_target_attribute)

In [232]:
X

Unnamed: 0,attr1,attr2,attr3,attr4,attr5,attr6,attr7,attr8,attr9,attr10,...,attr27,attr28,attr29,attr30,attr31,attr32,attr33,attr34,attr35,attr36
0,92,115,120,94,84,102,106,79,84,102,...,134,104,88,121,128,100,84,107,113,87
1,84,102,106,79,84,102,102,83,80,102,...,128,100,84,107,113,87,84,99,104,79
2,84,102,102,83,80,102,102,79,84,94,...,113,87,84,99,104,79,84,99,104,79
3,80,102,102,79,84,94,102,79,80,94,...,104,79,84,99,104,79,84,103,104,79
4,84,94,102,79,80,94,98,76,80,102,...,104,79,84,103,104,79,79,107,109,87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6430,60,83,96,85,64,87,100,88,64,83,...,104,92,66,87,108,89,63,83,104,85
6431,64,79,100,85,56,71,96,85,56,68,...,100,85,66,83,100,85,63,83,100,81
6432,56,68,91,81,56,64,91,81,53,64,...,100,81,59,87,96,81,63,83,92,74
6433,56,68,87,74,60,71,91,81,60,64,...,96,74,59,83,92,74,59,83,92,70


isolet

In [233]:
isolet = openml.datasets.get_dataset(300)
X, y, categorical_indicator, attribute_names = isolet.get_data(target=isolet.default_target_attribute)

In [234]:
X

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f608,f609,f610,f611,f612,f613,f614,f615,f616,f617
0,-0.4394,-0.0930,0.1718,0.4620,0.6226,0.4704,0.3578,0.0478,-0.1184,-0.2310,...,0.3334,0.4102,0.2052,0.3846,0.3590,0.5898,0.3334,0.6410,0.5898,-0.4872
1,-0.4348,-0.1198,0.2474,0.4036,0.5026,0.6328,0.4948,0.0338,-0.0520,-0.1302,...,0.2272,0.0000,0.2954,0.2046,0.4772,0.0454,0.2046,0.4318,0.4546,-0.0910
2,-0.2330,0.2124,0.5014,0.5222,-0.3422,-0.5840,-0.7168,-0.6342,-0.8614,-0.8318,...,0.0952,-0.1112,-0.0476,-0.1746,0.0318,-0.0476,0.1112,0.2540,0.1588,-0.4762
3,-0.3808,-0.0096,0.2602,0.2554,-0.4290,-0.6746,-0.6868,-0.6650,-0.8410,-0.9614,...,0.0648,-0.0504,-0.0360,-0.1224,0.1366,0.2950,0.0792,-0.0072,0.0936,-0.1510
4,-0.3412,0.0946,0.6082,0.6216,-0.1622,-0.3784,-0.4324,-0.4358,-0.4966,-0.5406,...,0.2812,0.1562,0.3124,0.2500,-0.0938,0.1562,0.3124,0.3124,0.2188,-0.2500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7792,-0.6842,-0.3280,-0.1984,0.2956,0.8786,0.8948,0.3118,0.1822,0.1012,0.1740,...,0.8928,0.7738,0.7738,0.7142,0.6428,0.5952,0.5714,0.3928,0.4286,0.2858
7793,-0.5912,-0.2420,0.8174,1.0000,0.4642,0.6428,0.6944,0.3056,-0.3888,-0.6826,...,0.0192,0.1924,-0.1154,0.0192,0.2116,-0.0384,0.0192,-0.2308,-0.4230,-0.7116
7794,-0.6696,-0.3730,0.1584,0.8910,1.0000,0.9762,0.9762,0.7684,0.4106,0.0154,...,0.0000,0.0910,0.1818,0.2000,0.1454,0.0182,-0.2910,0.0728,0.0728,-0.5818
7795,-0.5764,-0.1764,0.5106,0.3742,-0.1670,-0.5858,-0.7882,-0.7224,-0.6330,-0.8212,...,0.3044,0.4130,0.5870,0.4348,0.5652,0.3478,-0.0434,0.3044,-0.0434,-0.5000


nursery

In [235]:
nursery = openml.datasets.get_dataset(26)
X, y, categorical_indicator, attribute_names = nursery.get_data(target=nursery.default_target_attribute)

In [236]:
X

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended
1,usual,proper,complete,1,convenient,convenient,nonprob,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority
...,...,...,...,...,...,...,...,...
12955,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,priority
12956,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,not_recom
12957,great_pret,very_crit,foster,more,critical,inconv,problematic,recommended
12958,great_pret,very_crit,foster,more,critical,inconv,problematic,priority


gas-drift-different-concentrations

In [237]:
gasdrift = openml.datasets.get_dataset(1477)
X, y, categorical_indicator, attribute_names = gasdrift.get_data(target=gasdrift.default_target_attribute)

In [238]:
X

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129
0,12285.6582,4.076635,4.842317,7.509393,10.822436,-1.312657,-1.853717,-6.924985,11800.9233,4.483500,...,-3.440387,1784.5324,1.907000,1.729200,4.881194,8.623828,-0.314110,-0.661556,-3.521663,10.0
1,-35.6889,0.993944,0.166099,0.489363,3.484663,-0.130298,-0.528364,-3.735347,266.4145,1.053988,...,-4.482534,904.9898,1.433707,1.068069,2.532958,5.369720,-0.183779,-0.534087,-4.635975,50.0
2,63927.2217,14.956941,19.971376,29.188512,33.291320,-10.433776,-16.062245,-49.490143,57405.8483,15.613843,...,-25.150090,14585.7879,8.189021,6.099452,12.127991,15.709651,-3.887082,-6.731473,-19.326895,250.0
3,2992.9019,1.380553,0.808910,1.288259,4.660135,-0.755903,-1.120470,-4.075213,4301.4033,1.652701,...,-7.215792,6044.5554,3.488295,2.662288,5.938297,8.544508,-1.567322,-2.701235,-6.472439,600.0
4,57524.7812,11.912566,14.631496,19.809240,23.715868,-9.084750,-11.770585,-39.234003,50051.0703,11.732548,...,-10.248794,10580.1006,5.752675,3.880740,8.545897,11.831716,-2.655521,-4.312744,-8.510591,150.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13905,13384.8262,2.820931,4.007378,6.618008,11.386095,-2.142994,-3.110327,-11.296786,19013.4575,4.050907,...,-8.012662,7548.3501,2.377836,3.400734,8.571174,11.555882,-1.617656,-2.457614,-6.226359,10.0
13906,13382.9619,2.825174,4.010915,6.483989,10.356127,-2.156512,-3.179563,-10.184803,19034.2495,4.066463,...,-6.943002,7510.4946,2.364505,3.401381,8.512949,12.149638,-1.613554,-2.493870,-6.859804,10.0
13907,13336.8725,2.822288,3.980818,6.487103,10.936979,-2.146688,-3.273109,-11.067489,18997.7222,4.055524,...,-7.777268,7530.0010,2.369898,3.400592,8.494436,11.839013,-1.612525,-2.504918,-6.263872,10.0
13908,13351.1318,2.824358,3.987819,6.554427,11.331002,-2.143651,-3.257854,-11.795109,19035.9926,4.071607,...,-6.890286,7599.0201,2.391834,3.358804,8.457260,11.297346,-1.606879,-2.438701,-6.044784,10.0


MagicTelescope

In [243]:
magictelescope = openml.datasets.get_dataset(1120)
X, y, categorical_indicator, attribute_names = magictelescope.get_data(target=magictelescope.default_target_attribute)

In [244]:
X

Unnamed: 0,fLength:,fWidth:,fSize:,fConc:,fConc1:,fAsym:,fM3Long:,fM3Trans:,fAlpha:,fDist:
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.0110,-8.2027,40.0920,81.8828
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.2610
2,162.0520,136.0310,4.0612,0.0374,0.0187,116.7410,-64.8580,-45.2160,76.9600,256.7880
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.4490,116.7370
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.6480,356.4620
...,...,...,...,...,...,...,...,...,...,...
19015,21.3846,10.9170,2.6161,0.5857,0.3934,15.2618,11.5245,2.8766,2.4229,106.8258
19016,28.9452,6.7020,2.2672,0.5351,0.2784,37.0816,13.1853,-2.9632,86.7975,247.4560
19017,75.4455,47.5305,3.4483,0.1417,0.0549,-9.3561,41.0562,-9.4662,30.2987,256.5166
19018,120.5135,76.9018,3.9939,0.0944,0.0683,5.8043,-93.5224,-63.8389,84.6874,408.3166


In [245]:
X = scaler.fit_transform(X)
y = le.fit_transform(y)

In [None]:
run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results = run_parallel(algorithms)
results_df = pd.DataFrame(results)
results_df.to_csv('clustering_results_MagicTelescope.csv', index=False) 
results_df

Letter

In [None]:
letter = openml.datasets.get_dataset(6)
X, y, categorical_indicator, attribute_names = letter.get_data(target=letter.default_target_attribute)

In [220]:
X = scaler.fit_transform(X)
y = le.fit_transform(y)

run_parallel(algorithms)

# Run all the algorithms in parallel and save the results to a CSV file
results = run_parallel(algorithms)
results_df = pd.DataFrame(results)
results_df.to_csv('clustering_results_MagicTelescope.csv', index=False) 
results_df

0        Z
1        P
2        S
3        H
4        H
        ..
19995    C
19996    Z
19997    O
19998    L
19999    Q
Name: class, Length: 20000, dtype: category
Categories (26, object): ['A' < 'B' < 'C' < 'D' ... 'W' < 'X' < 'Y' < 'Z']

Covertype

In [222]:
covertype = openml.datasets.get_dataset(150)
X, y, categorical_indicator, attribute_names = covertype.get_data(target=covertype.default_target_attribute)

In [224]:
y

0         5
1         5
2         2
3         2
4         5
         ..
581007    3
581008    3
581009    3
581010    3
581011    3
Name: class, Length: 581012, dtype: category
Categories (7, object): ['1' < '2' < '3' < '4' < '5' < '6' < '7']