In [3]:
import numpy as np
import pandas as pd
import os

# Cluster-validity indices (CVIs) whose label files are combined later.
cv_names = ['sil', 'dbs', 'chs']

# Every label file is named "<dataset>_labels.csv"; build the names from the
# dataset stems so the shared suffix is written only once.
dataset_names = [
    stem + '_labels'
    for stem in (
        'ecoli',
        'pathbased',
        'thy',
        'iris',
        'jain',
        'arrhythmia',
        'compound',
        'iono',
        'sizes4',
        '3-spiral',
        'cluto-t7-10k',
        'sonar',
        'glass',
        'tae',
        'segment',
        'sizes2',
        'balance-scale',
        'cassini',
        'elliptical_10_2',
        'engytime',
        'flame',
        'fourty',
        'twodiamonds',
        'wine',
        'disk-6000n',
    )
]

# Dataset-major ordering: the len(cv_names) paths of one dataset are adjacent,
# so a contiguous slice of `filepaths` addresses all CVI files of a dataset.
# Each CSV is presumably a single column of cluster labels - TODO confirm.
filepaths = [
    '../results/{}/labels/{}.csv'.format(cvi, dataset)
    for dataset in dataset_names
    for cvi in cv_names
]


In [5]:
import numpy as np
import pandas as pd
import ClusterEnsembles as CE

# Process each dataset and generate the final (consensus) clustering result.
#
# NOTE(review): the inputs are read from '../results/...' (see `filepaths`)
# while the consensus labels are written below 'results/...' relative to the
# working directory - confirm that this asymmetry is intended.
ensemble_labels_dir = "results/ensemble/labels"
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists before the first write (otherwise a fresh run fails).
os.makedirs(ensemble_labels_dir, exist_ok=True)

n_cvis = len(cv_names)
for dataset_idx, dataset_name in enumerate(dataset_names):
    # `filepaths` is dataset-major, so the files of this dataset form one
    # contiguous slice of length n_cvis. enumerate() avoids the repeated O(n)
    # dataset_names.index() lookups (which would also pick the wrong slice if a
    # name ever appeared twice).
    dataset_filepaths = filepaths[dataset_idx * n_cvis:(dataset_idx + 1) * n_cvis]

    # Load the first column of every label file; one row per CVI labeling.
    # NOTE(review): read_csv treats the first line as a header by default -
    # confirm the label files really start with a header row.
    datasets = np.array([pd.read_csv(f).iloc[:, 0].values for f in dataset_filepaths])

    # Combine the individual labelings into a single consensus labeling.
    final_labels = CE.cluster_ensembles(datasets)

    # Written without header or index: one label per line.
    final_filename = f'final_ensemble_clustering_{dataset_name}.csv'
    pd.DataFrame(final_labels, columns=['Cluster']).to_csv(
        f"{ensemble_labels_dir}/{final_filename}", index=False, header=False
    )


In [6]:
import os
import pandas as pd
from tpot import TPOTClustering
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, adjusted_rand_score
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import time

# Configuration
scoring_metric = "ensemble"
# The validation data lives at a machine-specific absolute path. Allow it to be
# overridden through the VALIDATION_FOLDER environment variable so the notebook
# can run elsewhere; the original location remains the default.
validation_folder = os.environ.get(
    "VALIDATION_FOLDER",
    "/home/camilo/dev/ijcnn_25/datasets/validation_csv",
)
# Per-metric output locations (relative to the current working directory).
results_file = f"results/{scoring_metric}/tpe_autoclust_results.csv"
output_folder = f"results/{scoring_metric}"

# Ensure directories exist
os.makedirs(output_folder, exist_ok=True)

def plot_pca_comparison(X, y, labels, ari_score, save_path):
    """Generate and save PCA plots comparing original labels and cluster labels.

    The data is projected onto its first two principal components and drawn
    twice side by side: once coloured by `y` and once coloured by `labels`.

    Args:
        X: Feature matrix passed to PCA.
        y: Reference labels used to colour the left panel.
        labels: Cluster labels used to colour the right panel.
        ari_score: Numeric score shown (two decimals) in the right panel title.
        save_path: Destination path of the saved figure.

    Returns:
        None. The figure is written to `save_path` and then closed.
    """
    X_pca = PCA(n_components=2).fit_transform(X)

    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    # Close the figure even when plotting or saving fails: callers invoke this
    # in a loop and swallow exceptions, so leaked figures would accumulate.
    try:
        scatter1 = axes[0].scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k')
        axes[0].set_title('PCA with Original Labels')
        axes[0].legend(*scatter1.legend_elements(), title="Classes")

        scatter2 = axes[1].scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='plasma', edgecolor='k')
        axes[1].set_title(f'PCA with Cluster Labels\nARI: {ari_score:.2f}')
        axes[1].legend(*scatter2.legend_elements(), title="Clusters")

        # Operate on `fig` explicitly instead of pyplot's "current figure".
        fig.tight_layout()
        fig.savefig(save_path, dpi=300)
    finally:
        plt.close(fig)

def get_processed_datasets(results_file):
    """Return the set of values found in the 'Dataset' column of a results CSV.

    Args:
        results_file: Path of the results CSV.

    Returns:
        A set with the distinct 'Dataset' entries, or an empty set when the
        file does not exist.
    """
    # Guard clause: no results file means nothing has been recorded yet.
    if not os.path.exists(results_file):
        return set()
    recorded = pd.read_csv(results_file)
    return set(recorded['Dataset'])

# On the first run, create the results file containing only the header row;
# result rows are appended to it later without a header.
if not os.path.exists(results_file):
    empty_results = pd.DataFrame(
        columns=[
            "Dataset",
            "silhouette_score",
            "davies_bouldin_score",
            "calinski_harabasz_score",
            "adjusted_rand_score",
            "Running_Time(s)",
        ]
    )
    empty_results.to_csv(results_file, index=False)

# Datasets that already have a row in the results file.
processed_datasets = get_processed_datasets(results_file)

# Evaluate the ensemble labels of every validation dataset and append one row
# per dataset to the results file.
# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# processing order (and therefore the row order of the results) reproducible.
for dataset_name in sorted(os.listdir(validation_folder)):
    # Report the two skip reasons separately; previously non-CSV files were
    # also announced as "already processed".
    if not dataset_name.endswith(".csv"):
        print(f"Skipping {dataset_name}, not a CSV file.")
        continue
    if dataset_name in processed_datasets:
        print(f"Skipping {dataset_name}, already processed.")
        continue

    # Strip only the trailing extension (str.replace would remove every
    # occurrence of '.csv' inside the name).
    dataset_stem = dataset_name[:-len(".csv")]

    try:
        print(f"\nProcessing dataset: {dataset_name}")
        start_time = time.time()

        # Load and preprocess data: the last column is used as the reference
        # label, every other column as a feature.
        df = pd.read_csv(os.path.join(validation_folder, dataset_name))
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        # Convert categorical labels to numeric if necessary
        if y.dtype == 'object' or y.dtype.name == 'category':
            y = y.astype('category').cat.codes.to_numpy()
        else:
            y = y.to_numpy()

        # Scale numeric features (non-numeric feature columns are dropped)
        scaler = MinMaxScaler()
        X_scaled = scaler.fit_transform(X.select_dtypes(include=['number']))

        # Ensemble labels for this dataset: headerless file, flattened to 1-D.
        labels = pd.read_csv(
            f"results/ensemble/labels/final_ensemble_clustering_{dataset_stem}_labels.csv",
            header=None,
        )
        labels = labels.values.ravel()

        # Calculate metrics: three internal indices on the scaled features and
        # ARI against the reference labels.
        sil = silhouette_score(X_scaled, labels)
        dbs = davies_bouldin_score(X_scaled, labels)
        chs = calinski_harabasz_score(X_scaled, labels)
        ari = adjusted_rand_score(y, labels)
        running_time = round(time.time() - start_time, 2)

        # Save results
        results = pd.DataFrame({
            "Dataset": [dataset_name],
            "silhouette_score": [sil],
            "davies_bouldin_score": [dbs],
            "calinski_harabasz_score": [chs],
            "adjusted_rand_score": [ari],
            "Running_Time(s)": [running_time]
        })

        results.to_csv(results_file, mode="a", header=False, index=False)

        # Save PCA plot
        plot_pca_comparison(X_scaled, y, labels, ari, f"{output_folder}/{dataset_stem}_pca.png")
        print(f"Completed {dataset_name} with ARI: {ari}")

    except Exception as e:
        # Best effort: record the failure and continue with the next dataset.
        # NOTE(review): the ERROR row makes this dataset count as processed on
        # later runs, so it is not retried until the row is removed.
        print(f"Error processing {dataset_name}: {e}")
        error_results = pd.DataFrame({
            "Dataset": [dataset_name],
            "silhouette_score": ["ERROR"],
            "davies_bouldin_score": ["ERROR"],
            "calinski_harabasz_score": ["ERROR"],
            "adjusted_rand_score": ["ERROR"],
            "Running_Time(s)": [0]
        })
        error_results.to_csv(results_file, mode="a", header=False, index=False)



Processing dataset: cluto-t7-10k.csv
Completed cluto-t7-10k.csv with ARI: 0.222871433459034

Processing dataset: segment.csv
Completed segment.csv with ARI: 0.355381067671688

Processing dataset: twodiamonds.csv
Completed twodiamonds.csv with ARI: 0.08792079850107448

Processing dataset: fourty.csv
Completed fourty.csv with ARI: 0.23588052650089633

Processing dataset: disk-6000n.csv
Completed disk-6000n.csv with ARI: 0.07761973232853507

Processing dataset: wine.csv
Completed wine.csv with ARI: 0.08883855949054086

Processing dataset: balance-scale.csv
Completed balance-scale.csv with ARI: 0.07490040140879939

Processing dataset: tae.csv
Completed tae.csv with ARI: 0.017493618430659854

Processing dataset: cassini.csv
Completed cassini.csv with ARI: 0.6395557917623784

Processing dataset: pathbased.csv
Completed pathbased.csv with ARI: 0.11844729302892097

Processing dataset: iono.csv
Completed iono.csv with ARI: 0.02387477921919727

Processing dataset: flame.csv
Completed flame.csv 

---

In [None]:
import numpy as np

import ClusterEnsembles as CE

# Four example labelings of the same seven samples.
label1 = np.asarray([1, 1, 1, 2, 2, 3, 3])
label2 = np.asarray([2, 2, 2, 3, 3, 1, 1])
label3 = np.asarray([4, 4, 2, 2, 3, 3, 3])
# `np.nan` marks a missing value: this labeling does not cover every sample.
label4 = np.asarray([1, 2, np.nan, 1, 2, np.nan, np.nan])

# Stack the labelings row-wise: one row per labeling, one column per sample.
labels = np.vstack((label1, label2, label3, label4))

# Consensus step, currently disabled:
# label_ce = CE.cluster_ensembles(labels)
# print(label_ce)

[[ 1.  1.  1.  2.  2.  3.  3.]
 [ 2.  2.  2.  3.  3.  1.  1.]
 [ 4.  4.  2.  2.  3.  3.  3.]
 [ 1.  2. nan  1.  2. nan nan]]


In [None]:
import numpy as np
import random

def get_highest_confidence_level(material_count, materials):
    """
    Find the leading material class and its share of all counted instances.

    Args:
        material_count: List of counts, positionally aligned with `materials`.
        materials: Material classes, one per entry of `material_count`.

    Returns:
        A tuple `(confidence_level, material_class)`: the largest count divided
        by the sum of all counts (0 when that sum is not positive), and the
        material at the first position holding the largest count.
    """
    top_count = max(material_count)
    counted = sum(material_count)
    # list.index returns the first match, so ties resolve to the earliest class.
    leader = materials[material_count.index(top_count)]
    if counted > 0:
        return top_count / counted, leader
    return 0, leader

def confidence_level_not_unique(material_count):
    """
    Tell whether the largest count is shared by more than one entry.

    Args:
        material_count: List of counts per material class.

    Returns:
        True when at least two entries equal the maximum count, else False.
    """
    leaders = material_count.count(max(material_count))
    return leaders >= 2

def majority_voting(labels, target_confidence_level=0.8, maximum_instances=9):
    """
    This is the majority voting algorithm to determine the material class.
    Takes labels from multiple classifiers (datasets) as input.

    Instances (columns of `labels`) are processed in order. For each instance
    the most frequent non-NaN vote is counted towards its material class; the
    procedure stops as soon as the leading class holds at least
    `target_confidence_level` of all counted instances.

    Args:
        labels: Sequence of equally long label sequences, one per classifier.
            `np.nan` marks a missing vote. Votes must be integer-valued and
            one of the known materials (1-4).
        target_confidence_level: Share of counted instances the leading class
            must reach for the voting to stop early.
        maximum_instances: Currently unused.
            NOTE(review): presumably meant to cap the number of instances
            processed - confirm the intended semantics before wiring it in.

    Returns:
        The selected material class, or None when no instance received a vote
        (including an empty `labels`).

    Raises:
        ValueError: If a winning vote is not one of the known materials, or a
            vote is negative.
    """
    materials = [1, 2, 3, 4]
    material_count = [0] * len(materials)
    # Stays None when nothing is ever voted on; previously the final `return`
    # failed with an unbound local in that case.
    material_class = None

    # Without any labeling there is nothing to vote on.
    if len(labels) == 0:
        return material_class
    number_of_instances = len(labels[0])  # Assuming labels have consistent length

    # Voting procedure
    for i in range(number_of_instances):
        # Valid votes for this instance. A NaN anywhere turns the stacked
        # array into floats, and np.bincount rejects float input, so every
        # vote is cast to int here.
        votes = [
            int(label_set[i])
            for label_set in labels
            if not np.isnan(label_set[i])
        ]
        if not votes:
            continue

        # Most frequent vote; argmax resolves ties to the smallest label.
        winning_vote = int(np.bincount(votes).argmax())

        # Increment the count for the selected material class
        material_count[materials.index(winning_vote)] += 1

        # Current leader and its share of all counted instances
        confidence_level, material_class = get_highest_confidence_level(material_count, materials)

        if confidence_level >= target_confidence_level:
            if confidence_level_not_unique(material_count):
                # Several classes share the top count (only possible when the
                # target is at most 0.5): pick randomly among those tied
                # contenders rather than among all materials.
                top_count = max(material_count)
                contenders = [
                    material
                    for material, count in zip(materials, material_count)
                    if count == top_count
                ]
                material_class = random.choice(contenders)
            break

    return material_class

# Example input: four labelings of the same seven instances.
label1 = np.asarray([1, 1, 1, 2, 2, 3, 3])
label2 = np.asarray([2, 2, 2, 3, 3, 1, 1])
label3 = np.asarray([4, 4, 2, 2, 3, 3, 3])
# `np.nan` marks a missing value in the fourth labeling.
label4 = np.asarray([1, 2, np.nan, 1, 2, np.nan, np.nan])

# One row per labeling, one column per instance.
labels = np.vstack((label1, label2, label3, label4))

# Run the voting on the example and report the selected class.
final_class = majority_voting(labels)
print(f"Final material class based on majority voting: {final_class}")


Final material class based on majority voting: 1
