In [None]:
# Dataset split: 118 datasets for meta-training, 27 datasets for validation.

# TODO:
#   - update poac and package it
#   - list poac in the requirements so it gets installed
#   - copy and paste tpe-autoclust
#   - run experiments


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import pandas as pd
from scipy.io import arff

def is_classification_dataset(df):
    """Return True when ``df`` has a class-label column.

    A column counts as the label when its name equals ``"class"`` ignoring
    letter case, so ``"CLASS"``, ``"class"`` and ``"Class"`` all match, as do
    mixed-case spellings such as ``"cLass"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are inspected; the cell values are not read.

    Returns
    -------
    bool
        True if at least one column is named "class" (case-insensitive).
    """
    # str() keeps the check safe for non-string column labels (e.g. integers).
    return any(str(col).lower() == "class" for col in df.columns)

def convert_arff_to_csv(input_dir, output_dir):
    """Convert the classification ARFF files in ``input_dir`` to CSV files.

    Each ``*.arff`` file is loaded with ``scipy.io.arff.loadarff``. Files whose
    table passes ``is_classification_dataset`` are written to ``output_dir``
    under the same base name with a ``.csv`` extension; the others are
    reported and skipped. A failure on one file is printed and does not stop
    the remaining files from being processed.

    Parameters
    ----------
    input_dir : str
        Directory scanned (non-recursively) for ``.arff`` files.
    output_dir : str
        Directory that receives the CSV files; created if it is missing.

    Returns
    -------
    None
    """
    arff_suffix = ".arff"

    # exist_ok avoids the separate existence check and tolerates re-runs.
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the processing order (and the printed log) is reproducible;
    # os.listdir returns entries in arbitrary order.
    for file_name in sorted(os.listdir(input_dir)):
        if not file_name.endswith(arff_suffix):
            continue

        input_path = os.path.join(input_dir, file_name)
        # Swap only the trailing extension: str.replace would also rewrite an
        # earlier ".arff" occurring inside the file name.
        csv_name = file_name[: -len(arff_suffix)] + ".csv"
        output_path = os.path.join(output_dir, csv_name)

        try:
            data, _meta = arff.loadarff(input_path)
            df = pd.DataFrame(data)

            # Only datasets carrying a class column are exported.
            if not is_classification_dataset(df):
                print(f"Skipped (not classification): {file_name}")
                continue

            # Object columns may hold byte strings; decode them so the CSV
            # holds plain text rather than b'...' representations.
            for col in df.select_dtypes(["object"]):
                df[col] = df[col].apply(
                    lambda x: x.decode('utf-8') if isinstance(x, bytes) else x
                )

            df.to_csv(output_path, index=False)
            print(f"Converted: {file_name} -> {output_path}")
        except Exception as e:
            # Best-effort batch conversion: report the failure and move on.
            print(f"Error processing {file_name}: {e}")

# Source folder holding the validation ARFF files, and the destination
# folder that receives their CSV counterparts.
input_directory = "datasets/validation/synthetic/"
output_directory = "datasets/validation_csv"

# Convert every classification ARFF file found in the source folder.
convert_arff_to_csv(input_dir=input_directory, output_dir=output_directory)


Converted: sizes2.arff -> datasets/validation_csv/sizes2.csv
Converted: 3-spiral.arff -> datasets/validation_csv/3-spiral.csv
Converted: aggregation.arff -> datasets/validation_csv/aggregation.csv
Converted: compound.arff -> datasets/validation_csv/compound.csv
Converted: cluto-t8-8k.arff -> datasets/validation_csv/cluto-t8-8k.csv
Converted: pathbased.arff -> datasets/validation_csv/pathbased.csv
Converted: sizes4.arff -> datasets/validation_csv/sizes4.csv
Converted: cluto-t7-10k.arff -> datasets/validation_csv/cluto-t7-10k.csv
Converted: R15.arff -> datasets/validation_csv/R15.csv
Converted: jain.arff -> datasets/validation_csv/jain.csv


---

## TPOT - SIL


In [None]:
import os
import pandas as pd
from tpot import TPOTClustering
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
import numpy as np
import time

# Configuration
# Folder holding the validation CSVs. The ARFF conversion cell of this
# notebook writes them to "datasets/validation_csv", so the experiments must
# read from that same location.
validation_folder = "datasets/validation_csv"
# CSV file that accumulates one result row per dataset.
results_file = "tpot_clustering_results.csv"
# Settings forwarded to TPOTClustering.
population_size = 50
verbosity = 2
random_state = 42

# Function to get already processed datasets
def get_processed_datasets(results_file):
    """Return the dataset names already recorded in the results CSV.

    Parameters
    ----------
    results_file : str
        Path to the results CSV; its ``Dataset`` column holds the names.

    Returns
    -------
    set
        Values of the ``Dataset`` column, or an empty set when the file does
        not exist or is completely empty (zero bytes, no header).

    Raises
    ------
    KeyError
        If the file has content but no ``Dataset`` column.
    """
    if not os.path.exists(results_file):
        return set()

    try:
        df = pd.read_csv(results_file)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse: nothing was recorded yet.
        return set()

    return set(df['Dataset'])

# Initialize the results file
# The header is (re)written when the file is missing or has no content, so
# rows appended later always sit under the expected column names.
result_columns = ["Dataset", "Best_Pipeline", "Silhouette_Score", "Running_Time(s)"]
if not os.path.exists(results_file) or os.path.getsize(results_file) == 0:
    pd.DataFrame(columns=result_columns).to_csv(results_file, index=False)

# NOTE: datasets that previously failed are stored with "ERROR" rows and are
# therefore also treated as processed; remove those rows to retry them.
processed_datasets = get_processed_datasets(results_file)

# Iterate over datasets in the validation folder
# Sorted for a reproducible run order (os.listdir order is arbitrary).
for dataset_name in sorted(os.listdir(validation_folder)):
    # Report the two skip reasons separately so the log is not misleading.
    if not dataset_name.endswith(".csv"):
        print(f"Skipping {dataset_name}, not a CSV file.")
        continue
    if dataset_name in processed_datasets:
        print(f"Skipping {dataset_name}, already processed.")
        continue

    try:
        print(f"\nProcessing dataset: {dataset_name}")
        # The timer covers loading, scaling, optimisation and scoring.
        start_time = time.time()

        # Load and preprocess dataset
        df = pd.read_csv(os.path.join(validation_folder, dataset_name))
        # All columns except the last; assumes the class label is the final
        # column of every CSV — TODO confirm for all datasets.
        X = df.iloc[:, :-1]

        # Scale numeric data (non-numeric feature columns are dropped here)
        scaler = MinMaxScaler()
        X_scaled = scaler.fit_transform(X.select_dtypes(include=['number']))

        # Initialize and run TPOT optimization
        tpot_clustering = TPOTClustering(
            population_size=population_size,
            verbosity=verbosity,
            random_state=random_state,
            crossover_rate=0.05,
            mutation_rate=0.9,
            max_time_mins=10,
            max_eval_time_mins=1.0,
            scoring="silhouette_score"
        )

        tpot_clustering.fit(X_scaled)
        labels = tpot_clustering.predict(X_scaled)
        silhouette = silhouette_score(X_scaled, labels)

        end_time = time.time()
        running_time = round(end_time - start_time, 2)

        # Save results
        results = pd.DataFrame({
            "Dataset": [dataset_name],
            "Best_Pipeline": [tpot_clustering.fitted_pipeline_],
            "Silhouette_Score": [silhouette],
            "Running_Time(s)": [running_time]
        })

        # Append immediately so an interrupted run keeps finished datasets.
        results.to_csv(results_file, mode="a", header=False, index=False)
        print(f"Completed {dataset_name} with Silhouette Score: {silhouette}")

    except Exception as e:
        # Best-effort batch run: log the failure, record a placeholder row,
        # and continue with the next dataset.
        print(f"Error processing {dataset_name}: {e}")
        error_results = pd.DataFrame({
            "Dataset": [dataset_name],
            "Best_Pipeline": ["ERROR"],
            "Silhouette_Score": ["ERROR"],
            "Running_Time(s)": [0]
        })
        error_results.to_csv(results_file, mode="a", header=False, index=False)


---

## TPOT - DBS


---

## TPOT - CHS


---

## TPOT - Ensemble


---

## TPOT - Surrogate
