In [None]:
# 118 datasets are used for meta-training
# 27 datasets are used for validation

# TODO:
#   - update poac and package it
#   - add poac to the requirements so it gets installed
#   - copy and paste tpe-autoclust
#   - run the experiments


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import pandas as pd
from scipy.io import arff

def is_classification_dataset(df):
    """Return True when ``df`` has a column named ``CLASS``, ``class`` or ``Class``.

    Only these three exact spellings are recognised; any other column name
    (including other capitalisations) does not count.
    """
    for label in ("CLASS", "class", "Class"):
        # Membership is tested against the column index, one spelling at a time.
        if label in df.columns:
            return True
    return False

def convert_arff_to_csv(input_dir, output_dir):
    """Convert the ARFF files in ``input_dir`` that have a class column to CSV.

    Every directory entry whose name ends in ``.arff`` is loaded with
    ``arff.loadarff`` and turned into a DataFrame. Files for which
    ``is_classification_dataset`` is true are written to ``output_dir`` as
    ``<name without .arff>.csv`` (without the index column); the others are
    skipped. One status line is printed for each ``.arff`` file.

    Args:
        input_dir: Directory whose entries are scanned for ``.arff`` files
            (subdirectories are not descended into).
        output_dir: Destination directory; created, together with any
            missing parents, when it does not exist yet.

    Returns:
        None.

    Note:
        Conversion is best-effort: any exception raised while handling a
        single file is caught and reported with ``print``, and processing
        continues with the next file.
    """
    arff_suffix = ".arff"

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    for file_name in os.listdir(input_dir):
        if not file_name.endswith(arff_suffix):
            continue

        input_path = os.path.join(input_dir, file_name)
        # Swap only the trailing extension: str.replace would also rewrite a
        # ".arff" that occurs earlier in the name ("a.arff.v2.arff").
        csv_name = file_name[: -len(arff_suffix)] + ".csv"
        output_path = os.path.join(output_dir, csv_name)

        try:
            data, _meta = arff.loadarff(input_path)
            df = pd.DataFrame(data)

            if not is_classification_dataset(df):
                print(f"Skipped (not classification): {file_name}")
                continue

            # Object columns may hold bytes values; decode those as UTF-8 and
            # leave every other value untouched.
            for col in df.select_dtypes(["object"]):
                df[col] = df[col].apply(
                    lambda x: x.decode('utf-8') if isinstance(x, bytes) else x
                )

            df.to_csv(output_path, index=False)
            print(f"Converted: {file_name} -> {output_path}")
        except Exception as e:
            # Deliberately broad: one unreadable file must not stop the batch.
            print(f"Error processing {file_name}: {e}")

# Where the ARFF files are read from and where the CSV files are written
input_directory = "datasets/validation/synthetic/"
output_directory = "datasets/validation_csv"

# Convert every qualifying ARFF file found in the input directory
convert_arff_to_csv(input_dir=input_directory, output_dir=output_directory)


Converted: sizes2.arff -> datasets/validation_csv/sizes2.csv
Converted: 3-spiral.arff -> datasets/validation_csv/3-spiral.csv
Converted: aggregation.arff -> datasets/validation_csv/aggregation.csv
Converted: compound.arff -> datasets/validation_csv/compound.csv
Converted: cluto-t8-8k.arff -> datasets/validation_csv/cluto-t8-8k.csv
Converted: pathbased.arff -> datasets/validation_csv/pathbased.csv
Converted: sizes4.arff -> datasets/validation_csv/sizes4.csv
Converted: cluto-t7-10k.arff -> datasets/validation_csv/cluto-t7-10k.csv
Converted: R15.arff -> datasets/validation_csv/R15.csv
Converted: jain.arff -> datasets/validation_csv/jain.csv
