In [1]:
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm
import pandas as pd
import os

from libs.find_optimal_size import find_optimal_rectangle

from algorithms.gramMatrix.main import GramMatrixPipeline
from algorithms.deepinsight.main import DeepInsightPipeline
from algorithms.correlationBased.main import CorrelationPixelMappingPipeline
from algorithms.igtd.main import IGTDPipeline
from algorithms.random.main import RandomStackPipeline
from algorithms.refined.main import REFINEDPipeline
from algorithms.som.main import SOMFeatureMappingPipeline
from algorithms.supertml.main import SuperTMLPipeline
"""
All the algorithms assume that all features are numerical and continuous.
To use this notebook properly, please:
1. Make sure there are no NaN values in the dataset.
2. Make sure that, with the exception of the target column, every column is a "feature" that will be used for training.

To use this notebook, describe each dataset in DATASETS using the following format:
NOTE! PLEASE MAKE SURE THE DATASET'S FIRST COLUMN IS THE INDEX AND NOT A FEATURE!
[
    // Each dictionary is a single dataset
    {
        "path": "path/to",
        "filename": "filename.csv",
        "target_column": "target_column_name",
        "index_col": 0  // passed to pandas.read_csv as index_col
    }
]
"""
# Datasets to process; one mapping per CSV file.
#   path          - directory holding the CSV (per-algorithm output folders are created inside it)
#   filename      - CSV file name, joined onto `path` when loading
#   target_column - label column; it is split off from the feature columns
#   index_col     - forwarded to pandas.read_csv as `index_col`
DATASETS = [
    dict(
        path="datasets/sample",
        filename="Simple_Classification_Dataset.csv",
        target_column="label",
        index_col=0,
    ),
]

# Module-level MinMaxScaler instance intended for feature normalization.
# NOTE(review): nothing in the visible cells reads NORMALIZE_FUNC — confirm where
# (or whether) it is applied. The original author was unsure a shared instance is
# the right way to store it; presumably a fitted scaler keeps state between
# datasets, so a fresh instance per dataset may be safer — TODO confirm.
NORMALIZE_FUNC = MinMaxScaler()

"""
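Every algorithm pipeline class must provide the following methods:
- fit_transform_save(self, path, x, y)
- set_params(self, params) - so that the output and input sizes can be swapped in and out
Its constructor must also accept the "default_params" below plus "output_size" as keyword arguments.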
"""
# Registry of image-conversion algorithms: name -> pipeline class plus the
# keyword arguments used to construct it. Every entry ends with the same
# "random_state"/"verbose" pair, so only the algorithm-specific settings are
# listed per row and the shared pair is appended while building the mapping.
ALGORITHM_CONFIGS = {
    name: {
        "class": pipeline_cls,
        "default_params": {**specific_params, "random_state": 42, "verbose": False},
    }
    for name, pipeline_cls, specific_params in (
        ("correlationPixel", CorrelationPixelMappingPipeline, {}),
        (
            "deepinsight",
            DeepInsightPipeline,
            {
                "feature_extractor": "tsne",
                "discretization": "bin",
                "scaler": "standard",
                "img_format": "rgb",
            },
        ),
        ("gramMatrix", GramMatrixPipeline, {}),
        ("igtd", IGTDPipeline, {}),
        ("random", RandomStackPipeline, {}),
        ("refined", REFINEDPipeline, {}),
        (
            "som",
            SOMFeatureMappingPipeline,
            {
                "som_shape": (20, 20),
                "learning_rate": 0.5,
                "sigma": 1.0,
                "num_iterations": 1000,
                "scaler": "standard",
                "neighborhood_function": "gaussian",
                "topology": "rectangular",
                "activation_distance": "euclidean",
                "img_format": "rgb",
            },
        ),
        ("supertml", SuperTMLPipeline, {}),
    )
}


In [2]:
# Run every registered algorithm on every configured dataset and save the
# results under "<dataset path>/<algorithm name>/".
for dataset in tqdm(DATASETS):
    # Load the dataset. "index_col" is optional in a dataset entry: the first
    # CSV column is expected to be the index, so it defaults to 0 instead of
    # raising KeyError for entries that only give path/filename/target_column.
    path = os.path.join(dataset["path"], dataset["filename"])
    df = pd.read_csv(path, index_col=dataset.get("index_col", 0))
    X = df.drop(columns=[dataset["target_column"]])
    y = df[dataset["target_column"]]

    # Derive the output image size from the number of feature columns.
    # find_optimal_rectangle is expected to return the (width, height) whose
    # area is closest to the feature count without exceeding it — TODO confirm
    # against its implementation.
    feature_num = X.shape[1]
    width, height = find_optimal_rectangle(feature_num)

    # For each algorithm, build a pipeline instance and run fit_transform_save.
    for algorithm, algorithm_config in ALGORITHM_CONFIGS.items():
        # One output folder per algorithm, next to the dataset file.
        # exist_ok avoids the check-then-create race and makes re-runs safe.
        output_path = os.path.join(dataset["path"], algorithm)
        os.makedirs(output_path, exist_ok=True)

        # Copy the defaults so the per-dataset output size never leaks back
        # into the shared ALGORITHM_CONFIGS entry.
        config = algorithm_config["default_params"].copy()
        config["output_size"] = (width, height)
        al_instance = algorithm_config["class"](**config)

        print(f"Running {algorithm} on dataset {dataset['filename']} with output size {config['output_size']}")
        # fit, transform and save the results
        al_instance.fit_transform_save(output_path, X, y)

  0%|          | 0/1 [00:00<?, ?it/s]

Running correlationPixel on dataset Simple_Classification_Dataset.csv with output size (10, 10)
Running deepinsight on dataset Simple_Classification_Dataset.csv with output size (10, 10)
Running gramMatrix on dataset Simple_Classification_Dataset.csv with output size (10, 10)
Running igtd on dataset Simple_Classification_Dataset.csv with output size (10, 10)
Running random on dataset Simple_Classification_Dataset.csv with output size (10, 10)
Running refined on dataset Simple_Classification_Dataset.csv with output size (10, 10)
Running som on dataset Simple_Classification_Dataset.csv with output size (10, 10)
Running supertml on dataset Simple_Classification_Dataset.csv with output size (10, 10)
