In [None]:
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import json
import pandas as pd
import autokeras as ak

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from typing_extensions import override

"""
The datasets folder must follow this structure:
datasets/
   <FILE_NAME>.csv # where the main tabular data is stored
   /<ALGORITHM_NAME_1>/ # where the algorithm specific data is stored
       key.csv # where the key for the images is stored
       <IMAGE_NAME>.json # the images are stored as an array
   /<ALGORITHM_NAME_2>/ # where the algorithm specific data is stored
       ...

The pipeline then runs the <FILE_NAME>.csv file through a number of traditional models
and algorithms, while the converted images are run through an AutoKeras CNN pipeline.

All results are then saved in
results/<DATASET_NAME>/<EXPERIMENT_NAME>/
The following files are stored:
- results.csv # contains the metrics for all the algorithms tested (final results)
- <ALGORITHM_NAME>.pkl # contains the trained model for the algorithm
"""
# Datasets to run through the pipeline. Each entry names the dataset folder
# ("path"), the main tabular CSV inside that folder ("filename"), and the
# column passed on as the classification target ("target_column").
DATASETS = [
    dict(
        path="datasets/sample",
        filename="Simple_Classification_Dataset.csv",
        target_column="label",
    ),
]

# Traditional (non-image) classifiers, keyed by a short display name. Each
# entry pairs an estimator class with the keyword arguments configured for it;
# the whole mapping is handed to tabular_testing as `model_configs`.
# NOTE(review): presumably each model is built as
# entry["class"](**entry["params"]) -- confirm in
# libs.traditional_classification_pipeline.
TRADITIONAL_MODELS = {
    "RF": {
        "class": RandomForestClassifier,
        "params": dict(n_estimators=100, verbose=0),
    },
    "SVM": {
        "class": SVC,
        "params": dict(kernel="rbf", C=1.0, gamma="scale", verbose=0),
    },
}

# Keyword arguments unpacked (via **) into the autokeras_cnn_pipeline call.
AUTOKERAS_CONFIGS = dict(
    max_trials=10,  # number of different models to try
    epochs=10,  # number of epochs to train each model
    tuner="bayesian",  # type of tuner to use
    objective="val_accuracy",  # objective to optimize
    seed=42,  # random seed for reproducibility
)

In [None]:
from libs.autokeras_classification import autokeras_cnn_pipeline
from libs.traditional_classification_pipeline import tabular_testing
import os
from datetime import datetime

# Run every configured dataset through both pipelines. Each run writes into its
# own timestamped folder under results/<dataset name>/, so a new run does not
# reuse the folder of an earlier one (timestamps have one-second resolution).
for dataset in DATASETS:
    # normpath strips a trailing separator, so "datasets/sample/" still yields
    # "sample" instead of an empty name (which would send every such dataset
    # into the same "results//<timestamp>" folder).
    dataset_name = os.path.basename(os.path.normpath(dataset["path"]))
    timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
    output_folder = f"results/{dataset_name}/{timestamp}"
    os.makedirs(output_folder, exist_ok=True)

    # Single location of the metrics CSV; it is written twice below and also
    # passed to the AutoKeras stage.
    results_path = f"{output_folder}/results.csv"

    # Stage 1: the traditional models, run on the main tabular CSV.
    results = tabular_testing(
        file_name=os.path.join(dataset["path"], dataset["filename"]),
        target_column=dataset["target_column"],
        model_configs=TRADITIONAL_MODELS,
    )
    # Write the tabular metrics right away so they are on disk before the
    # AutoKeras stage starts.
    results.to_csv(results_path, index=False)

    # Stage 2: the AutoKeras CNN pipeline, given the dataset folder and the
    # metrics collected so far.
    # NOTE(review): presumably it walks the per-algorithm image sub-folders and
    # appends its metrics to `all_result_df` -- confirm in
    # libs.autokeras_classification.
    results = autokeras_cnn_pipeline(
        dataset["path"],
        output_folder=output_folder,
        all_result_df=results,
        all_result_df_path=results_path,
        **AUTOKERAS_CONFIGS,
    )

    # Final write with whatever the AutoKeras stage returned.
    results.to_csv(results_path, index=False)
print("DONE!!!")