In [1]:
"""Run this notebook from `<repository_root>/notebooks`"""

'Run this notebook from `<repository_root>/notebooks`'

In [1]:
import os
os.getcwd()
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

import pickle
import shutil
from typing import List
import os
import openml
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from meta_automl.data_preparation.models_loaders import KnowledgeBaseModelsLoader
from surrogate.datasets import HomogeneousPipelineDataset

In [2]:
# Root directory of the knowledge base, relative to `<repository_root>/notebooks`.
KNOWLEDGE_BASE_DIRECTORY_PATH = "../data/knowledge_base_0"
# Derived from the knowledge-base root so the two paths cannot drift apart.
# (Not referenced by the cells visible in this notebook section.)
TRAIN_TEST_SPLIT_PATH = f"{KNOWLEDGE_BASE_DIRECTORY_PATH}/train_test_datasets_classification.csv"

# Name of the output directory created under `../data`.
DATASET_DIRNAME = "openml_meta_features_and_fedot_pipelines"

In [3]:
# Dataset names that `get_dataset` skips when collecting pipelines
# (matched against the `dataset_name` column of the knowledge-base listing).
EXCLUDE_KNOWLEDGE_BASE_DATASETS = [
    "connect-4",
    "higgs",
]

In [4]:
# Metric name forwarded to the knowledge-base models loader; it must match a
# metric name stored in the knowledge base.
FITNESS_METRIC = "f1"

In [5]:
# Task type used to select datasets; `get_dataset` raises NotImplementedError
# for anything other than "classification".
TASK_TYPE = "classification"

In [6]:
# Make sure the dataset directory exists. `makedirs(..., exist_ok=True)` is
# safe to re-run and also creates `../data` itself when it is missing.
os.makedirs(os.path.join("../data", DATASET_DIRNAME), exist_ok=True)

In [11]:
def get_df_meta_features(df_datasets: pd.DataFrame):
    """Fetch the OpenML dataset listing for the given datasets.

    Args:
        df_datasets: Frame with a ``dataset_id`` column; its values are passed
            to ``openml.datasets.list_datasets``.

    Returns:
        The listing as a DataFrame with a fresh 0..n-1 index and every missing
        value replaced by -1.
    """
    listing = openml.datasets.list_datasets(
        df_datasets["dataset_id"],
        output_format="dataframe",
    )
    return listing.reset_index(drop=True).fillna(-1)

def get_df_meta_features_fitted_scaller(knowledge_base_path: str, scaler_class, split: str, task_type: str, data_columns):
    """Fit a scaler on the meta-features of the datasets in a knowledge-base split.

    (The misspelled name is kept so that existing callers keep working.)

    Args:
        knowledge_base_path: Path handed to ``KnowledgeBaseModelsLoader``.
        scaler_class: Zero-argument callable returning an object with a
            ``fit`` method (e.g. ``sklearn.preprocessing.StandardScaler``).
        split: Split name forwarded to ``parse_datasets``.
        task_type: Task type forwarded to ``parse_datasets``.
        data_columns: Meta-feature columns the scaler is fitted on.

    Returns:
        The fitted scaler instance.
    """
    models_loader = KnowledgeBaseModelsLoader(knowledge_base_path)
    df_datasets = models_loader.parse_datasets(split, task_type)
    df_meta_features = get_df_meta_features(df_datasets)

    scaler = scaler_class()
    scaler.fit(df_meta_features[data_columns])
    # Only the fitted scaler is needed here; the meta-features frame is local,
    # so transforming it would be wasted work.
    return scaler

def get_preprocessed_meta_features(df_datasets: pd.DataFrame, scaler, data_columns: List[str]):
    """Return the datasets' meta-features with ``data_columns`` scaled.

    The meta-features come from ``get_df_meta_features``; the selected columns
    are overwritten with ``scaler.transform`` of themselves, all other columns
    are returned untouched.
    """
    features = get_df_meta_features(df_datasets)
    scaled_values = scaler.transform(features[data_columns])
    features[data_columns] = scaled_values
    return features

def get_dataset(
        knowledge_base_path: str,
        scaler,
        split: str,
        dataset_dir_path: str,
        meta_features_data_columns: List[str],
        fitness_metric: str = "f1",
        task_type: str = "classification",
        exclude_datasets_names: List[str] = None,
    ):
    """Export meta-features and best pipelines of a knowledge-base split.

    Writes two CSV files into ``../data/<dataset_dir_path>/<split>``:

    * ``meta_features.csv`` - scaled meta-features (no header, no index), one
      row per dataset of the split, including excluded datasets;
    * ``task_pipe_comb_df.csv`` - one row per saved pipeline with columns
      ``task_id`` (row position in the meta-features frame), ``pipeline_id``
      and ``y`` (signed fitness).

    For every non-excluded dataset the pipelines are grouped by their string
    representation and only the one with the highest signed fitness per group
    is saved as ``pipeline_<id>.json`` into ``./temp_pipelines_dir``.

    Args:
        knowledge_base_path: Path handed to ``KnowledgeBaseModelsLoader``.
        scaler: Fitted scaler applied to ``meta_features_data_columns``.
        split: `train`/`test`.
        dataset_dir_path: Directory name under ``../data`` for the output.
        meta_features_data_columns: Meta-feature columns to scale and export.
        fitness_metric: Metric name; must exist in the knowledge base.
        task_type: `classification`, `regression`, `ts_forecasting`.
            Only `classification` is currently accepted.
        exclude_datasets_names: Dataset names to skip; ``None`` means none.

    Returns:
        A ``HomogeneousPipelineDataset`` rooted at ``./temp_pipelines_dir``.

    Raises:
        NotImplementedError: If ``task_type`` is not ``"classification"``.
        ValueError: If a dataset id matches several meta-feature rows.
        IndexError: If a dataset id has no meta-feature row.
    """
    if task_type != "classification":
        raise NotImplementedError(f"Check if this implementation is suitable for {task_type}")

    excluded_names = exclude_datasets_names if exclude_datasets_names is not None else ()

    # Presumably the knowledge base stores maximised metrics negated, hence the
    # sign flip for everything except log_loss - TODO confirm.
    # NOTE(review): with log_loss (coef = 1) the `idxmax` below keeps the
    # pipeline with the *largest* loss per group - verify this is intended.
    fitness_coef = 1 if fitness_metric == "log_loss" else -1

    # `makedirs` also creates missing parents and tolerates re-runs.
    output_dir = os.path.join("../data", dataset_dir_path, split)
    os.makedirs(output_dir, exist_ok=True)

    temp_pipelines_dir = "./temp_pipelines_dir"
    # NOTE(review): this directory is never cleaned, so pipeline_*.json files
    # from an earlier run (e.g. another split) may still be present when
    # HomogeneousPipelineDataset reads it - confirm whether a cleanup is needed.

    models_loader = KnowledgeBaseModelsLoader(knowledge_base_path)
    df_datasets = models_loader.parse_datasets(split, task_type)

    df_meta_features = get_preprocessed_meta_features(df_datasets, scaler, meta_features_data_columns)
    df_meta_features.to_csv(
        os.path.join(output_dir, "meta_features.csv"),
        columns=meta_features_data_columns,
        header=False,
        index=False,
    )

    pipeline_id = 0
    records = []
    for _, dataset in df_datasets.iterrows():
        dataset_name = dataset["dataset_name"]
        if dataset_name in excluded_names:
            continue

        # task_id is the row label of this dataset in the meta-features frame.
        dataset_id = dataset["dataset_id"]
        matching_rows = df_meta_features.loc[df_meta_features["did"] == dataset_id].index
        if len(matching_rows) > 1:
            raise ValueError(f"Dataset id is not unique: {dataset_id}")
        if len(matching_rows) == 0:
            raise IndexError(f"Dataset id is missing from meta-features: {dataset_id}")
        task_id = matching_rows[0]

        dataset_models = models_loader.load(
            dataset_names=[dataset_name],  # load models just for this exact dataset.
            fitness_metric=fitness_metric,  # must correspond to a metric name in a knowledge base.
        )
        data = [(str(x.predictor), fitness_coef * x.fitness.value) for x in dataset_models]
        if not data:
            # Nothing stored for this dataset; skip it instead of grouping an empty frame.
            continue

        # Keep a single representative (highest signed fitness) per distinct pipeline.
        temp_df = pd.DataFrame(data=data, columns=["predictor", "fitness"])
        best_unique_pipelines_indexes = temp_df.groupby("predictor")["fitness"].idxmax().to_list()

        for index in best_unique_pipelines_indexes:
            model = dataset_models[index]
            y = fitness_coef * model.fitness.value

            model.predictor.save(os.path.join(temp_pipelines_dir, f"pipeline_{pipeline_id}.json"))
            records.append({"task_id": task_id, "pipeline_id": pipeline_id, "y": y})
            pipeline_id += 1

    # Fixed columns keep the CSV header stable even when no pipeline was collected.
    task_pipe_comb_df = pd.DataFrame(records, columns=["task_id", "pipeline_id", "y"])
    task_pipe_comb_df.to_csv(
        os.path.join(output_dir, "task_pipe_comb_df.csv"),
        header=True,
        index=True,
    )

    dataset = HomogeneousPipelineDataset(
        root=temp_pipelines_dir,
        direction="directed",
        use_operations_hyperparameters=False,
        overriden_processed_dir=None,
    )
    return dataset

In [12]:
# Columns of the OpenML dataset listing used as meta-features: the scaler is
# fitted on them and they are the only columns written to `meta_features.csv`
# (in this order, without a header).
meta_features_data_columns = [
    "MajorityClassSize",
    "MaxNominalAttDistinctValues",
    "MinorityClassSize",
    "NumberOfClasses",
    "NumberOfFeatures",
    "NumberOfInstances",
    "NumberOfInstancesWithMissingValues",
    "NumberOfMissingValues",
    "NumberOfNumericFeatures",
    "NumberOfSymbolicFeatures"
]

In [13]:
# Fit the meta-feature scaler on the datasets of the "all" split.
# NOTE(review): the scaler is fitted on split="all" but applied to "train"
# below; if "all" also contains the test datasets, their statistics leak into
# the training features - confirm this is intended.
meta_features_scaller = get_df_meta_features_fitted_scaller(
    KNOWLEDGE_BASE_DIRECTORY_PATH, 
    StandardScaler, 
    split="all",
    task_type=TASK_TYPE,
    data_columns=meta_features_data_columns,
)

# Export the train split (meta-features CSV, task/pipeline table, pipeline
# JSON files) and build the pipeline dataset from the saved pipelines.
dset = get_dataset(
    KNOWLEDGE_BASE_DIRECTORY_PATH,
    scaler=meta_features_scaller,
    split="train",
    dataset_dir_path=DATASET_DIRNAME,
    meta_features_data_columns=meta_features_data_columns,
    fitness_metric=FITNESS_METRIC,
    task_type=TASK_TYPE,
    exclude_datasets_names=EXCLUDE_KNOWLEDGE_BASE_DATASETS,
)