In [3]:
import os

import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

def _score_per_module(y_true, y_pred, modules):
    """Return one ``(accuracy, macro-F1)`` pair per module for a single fold.

    ``y_true`` is the fold's label frame, indexed by module name.  ``y_pred``
    is the 2-D multi-output prediction whose columns are in the same order as
    ``modules`` (i.e. the column order of the frame the estimator was fit on).
    """
    y_pred = np.asarray(y_pred)
    pairs = []
    for pos, name in enumerate(modules):
        truth = y_true[name].tolist()
        guess = y_pred[:, pos].tolist()
        pairs.append((
            accuracy_score(truth, guess),
            f1_score(truth, guess, average='macro'),
        ))
    return pairs


def classification_perf2vec(clf, clf_name, modAlgo, log_mode):
    """Cross-validate multi-output module prediction and write the scores to CSV.

    For every (dimension, budget) pair the rows of the perf2vec table whose
    index label matches ``_<budget>_<dim>`` are selected, joined with the
    configuration grid on the integer parsed from the first ``_``-separated
    token of the label, and used in a 10-fold cross-validation.  ``clf`` is
    compared against a most-frequent ``DummyClassifier`` baseline; the mean
    accuracy and macro-F1 over the folds are recorded per module.

    Args:
        clf: Estimator with ``fit``/``predict`` that supports multi-output
            targets.  It is refit on every fold.
        clf_name: Short tag embedded in the output file name.
        modAlgo: ``'modCMA'`` selects the CMA module list; any other value
            selects the DE module list.  Also used in the input/output paths.
        log_mode: Suffix of the perf2vec CSV file name to load.

    Returns:
        None.  The result table (columns ``module, dim, budget, acc, f1,
        acc_d, f1_d``) is written to
        ``./results/<modAlgo>/classification_p2v_<clf_name>_MLC.csv`` and each
        module name is printed once per (dimension, budget) pair.
    """
    dims = [5, 30]
    budget_factors = [50, 100, 300, 500, 1000, 1500]
    if modAlgo == 'modCMA':
        modules = ['elitist', 'mirrored', 'base_sampler', 'weights_option', 'local_restart', 'step_size_adaptation']
    else:
        modules = ['mutation_base','mutation_reference','mutation_n_comps','use_archive','crossover','adaptation_method','lpsr']
    df = pd.read_csv(f'./data/classification_data/{modAlgo}_conf_perf2vec_{log_mode}.csv', index_col = 0)
    df_grid = pd.read_csv(f'./data/raw_data/{modAlgo}_conf_grid.csv', index_col=0)

    columns = ['module', 'dim', 'budget', 'acc', 'f1', 'acc_d', 'f1_d']
    score_keys = ('acc', 'f1', 'acc_d', 'f1_d')
    data = []
    for dim in dims:
        for budget in [factor * dim for factor in budget_factors]:
            variable = f'.*_{budget}_{dim}'
            df_sub = df[df.index.str.contains(fr'\b{variable}\b', regex=True)]
            df_sub.index = [int(label.split("_")[0]) for label in df_sub.index]
            df_sub = df_sub.join(df_grid)
            df_sub = df_sub.replace({np.nan: 'N'})

            # Select the targets by name rather than by trailing position so
            # that prediction column ``i`` always corresponds to ``modules[i]``
            # regardless of the column order (or extra columns) of the grid.
            y = df_sub[modules]
            X = df_sub.drop(columns=modules)

            # Recode boolean / small-integer labels as strings once, up front;
            # the mapping is element-wise, so doing it per fold was redundant.
            y = y.replace({True: "true", False: "false"})
            y = y.replace({1: "one", 2: "two"})

            kf = KFold(n_splits=10, shuffle=True, random_state=42)
            dummy = DummyClassifier(strategy='most_frequent')

            # scores[key][i] collects the per-fold values for modules[i].
            scores = {key: [[] for _ in modules] for key in score_keys}
            for train_index, test_index in kf.split(X):
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]

                clf.fit(X_train, y_train)
                clf_pairs = _score_per_module(y_test, clf.predict(X_test), modules)
                for pos, (acc, f1) in enumerate(clf_pairs):
                    scores['acc'][pos].append(acc)
                    scores['f1'][pos].append(f1)

                dummy.fit(X_train, y_train)
                dummy_pairs = _score_per_module(y_test, dummy.predict(X_test), modules)
                for pos, (acc_dummy, f1_dummy) in enumerate(dummy_pairs):
                    scores['acc_d'][pos].append(acc_dummy)
                    scores['f1_d'][pos].append(f1_dummy)

            for pos, name in enumerate(modules):
                fold_means = [np.mean(scores[key][pos]) for key in score_keys]
                data.append([name, dim, budget, *fold_means])
                print(name)

    df_class = pd.DataFrame(data=data, columns=columns)
    # Create the target directory if needed so a missing folder does not
    # discard the results after the whole cross-validation has already run.
    out_dir = f'./results/{modAlgo}'
    os.makedirs(out_dir, exist_ok=True)
    df_class.to_csv(f'{out_dir}/classification_p2v_{clf_name}_MLC.csv')




# Evaluate one seeded random forest on both modular algorithm families,
# using the 'log' variant of the perf2vec data in each case.
clf = RandomForestClassifier(random_state=42)
for algo_name in ('modCMA', 'modDE'):
    classification_perf2vec(clf, 'RF', algo_name, 'log')


elitist
mirrored
base_sampler
weights_option
local_restart
step_size_adaptation
elitist
mirrored
base_sampler
weights_option
local_restart
step_size_adaptation
elitist
mirrored
base_sampler
weights_option
local_restart
step_size_adaptation
elitist
mirrored
base_sampler
weights_option
local_restart
step_size_adaptation
elitist
mirrored
base_sampler
weights_option
local_restart
step_size_adaptation
elitist
mirrored
base_sampler
weights_option
local_restart
step_size_adaptation
elitist
mirrored
base_sampler
weights_option
local_restart
step_size_adaptation
elitist
mirrored
base_sampler
weights_option
local_restart
step_size_adaptation
elitist
mirrored
base_sampler
weights_option
local_restart
step_size_adaptation
elitist
mirrored
base_sampler
weights_option
local_restart
step_size_adaptation
elitist
mirrored
base_sampler
weights_option
local_restart
step_size_adaptation
elitist
mirrored
base_sampler
weights_option
local_restart
step_size_adaptation
mutation_base
mutation_reference
mutatio