In [None]:
!pip install catboost
!pip install scikit-optimize
!pip install openml

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.5.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.5.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.5.0 scikit-optimize-0.10.2
Collecting openml
  Downloading openml-0.15.1-py3-none-

In [None]:
# Блок 1. Импорт библиотек и загрузка датасетов
import openml
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from itertools import product
import os

# Для Colab: pip install catboost openml
# !pip install catboost openml

# Список ID regression датасетов OpenML (не слишком большие, чтобы Colab вывез)
dataset_ids = [42362, 3945, 46783, 45017]  # Примерные ID для regression задач

datasets = {}
for did in dataset_ids:
    d = openml.datasets.get_dataset(did)
    X, y, _, _ = d.get_data(target=d.default_target_attribute)
    datasets[did] = (X, y)

# Сетка гиперпараметров CatBoost (пример большого пространства)
param_grid = {
    'iterations': [200, 400, 700, 1000],
    'learning_rate': [0.01, 0.03, 0.1, 0.2],
    'depth': [4, 6, 8, 10],
    'l2_leaf_reg': [1, 3, 5, 7, 10, 15],
    'bagging_temperature': [0, 0.5, 1, 3],
    'border_count': [32, 64, 128, 254],
    'grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide']
}

# Ограничения совместимости параметров (пример для CatBoost)
def check_params(params):
    if params['grow_policy'] == 'SymmetricTree' and params['depth'] > 8:
        return False
    if params['grow_policy'] in ['Depthwise', 'Lossguide'] and params['depth'] < 6:
        return False
    return True

print("Datasets и сетка параметров готовы.")


  pd.factorize(type_)[0]


Datasets и сетка параметров готовы.


In [None]:
# Блок 2. Реализация обычного DE для гиперпараметров CatBoost

from scipy.optimize import differential_evolution

# Преобразование сетки в список значений для индексации
param_keys = list(param_grid.keys())
param_values = [param_grid[k] for k in param_keys]

def param_from_vector(vec):
    params = {}
    for i, v in enumerate(vec):
        params[param_keys[i]] = param_values[i][int(round(v))]
    return params

bounds = [(0, len(v) - 1) for v in param_values]

def de_objective(vec, X_train, X_val, y_train, y_val):
    params = param_from_vector(vec)
    if not check_params(params):
        return 1e10  # Плохое решение
    model = CatBoostRegressor(**params, task_type="GPU", verbose=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return mean_squared_error(y_val, preds)

def run_de(X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    result = differential_evolution(
        de_objective,
        bounds=bounds,
        args=(X_train, X_val, y_train, y_val),
        maxiter=5, popsize=5, polish=False, tol=0.01
    )
    best_params = param_from_vector(result.x)
    best_score = result.fun
    return best_params

print("DE готов к запуску.")

DE готов к запуску.


In [None]:
# Блок 3. Байесовская оптимизация с skopt

from skopt import gp_minimize
from skopt.space import Integer, Real, Categorical

skopt_space = []
for k, vals in param_grid.items():
    if isinstance(vals[0], int):
        skopt_space.append(Integer(min(vals), max(vals), name=k))
    elif isinstance(vals[0], float):
        skopt_space.append(Real(min(vals), max(vals), name=k))
    else:
        skopt_space.append(Categorical(vals, name=k))

def skopt_objective(params, X_train, X_val, y_train, y_val):
    params = {k: v for k, v in zip(param_keys, params)}
    #print(params)
    if not check_params(params):
        return 1e10
    model = CatBoostRegressor(**params, task_type="GPU", verbose=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return mean_squared_error(y_val, preds)

def run_bayesopt(X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    result = gp_minimize(
        lambda params: skopt_objective(params, X_train, X_val, y_train, y_val),
        skopt_space,
        n_calls=10,
        random_state=42
    )
    best_params = {k: v for k, v in zip(param_keys, result.x)}
    best_score = result.fun
    return best_params

print("Байесовская оптимизация готова к запуску.")


Байесовская оптимизация готова к запуску.


In [None]:
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import random

def amfeb_classic(
    X, y,
    param_grid,
    fidelity_values=[100, 400, 1000],
    pop_size=5,
    max_iters=5,
    random_state=42
):
    random.seed(random_state)
    np.random.seed(random_state)

    param_keys = list(param_grid.keys())
    param_values = [param_grid[k] for k in param_keys]
    n_params = len(param_keys)
    bounds = [(0, len(v) - 1) for v in param_values]

    def param_from_vector(vec):
        params = {}
        for i, v in enumerate(vec):
            params[param_keys[i]] = param_values[i][int(round(v))]
        return params

    def check_params(params):
        if params['grow_policy'] == 'SymmetricTree' and params['depth'] > 8:
            return False
        if params['grow_policy'] in ['Depthwise', 'Lossguide'] and params['depth'] < 6:
            return False
        return True

    # Split for validation
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=random_state)

    # Initial random population
    population = [np.array([random.randint(0, len(v) - 1) for v in param_values]) for _ in range(pop_size)]
    scores = [np.inf] * pop_size

    # Main optimization loop
    for it in range(max_iters):
        print(f"iteration: {it}")
        # Выбор fidelity (напр., low -> high)
        fid = fidelity_values[min(it // (max_iters // len(fidelity_values)), len(fidelity_values) - 1)]

        for idx, vec in enumerate(population):
            params = param_from_vector(vec)
            params['iterations'] = int(fid)
            if not check_params(params):
                scores[idx] = 1e10
                continue
            model = CatBoostRegressor(**params, task_type="GPU", verbose=0)
            model.fit(X_train, y_train)
            preds = model.predict(X_val)
            scores[idx] = mean_squared_error(y_val, preds)

        # Эволюционное обновление (дифференциальная эволюция "rand/1")
        for i in range(pop_size):
            idxs = list(range(pop_size))
            idxs.remove(i)
            a, b, c = population[random.choice(idxs)], population[random.choice(idxs)], population[random.choice(idxs)]
            mutant = np.clip(a + 0.5 * (b - c), 0, [len(v) - 1 for v in param_values])
            cross_points = np.random.rand(n_params) < 0.7
            trial = np.where(cross_points, mutant, population[i])
            params_trial = param_from_vector(trial)
            params_trial['iterations'] = int(fid)
            if not check_params(params_trial):
                continue
            model = CatBoostRegressor(**params_trial, task_type="GPU", verbose=0)
            model.fit(X_train, y_train)
            preds = model.predict(X_val)
            trial_score = mean_squared_error(y_val, preds)
            if trial_score < scores[i]:
                population[i] = trial
                scores[i] = trial_score

    # Return best params
    best_idx = np.argmin(scores)
    best_params = param_from_vector(population[best_idx])
    best_params['iterations'] = int(fidelity_values[-1])
    return best_params


In [None]:
import numpy as np
import random

from catboost import CatBoostRegressor
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def amfeb_pca_surrogate(
    X, y,
    param_grid,
    fidelity_values=[100, 400, 1000],
    pop_size=5,
    max_iters=5,
    k=1,
    pca_n_components=20,
    random_state=42
):
    random.seed(random_state)
    np.random.seed(random_state)

    param_keys = list(param_grid.keys())
    param_values = [param_grid[k] for k in param_keys]
    n_params = len(param_keys)
    bounds = [(0, len(v) - 1) for v in param_values]

    def param_from_vector(vec):
        params = {}
        for i, v in enumerate(vec):
            params[param_keys[i]] = param_values[i][int(round(v))]
        return params

    def check_params(params):
        if params['grow_policy'] == 'SymmetricTree' and params['depth'] > 8:
            return False
        if params['grow_policy'] in ['Depthwise', 'Lossguide'] and params['depth'] < 6:
            return False
        return True

    surrogate_X = []
    surrogate_y = []
    surrogate_trained = False

    pca = PCA(n_components=min(pca_n_components, X.shape[1]))
    X_pca = pca.fit_transform(X)
    X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=random_state)

    population = [np.array([random.randint(0, len(v) - 1) for v in param_values]) for _ in range(pop_size)]
    scores = [np.inf] * pop_size

    surrogate = RandomForestRegressor(n_estimators=20, random_state=random_state)

    for it in range(max_iters):
        print(f"iteration: {it}")
        if it % k == 0 and it != 0:
            pca = PCA(n_components=min(pca_n_components, X.shape[1]))
            X_pca = pca.fit_transform(X)
            X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=random_state)
            if surrogate_X:
                surrogate.fit(surrogate_X, surrogate_y)
                surrogate_trained = True

        fid = fidelity_values[min(it // (max_iters // len(fidelity_values)), len(fidelity_values) - 1)]

        for idx, vec in enumerate(population):
            params = param_from_vector(vec)
            params['iterations'] = int(fid)
            if not check_params(params):
                scores[idx] = 1e10
                continue
            model = CatBoostRegressor(**params, task_type="GPU", verbose=0)
            model.fit(X_train, y_train)
            preds = model.predict(X_val)
            score = mean_squared_error(y_val, preds)
            scores[idx] = score
            surrogate_X.append(list(vec) + [fid])
            surrogate_y.append(score)
            if len(surrogate_X) > 20 and it % k == 0:
                surrogate.fit(surrogate_X, surrogate_y)
                surrogate_trained = True

        for i in range(pop_size):
            idxs = list(range(pop_size))
            idxs.remove(i)
            a, b, c = population[random.choice(idxs)], population[random.choice(idxs)], population[random.choice(idxs)]
            mutant = np.clip(a + 0.5 * (b - c), 0, [len(v) - 1 for v in param_values])
            cross_points = np.random.rand(n_params) < 0.7
            trial = np.where(cross_points, mutant, population[i])
            params_trial = param_from_vector(trial)
            params_trial['iterations'] = int(fid)
            if not check_params(params_trial):
                continue
            if surrogate_trained:
                pred_score = surrogate.predict([list(trial) + [fid]])[0]
            else:
                model = CatBoostRegressor(**params_trial, task_type="GPU", verbose=0)
                model.fit(X_train, y_train)
                preds = model.predict(X_val)
                pred_score = mean_squared_error(y_val, preds)
            if pred_score < scores[i]:
                population[i] = trial
                scores[i] = pred_score
                if surrogate_trained:
                    surrogate_X.append(list(trial) + [fid])
                    surrogate_y.append(pred_score)

    best_idx = np.argmin(scores)
    best_params = param_from_vector(population[best_idx])
    best_params['iterations'] = int(fidelity_values[-1])
    return best_params


In [None]:
import time
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
import json

def eval_metrics(y_true, y_pred):
    return [
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred)
    ]

# ==== Параметры сетки CatBoost и fidelity ====
param_grid = {
    'learning_rate': [0.01, 0.03, 0.1, 0.2],
    'depth': [4, 6, 8, 10],
    'l2_leaf_reg': [1, 3, 5, 7, 10, 15],
    'bagging_temperature': [0, 0.5, 1, 3],
    'border_count': [32, 64, 128, 254],
    'grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide']
}
fidelity_values = [100, 400, 1000]

# ==== Словарь методов ====
methods = {
    #"DE": run_de,                      # Твоя/моя функция DE из предыдущих блоков
    "BayesOpt": run_bayesopt,          # Функция байес. оптимизации
    "AMFEB": lambda X, y: amfeb_classic(X, y, param_grid, fidelity_values=fidelity_values),  # Новая классика
    "ImprovedAMFEB": lambda X, y: amfeb_pca_surrogate(X, y, param_grid, fidelity_values=fidelity_values, k=5)  # Новая твоя
}

# ==== Сравнение всех методов на всех датасетах ====
results = []
metrics = ['MAE', 'RMSE', 'R2', 'Time']
pivot_dfs = []

for did, (X, y) in datasets.items():
    print(f"\n=== Датасет {did} ===")
    for method_name, method_func in methods.items():
        print(f"--- {method_name} ---")
        start = time.time()
        best_params = method_func(X, y)
        duration = time.time() - start

        # Финальное обучение и проверка
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
        model = CatBoostRegressor(**best_params, task_type="GPU", verbose=0)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        met = eval_metrics(y_val, preds) + [duration]

        # Сохраняем гиперпараметры в файл
        fname = f"best_params_{method_name}_ds{did}.json"
        serializable_params = {k: int(v) if isinstance(v, np.integer)
                       else float(v) if isinstance(v, np.floating)
                       else v
                       for k, v in best_params.items()}

        with open(fname, "w") as f:
            json.dump(serializable_params, f)

        # Добавляем строку в результаты
        results.append({
            "Method": method_name,
            "Dataset": did,
            "MAE": met[0],
            "RMSE": met[1],
            "R2": met[2],
            "Time": met[3]
        })
        df_results = pd.DataFrame(results)
        pivot_dfs.append(df_results.pivot(index="Method", columns="Dataset", values=["MAE", "RMSE", "R2", "Time"]))


# ==== Преобразуем в DataFrame для финального сравнения ====
df_results = pd.DataFrame(results)
pivot_df = df_results.pivot(index="Method", columns="Dataset", values=["MAE", "RMSE", "R2", "Time"])

# Красиво оформим имена столбцов
pivot_df.columns = [f"{did}_{metric}" for metric, did in pivot_df.columns]
pivot_df.reset_index(inplace=True)
pivot_df.to_csv("comparison_results.csv", index=False)
pivot_df



=== Датасет 42362 ===
--- BayesOpt ---
--- AMFEB ---
iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4
--- ImprovedAMFEB ---
iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4

=== Датасет 3945 ===
--- BayesOpt ---
--- AMFEB ---
iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4
--- ImprovedAMFEB ---
iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4

=== Датасет 46783 ===
--- BayesOpt ---
--- AMFEB ---
iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4
--- ImprovedAMFEB ---
iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4

=== Датасет 45017 ===
--- BayesOpt ---
--- AMFEB ---
iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4
--- ImprovedAMFEB ---
iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4


Unnamed: 0,Method,3945_MAE,42362_MAE,45017_MAE,46783_MAE,3945_RMSE,42362_RMSE,45017_RMSE,46783_RMSE,3945_R2,42362_R2,45017_R2,46783_R2,3945_Time,42362_Time,45017_Time,46783_Time
0,AMFEB,0.55873,43.67021,0.291119,0.651787,0.769663,67.598825,0.37079,1.092206,0.715796,0.998352,0.449329,0.963324,222.829523,200.36685,278.030051,1304.167468
1,BayesOpt,0.524377,51.297396,0.289104,0.543249,0.745109,72.249121,0.37446,1.025015,0.73364,0.998117,0.438376,0.967698,40.663628,42.234224,65.919647,128.201454
2,ImprovedAMFEB,0.680082,42.165792,0.307324,0.656362,0.875477,67.622549,0.387585,1.171766,0.632279,0.998351,0.398316,0.957786,203.518149,194.745998,325.023954,358.607791
