In [None]:
import numpy as np
from sklearn.feature_selection import f_regression
from fedot.core.pipelines.node import PipelineNode
from fedot.core.pipelines.pipeline import Pipeline
import os
import json
import pandas as pd
from matplotlib import pyplot as plt

from utils import seed_everything
from utils import load_datasets
from pipeline import Experiment, ModelFactory, Evaluator
from fedot import Fedot

In [None]:
import zipfile

# Extract the dataset archive into the current working directory using the
# standard library instead of shelling out to `unzip`: this works where no
# `unzip` binary is installed and never blocks on an interactive
# "replace file?" prompt when the cell is re-run (existing files are overwritten).
with zipfile.ZipFile("Gd_fps.zip") as archive:
    archive.extractall()

In [None]:
# Single random seed shared by seed_everything(), the LightGBM parameters,
# the FEDOT kwargs and the Experiment configuration in the cells below.
SEED = 42

In [None]:
# Apply SEED through the project helper; which random number generators it
# covers is defined in utils.seed_everything (not visible in this notebook).
seed_everything(SEED)

In [None]:
# Load the datasets through the project helper.
# NOTE(review): "Gd_fps" is presumably the directory extracted from Gd_fps.zip — confirm.
datasets = load_datasets("Gd_fps")

In [None]:
# Hyper-parameters for the LightGBM regressor node ('lgbmreg') that seeds the
# FEDOT search as its initial assumption.
lgbm_kwargs = {
    "n_estimators": 600,
    "learning_rate": 0.03,
    "num_leaves": 15,
    "min_data_in_leaf": 5,  # allow small leaves
    "feature_fraction": 0.7,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "reg_alpha": 0.0,
    "reg_lambda": 1.0,
    "random_state": SEED,
}

In [None]:
from fedot.core.pipelines.pipeline_builder import PipelineBuilder

# Single-node starting pipeline: one 'lgbmreg' node configured with lgbm_kwargs.
assumption_builder = PipelineBuilder().add_node('lgbmreg', params=lgbm_kwargs)
initial_assumption = assumption_builder.build()

In [None]:
from fedot.core.repository.metrics_repository import RegressionMetricsEnum

# Constructor arguments later handed to Fedot through ModelFactory.
# NOTE(review): `timeout` is presumably in minutes and `logging_level=50`
# presumably maps to logging.CRITICAL — confirm against the FEDOT API docs.
fedot_kwargs = {
    "problem": 'regression',
    "timeout": 60,
    "n_jobs": -1,
    "logging_level": 50,
    "initial_assumption": initial_assumption,
    "max_depth": 3,
    "history_dir": "fedot_history",
    "seed": SEED,
    "preset": "best_quality",
    "metric": 'rmse',
    "with_tuning": True,
}

In [None]:
# Forwarded unchanged to Experiment(n_splits=...).
# NOTE(review): None presumably disables cross-validation — confirm in pipeline.Experiment.
n_splits = None

In [None]:
# Pair the Fedot class with its constructor kwargs in the project ModelFactory.
# NOTE(review): presumably used to build a fresh Fedot instance per fit — confirm in pipeline.ModelFactory.
fedot = ModelFactory(Fedot, fedot_kwargs)

In [None]:
# Default-constructed project Evaluator; passed to Experiment below.
evaluator = Evaluator()

In [None]:
# Experiment configuration: variance threshold 0.00, no feature-selection
# function and no top-k percentages, evaluated on the "lgK" target column.
experiment_config = {
    "datasets": datasets,
    "target_col": "lgK",
    "results_dir": 'results',
    "model_factory": fedot,
    "n_splits": n_splits,
    "seed": SEED,
    "var_threshold": 0.00,
    "fs_func": None,
    "k_percents": None,
    "evaluator": evaluator,
}
experiment = Experiment(**experiment_config)

In [12]:
# Run the experiment over every dataset. This is the expensive cell: the saved
# progress output shows roughly 30-50 minutes per dataset. The returned
# `results` is dumped to JSON and summarised by build_pipeline_summary below.
results = experiment.run()

  0%|          | 169/100000 [28:18<278:41:13, 10.05s/trial, best loss: 3.6441570830643215]
Results saved to results\Gd_ctopo_fp_skl_da_bonds\all_none.json

=== Dataset: Gd_ctopo_fp_skl_da_skl ===
Removed 1902 low-variance features (kept 146 of 2048)


Generations:   1%|          | 67/10000 [09:58<24:38:35,  8.93s/gen]


  1%|          | 1013/100000 [49:58<81:24:04,  2.96s/trial, best loss: 3.8370258910085226]
Results saved to results\Gd_ctopo_fp_skl_da_skl\all_none.json

=== Dataset: Gd_ctopo_fp_skl_da_skl_bonds ===
Removed 1874 low-variance features (kept 174 of 2048)


Generations:   0%|          | 47/10000 [12:57<45:44:35, 16.55s/gen]


  0%|          | 275/100000 [46:57<283:51:11, 10.25s/trial, best loss: 3.511565998031231] 
Results saved to results\Gd_ctopo_fp_skl_da_skl_bonds\all_none.json

=== Dataset: Gd_ctopo_fp_topo ===
Removed 1952 low-variance features (kept 96 of 2048)


Generations:   1%|          | 57/10000 [10:34<30:44:11, 11.13s/gen]


  0%|          | 383/100000 [49:20<213:52:36,  7.73s/trial, best loss: 3.9052129131341062]
Results saved to results\Gd_ctopo_fp_topo\all_none.json

=== Dataset: Gd_ctopo_fp_topo_da ===
Removed 1895 low-variance features (kept 153 of 2048)


Generations:   2%|▏         | 193/10000 [29:29<24:58:33,  9.17s/gen]


  0%|          | 200/100000 [30:20<252:18:15,  9.10s/trial, best loss: 3.8583895484439723]
Results saved to results\Gd_ctopo_fp_topo_da\all_none.json


In [13]:
# Persist the raw experiment results. open(..., "w") does not create missing
# parent directories, so make sure the output directory exists first;
# otherwise this fails with FileNotFoundError on a fresh checkout.
os.makedirs("automl_results", exist_ok=True)
with open("automl_results/automl_results.json", "w") as f:
    json.dump(results, f, indent=4)

In [14]:
# Column order of the summary frame; also used so that an empty result still
# yields a frame with the expected columns.
_SUMMARY_COLUMNS = [
    "dataset", "features_percent", "fs_func", "pipeline_ops", "depth",
    "fold", "stage", "rmse", "mae", "r2",
]


def _load_pipeline_json(pipe_path):
    """Return the parsed pipeline JSON at ``pipe_path``, or ``None`` (after printing a warning) if the file is missing."""
    if not os.path.exists(pipe_path):
        print(f"Warning: missing pipeline file {pipe_path}")
        return None
    with open(pipe_path, "r") as f:
        return json.load(f)


def _summary_row(dataset, k_percent, fs_func, pipeline, fold, stage, scores, r2):
    """Build one summary record from a parsed pipeline dict and a score dict."""
    return {
        "dataset": dataset,
        "features_percent": k_percent,
        "fs_func": fs_func,
        "pipeline_ops": pipeline.get("total_pipeline_operations", []),
        "depth": pipeline.get("depth", None),
        "fold": fold,
        "stage": stage,
        "rmse": scores['rmse'],
        "mae": scores['mae'],
        "r2": r2,
    }


def build_pipeline_summary(results_data):
    """Flatten experiment results into one row per evaluated pipeline.

    Parameters
    ----------
    results_data : dict
        Mapping ``dataset -> {k_percent -> info}``. Each ``info`` is read for
        the keys ``'fs_func'``, ``'cv_scores'`` (a possibly empty/None mapping
        ``fold_id -> scores``) and ``'test_scores'``. Every score dict must
        provide ``'pipeline_path'``, ``'rmse'``, ``'mae'`` and ``'r2'``.

    Returns
    -------
    pandas.DataFrame
        One row per CV fold (``stage == "cv"``) plus one row per final
        evaluation (``stage == "train_test"``, ``fold`` is None). The columns
        are always present, even when no rows were produced.

    Notes
    -----
    A row is skipped, with a printed warning, when its pipeline JSON file does
    not exist. This applies to the CV folds and to the train/test row alike.
    """
    rows = []
    for dataset, dataset_data in results_data.items():
        for k_percent, info in dataset_data.items():
            fs_func = info['fs_func']

            # --- CV results, one row per fold ---
            cv_scores = info['cv_scores']
            if cv_scores:
                for fold_id, metrics in cv_scores.items():
                    pipeline = _load_pipeline_json(metrics['pipeline_path'])
                    if pipeline is None:
                        continue
                    # NOTE(review): CV r2 is floored at 0 while the train/test
                    # r2 below is reported as-is; kept unchanged — confirm the
                    # asymmetry is intended.
                    rows.append(_summary_row(
                        dataset, k_percent, fs_func, pipeline,
                        fold=fold_id, stage="cv",  # cross-validation
                        scores=metrics, r2=max(metrics['r2'], 0),
                    ))

            # --- Train/test row ---
            test_scores = info['test_scores']
            pipeline = _load_pipeline_json(test_scores['pipeline_path'])
            if pipeline is not None:
                rows.append(_summary_row(
                    dataset, k_percent, fs_func, pipeline,
                    fold=None, stage="train_test",  # final evaluation
                    scores=test_scores, r2=test_scores["r2"],
                ))

    return pd.DataFrame(rows, columns=_SUMMARY_COLUMNS)

# Flatten the experiment results into a per-pipeline summary table.
summary_df = build_pipeline_summary(results)

In [15]:
# Save the full summary (CV and train/test rows) without the index column.
summary_df.to_csv("automl_results/pipeline_summary.csv", index=False)

In [16]:
# Keep only the final-evaluation rows (stage == 'train_test').
is_final_eval = summary_df['stage'].eq('train_test')
train_test_df = summary_df.loc[is_final_eval]

In [17]:
# Read the baseline metrics produced outside this notebook.
with open("baseline_results.json") as baseline_file:
    baseline_data = json.load(baseline_file)

In [18]:
# Flatten the baseline hold-out metrics into one record per dataset so they
# can be merged onto the AutoML results by dataset name.
baseline_rows = []
for dataset, metrics in baseline_data.items():
    holdout = metrics['holdout_metrics']
    # Strip the ".csv" suffix only when it is actually present; a blind
    # [:-4] slice would truncate keys that carry no suffix.
    dataset_name = dataset[:-4] if dataset.endswith(".csv") else dataset
    baseline_rows.append({
        "dataset": dataset_name,
        # NOTE(review): this takes the square root of a value keyed "RMSE".
        # That is only right if the baseline file actually stores MSE under
        # that key — confirm against the code that writes baseline_results.json.
        "baseline_rmse": np.sqrt(holdout['RMSE']),
        "baseline_mae": holdout['MAE'],
        # Negative R2 is floored at 0.
        "baseline_r2": max(holdout['R2'], 0)
    })
# Explicit columns keep the frame mergeable on "dataset" even when the
# baseline file is empty.
baseline_records = pd.DataFrame(
    baseline_rows,
    columns=["dataset", "baseline_rmse", "baseline_mae", "baseline_r2"],
)

In [19]:
# Attach the baseline hold-out metrics to the final-evaluation rows.
# The frame is rebuilt from summary_df rather than merged onto train_test_df
# itself, so re-running this cell does not merge the baseline columns a second
# time (which would produce duplicated *_x / *_y columns).
train_test_df = summary_df[summary_df['stage'] == 'train_test'].merge(
    baseline_records, on="dataset", how="left"
)

In [20]:
# Save the train/test rows together with the merged baseline metrics, without the index column.
train_test_df.to_csv("automl_results/results_comparison.csv", index=False)