In [9]:
import json
import os
import zipfile

import numpy as np
import pandas as pd
from fedot.core.pipelines.node import PipelineNode
from fedot.core.pipelines.pipeline import Pipeline
from matplotlib import pyplot as plt
from sklearn.feature_selection import f_regression

from pipeline import FedotPipeline
from utils import load_datasets
from utils import seed_everything

In [None]:
# Extract the fingerprint archive into the working directory.
# zipfile is used instead of `!unzip` so the cell does not depend on an external
# unzip binary and never blocks on an interactive overwrite prompt when re-run.
if os.path.exists("Gd_fps.zip"):
    with zipfile.ZipFile("Gd_fps.zip") as archive:
        archive.extractall()
else:
    # Like the shell command, a missing archive does not abort the notebook;
    # the data may already have been extracted.
    print("Gd_fps.zip not found; skipping extraction")

In [10]:
# Single seed shared by seed_everything(...) and the FedotPipeline(seed=...) argument below.
SEED = 42

In [11]:
# Seed random number generators through the project helper in utils.
# NOTE(review): which libraries it seeds is defined in utils.py, not visible here.
seed_everything(SEED)

In [12]:
# Load the fingerprint datasets identified by "Gd_fps" via the project helper.
# Presumably a mapping of dataset name -> table (the run log lists datasets by name) - TODO confirm in utils.py.
datasets = load_datasets("Gd_fps")

In [13]:
from fedot.core.pipelines.pipeline_builder import PipelineBuilder

# Settings handed to FedotPipeline as `fedot_kwargs`
# (presumably forwarded to the FEDOT API - confirm in pipeline.py).
fedot_kwargs = {
    # Task type.
    'problem': 'regression',
    # Time budget; FEDOT documents `timeout` in minutes.
    'timeout': 60,
    # -1 conventionally means "use all available cores".
    'n_jobs': -1,
    # 50 equals logging.CRITICAL, i.e. only critical messages are emitted.
    'logging_level': 50,
    # Start the search from a single LightGBM regressor node.
    'initial_assumption': PipelineBuilder().add_node('lgbmreg').build(),
}

In [14]:
# Passed as `n_splits` to FedotPipeline below; presumably the number of CV folds - confirm in pipeline.py.
n_splits = 5

In [15]:
# Configure the experiment runner (project class from pipeline.py).
# NOTE(review): argument semantics below are inferred from names and the run log; confirm in pipeline.py.
pipeline = FedotPipeline(
    datasets=datasets,
    # Name of the regression target column.
    target_col="lgK",
    # The run log shows pipelines/results being saved under this directory.
    results_dir='results',
    n_splits=n_splits,
    seed=SEED,
    # The run log reports "Removed N low-variance features", presumably driven by this threshold.
    var_threshold=0.01,
    # No feature-selection function / percentages: the log shows "fs=none" and key "all".
    fs_func=None,
    k_percents=None,
    fedot_kwargs=fedot_kwargs,
)

In [16]:
# Run the experiment for every dataset. Long-running: the saved log shows tens of minutes
# per dataset, and previously computed datasets are served from cached results.
all_results = pipeline.run()

Removed 1781 low-variance features (kept 267 of 2048)

=== Dataset: Gd_ctopo_fp_cmplx | Samples: 217 | Features: 267 ===
Loading cached results for Gd_ctopo_fp_cmplx, key=all, fs=none
Removed 1771 low-variance features (kept 277 of 2048)

=== Dataset: Gd_ctopo_fp_cmplx_da | Samples: 217 | Features: 277 ===
Loading cached results for Gd_ctopo_fp_cmplx_da, key=all, fs=none
Removed 1763 low-variance features (kept 285 of 2048)

=== Dataset: Gd_ctopo_fp_cmplx_da_bonds | Samples: 217 | Features: 285 ===
Loading cached results for Gd_ctopo_fp_cmplx_da_bonds, key=all, fs=none
Removed 1754 low-variance features (kept 294 of 2048)

=== Dataset: Gd_ctopo_fp_cmplx_da_sub | Samples: 217 | Features: 294 ===
Loading cached results for Gd_ctopo_fp_cmplx_da_sub, key=all, fs=none
Removed 1750 low-variance features (kept 298 of 2048)

=== Dataset: Gd_ctopo_fp_cmplx_da_sub_bonds | Samples: 217 | Features: 298 ===
Loading cached results for Gd_ctopo_fp_cmplx_da_sub_bonds, key=all, fs=none
Removed 1747 low

Generations:   1%|          | 59/10000 [35:24<99:26:53, 36.01s/gen] 


  0%|          | 93/100000 [24:38<441:09:04, 15.90s/trial, best loss: 3.675181692323722] 
Test RMSE: 3.6977, R2: 0.2952, MAE: 2.8703
Pipeline saved to results\Gd_ctopo_fp_cmplx_full\all_none_train_test_pipeline.json
Results saved to results\Gd_ctopo_fp_cmplx_full\all_none.json
Removed 1786 low-variance features (kept 262 of 2048)

=== Dataset: Gd_ctopo_fp_ligand | Samples: 217 | Features: 262 ===


Generations:   0%|          | 19/10000 [20:52<182:44:06, 65.91s/gen]


  0%|          | 59/100000 [39:03<1102:27:37, 39.71s/trial, best loss: 3.561054738938702] 
Test RMSE: 3.8273, R2: 0.2450, MAE: 3.1352
Pipeline saved to results\Gd_ctopo_fp_ligand\all_none_train_test_pipeline.json
Results saved to results\Gd_ctopo_fp_ligand\all_none.json
Removed 1997 low-variance features (kept 51 of 2048)

=== Dataset: Gd_ctopo_fp_skl | Samples: 217 | Features: 51 ===


Generations:   1%|          | 51/10000 [10:39<34:38:59, 12.54s/gen]


  1%|          | 1169/100000 [49:18<69:28:04,  2.53s/trial, best loss: 3.976822569165903] 
Test RMSE: 3.7398, R2: 0.2791, MAE: 3.0081
Pipeline saved to results\Gd_ctopo_fp_skl\all_none_train_test_pipeline.json
Results saved to results\Gd_ctopo_fp_skl\all_none.json
Removed 1959 low-variance features (kept 89 of 2048)

=== Dataset: Gd_ctopo_fp_skl_da | Samples: 217 | Features: 89 ===


Generations:   1%|          | 83/10000 [26:10<52:08:19, 18.93s/gen]


  0%|          | 337/100000 [33:42<166:10:37,  6.00s/trial, best loss: 3.8240490206237077]
Test RMSE: 3.9530, R2: 0.1945, MAE: 3.0127
Pipeline saved to results\Gd_ctopo_fp_skl_da\all_none_train_test_pipeline.json
Results saved to results\Gd_ctopo_fp_skl_da\all_none.json
Removed 1934 low-variance features (kept 114 of 2048)

=== Dataset: Gd_ctopo_fp_skl_da_bonds | Samples: 217 | Features: 114 ===


Generations:   1%|          | 58/10000 [18:26<52:42:20, 19.08s/gen]


  0%|          | 462/100000 [41:30<149:03:19,  5.39s/trial, best loss: 3.529449780848002]
Test RMSE: 3.9347, R2: 0.2020, MAE: 3.1448
Pipeline saved to results\Gd_ctopo_fp_skl_da_bonds\all_none_train_test_pipeline.json
Results saved to results\Gd_ctopo_fp_skl_da_bonds\all_none.json
Removed 1944 low-variance features (kept 104 of 2048)

=== Dataset: Gd_ctopo_fp_skl_da_skl | Samples: 217 | Features: 104 ===


Generations:   1%|          | 54/10000 [13:44<42:12:19, 15.28s/gen]


  0%|          | 499/100000 [46:08<153:21:41,  5.55s/trial, best loss: 3.7290162423039703]
Test RMSE: 3.8291, R2: 0.2442, MAE: 3.0398
Pipeline saved to results\Gd_ctopo_fp_skl_da_skl\all_none_train_test_pipeline.json
Results saved to results\Gd_ctopo_fp_skl_da_skl\all_none.json
Removed 1924 low-variance features (kept 124 of 2048)

=== Dataset: Gd_ctopo_fp_skl_da_skl_bonds | Samples: 217 | Features: 124 ===


Generations:   1%|▏         | 138/10000 [35:57<42:49:46, 15.63s/gen]


  0%|          | 256/100000 [23:58<155:43:22,  5.62s/trial, best loss: 3.42534562784142]
Test RMSE: 3.7913, R2: 0.2591, MAE: 3.0633
Pipeline saved to results\Gd_ctopo_fp_skl_da_skl_bonds\all_none_train_test_pipeline.json
Results saved to results\Gd_ctopo_fp_skl_da_skl_bonds\all_none.json
Removed 1987 low-variance features (kept 61 of 2048)

=== Dataset: Gd_ctopo_fp_topo | Samples: 217 | Features: 61 ===


Generations:   2%|▏         | 165/10000 [36:04<35:49:53, 13.12s/gen]


  0%|          | 117/100000 [23:45<337:56:10, 12.18s/trial, best loss: 3.8519276229767043]
Test RMSE: 4.4091, R2: -0.0021, MAE: 3.7133
Pipeline saved to results\Gd_ctopo_fp_topo\all_none_train_test_pipeline.json
Results saved to results\Gd_ctopo_fp_topo\all_none.json
Removed 1963 low-variance features (kept 85 of 2048)

=== Dataset: Gd_ctopo_fp_topo_da | Samples: 217 | Features: 85 ===


Generations:   1%|▏         | 137/10000 [35:46<42:55:09, 15.67s/gen]


  0%|          | 80/100000 [24:02<500:34:42, 18.04s/trial, best loss: 3.8476234435886014]
Test RMSE: 4.0192, R2: 0.1673, MAE: 3.2033
Pipeline saved to results\Gd_ctopo_fp_topo_da\all_none_train_test_pipeline.json
Results saved to results\Gd_ctopo_fp_topo_da\all_none.json


In [17]:
# Inspect the nested results: dataset -> key (e.g. 'all') -> {'fs_func', 'k_percent', 'results'}.
all_results

{'Gd_ctopo_fp_cmplx': {'all': {'fs_func': 'none',
   'k_percent': None,
   'results': {'cv_scores': {},
    'test_rmse': 3.7514166665947557,
    'test_r2': 0.2745855843087137,
    'test_mae': 2.954855009229709,
    'pipeline_path': 'results\\Gd_ctopo_fp_cmplx\\all_none_train_test_pipeline.json'}}},
 'Gd_ctopo_fp_cmplx_da': {'all': {'fs_func': 'none',
   'k_percent': None,
   'results': {'cv_scores': {},
    'test_rmse': 3.7803283197390267,
    'test_r2': 0.2633611591972326,
    'test_mae': 3.0078119605775413,
    'pipeline_path': 'results\\Gd_ctopo_fp_cmplx_da\\all_none_train_test_pipeline.json'}}},
 'Gd_ctopo_fp_cmplx_da_bonds': {'all': {'fs_func': 'none',
   'k_percent': None,
   'results': {'cv_scores': {},
    'test_rmse': 3.7925831828591448,
    'test_r2': 0.25857742631528247,
    'test_mae': 3.131402412096099,
    'pipeline_path': 'results\\Gd_ctopo_fp_cmplx_da_bonds\\all_none_train_test_pipeline.json'}}},
 'Gd_ctopo_fp_cmplx_da_sub': {'all': {'fs_func': 'none',
   'k_percent': N

In [18]:
# Persist the raw results. Create the output directory first so the cell also
# works on a fresh checkout where automl_results/ does not exist yet.
os.makedirs("automl_results", exist_ok=True)
with open("automl_results/automl_results.json", "w") as f:
    json.dump(all_results, f, indent=4)

In [None]:
def plot_by_percent(data: dict):
    """Draw per-dataset box plots of the cross-validation metrics.

    For every feature-percent key found in ``data`` one figure is produced per
    metric (RMSE, MAE, R2), with one box per dataset built from the fold scores.

    Parameters
    ----------
    data : dict
        Nested results of the form
        ``{dataset: {k_percent: {'fs_func': ..., 'results': {'cv_scores':
        {fold: {'rmse': ..., 'mae': ..., 'r2': ...}}}}}}``.

    Returns
    -------
    None
        Figures are shown as a side effect. If no fold scores are present
        (e.g. every ``cv_scores`` is empty) a message is printed and nothing
        is plotted.
    """
    records = []
    for dataset, dataset_data in data.items():
        for k_percent, info in dataset_data.items():
            fs_func = info['fs_func']
            for fold, metrics in info['results']['cv_scores'].items():
                records.append({
                    'dataset': dataset,
                    'k_percent': k_percent,
                    'fold': fold,
                    'rmse': metrics['rmse'],
                    # Negative R2 is clipped to 0 for plotting.
                    'r2': max(metrics['r2'], 0),
                    'mae': metrics['mae'],
                    'fs_func': fs_func
                })

    # An empty record list yields a DataFrame without columns, so indexing
    # df['k_percent'] below would raise KeyError; bail out early instead.
    if not records:
        print("No cross-validation scores found; nothing to plot.")
        return

    df = pd.DataFrame(records)

    for k_val in df['k_percent'].unique():
        subset = df[df['k_percent'] == k_val]
        fs_func = subset['fs_func'].iloc[0] if not subset.empty else "unknown"
        for metric in ['rmse', 'mae', 'r2']:
            # Pass the axes explicitly: DataFrame.boxplot(by=...) otherwise opens
            # its own figure, leaving a blank figure behind and ignoring figsize.
            fig, ax = plt.subplots(figsize=(20, 6))
            subset.boxplot(column=metric, by="dataset", ax=ax)
            ax.set_title(f"Feature percent={k_val} | Metric={metric.upper()} | fs_func={fs_func}")
            # Drop the automatic "Boxplot grouped by ..." super-title.
            fig.suptitle("")
            ax.set_xlabel("Dataset")
            ax.set_ylabel(metric.upper())
            ax.tick_params(axis="x", labelrotation=90)
            plt.show()
            # Release the figure so many datasets/metrics do not accumulate in memory.
            plt.close(fig)

# Plot the CV metric distributions for the AutoML run.
# Note: the all_results shown above has empty 'cv_scores', so there may be no fold data to plot.
plot_by_percent(all_results)

In [30]:
def _load_pipeline_meta(pipe_path):
    """Return the parsed pipeline JSON at ``pipe_path``, or None (after printing a warning) if the file is missing."""
    if not os.path.exists(pipe_path):
        print(f"Warning: missing pipeline file {pipe_path}")
        return None
    with open(pipe_path, "r") as f:
        return json.load(f)


def build_pipeline_summary(results_data, base_path="results"):
    """Flatten the nested AutoML results into one row per evaluated pipeline.

    Parameters
    ----------
    results_data : dict
        ``{dataset: {k_percent: {'fs_func': ..., 'results': {...}}}}`` where
        ``results`` holds ``cv_scores`` (per-fold dicts with ``pipeline_path``,
        ``rmse``, ``mae``, ``r2``), ``pipeline_path`` and optionally
        ``test_rmse`` / ``test_mae`` / ``test_r2``.
    base_path : str
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns: dataset, features_percent, fs_func, pipeline_ops, depth, fold,
        stage ('cv' or 'train_test'), rmse, mae, r2. R2 values are clipped at 0.
        The columns are present even when no rows were produced.

    Notes
    -----
    A row is skipped, with a printed warning, whenever its pipeline JSON file
    does not exist. This applies to CV folds and to the train/test row alike.
    """
    columns = [
        "dataset", "features_percent", "fs_func", "pipeline_ops", "depth",
        "fold", "stage", "rmse", "mae", "r2",
    ]
    rows = []
    for dataset, dataset_data in results_data.items():
        for k_percent, info in dataset_data.items():
            res = info['results']
            # Fields shared by every row of this dataset / feature-percent pair.
            common = {
                "dataset": dataset,
                "features_percent": k_percent,
                "fs_func": info['fs_func'],
            }

            # --- CV results, one row per fold ---
            for fold_id, metrics in res['cv_scores'].items():
                pipeline = _load_pipeline_meta(metrics['pipeline_path'])
                if pipeline is None:
                    continue
                rows.append({
                    **common,
                    "pipeline_ops": pipeline.get("total_pipeline_operations", []),
                    "depth": pipeline.get("depth", None),
                    "fold": fold_id,
                    "stage": "cv",  # cross-validation
                    "rmse": metrics['rmse'],
                    "mae": metrics['mae'],
                    "r2": max(metrics['r2'], 0)
                })

            # --- Train/test row ---
            pipeline = _load_pipeline_meta(res['pipeline_path'])
            if pipeline is None:
                continue
            rows.append({
                **common,
                "pipeline_ops": pipeline.get("total_pipeline_operations", []),
                "depth": pipeline.get("depth", None),
                "fold": None,
                "stage": "train_test",  # final evaluation
                "rmse": res.get('test_rmse', None),
                "mae": res.get('test_mae', None),
                "r2": max(res.get('test_r2', 0), 0)
            })

    # Explicit columns keep the schema stable when `rows` is empty, so
    # downstream filters such as summary_df['stage'] still work.
    return pd.DataFrame(rows, columns=columns)

# One row per evaluated pipeline (CV folds plus the final train/test fit) for every dataset.
summary_df = build_pipeline_summary(all_results)

In [31]:
# Persist the summary table; the automl_results/ directory must already exist at this point.
summary_df.to_csv("automl_results/pipeline_summary.csv", index=False)

In [32]:
# Keep only the final train/test evaluation rows (drop per-fold CV rows).
train_test_df = summary_df[summary_df['stage'] == 'train_test']

In [33]:
# Display the summary table.
summary_df

Unnamed: 0,dataset,features_percent,fs_func,pipeline_ops,depth,fold,stage,rmse,mae,r2
0,Gd_ctopo_fp_cmplx,all,none,"[resample, isolation_forest_reg, ransac_lin_re...",4,,train_test,3.751417,2.954855,0.274586
1,Gd_ctopo_fp_cmplx_da,all,none,[svr],1,,train_test,3.780328,3.007812,0.263361
2,Gd_ctopo_fp_cmplx_da_bonds,all,none,"[lgbmreg, pca, ransac_non_lin_reg, linear]",3,,train_test,3.792583,3.131402,0.258577
3,Gd_ctopo_fp_cmplx_da_sub,all,none,[ridge],1,,train_test,3.630963,2.925541,0.320422
4,Gd_ctopo_fp_cmplx_da_sub_bonds,all,none,[ridge],1,,train_test,3.569822,2.926176,0.343116
5,Gd_ctopo_fp_cmplx_full,all,none,"[resample, normalization, sgdr, poly_features,...",6,,train_test,3.697745,2.870306,0.295194
6,Gd_ctopo_fp_ligand,all,none,"[lgbmreg, isolation_forest_reg, svr]",2,,train_test,3.827253,3.135244,0.24496
7,Gd_ctopo_fp_skl,all,none,[ridge],1,,train_test,3.739803,3.008108,0.27907
8,Gd_ctopo_fp_skl_da,all,none,"[resample, poly_features, ransac_lin_reg, sgdr...",5,,train_test,3.952954,3.012711,0.194549
9,Gd_ctopo_fp_skl_da_bonds,all,none,[ridge],1,,train_test,3.934744,3.144821,0.201953


In [34]:
# Load baseline metrics produced elsewhere. The code below reads it as
# {dataset file name: {'holdout_metrics': {'RMSE', 'MAE', 'R2'}}}.
with open("baseline_results.json", "r") as f:
    baseline_data = json.load(f)

In [35]:
# Build a per-dataset table of baseline hold-out metrics, keyed like summary_df['dataset'].
baseline_rows = []
for dataset, metrics in baseline_data.items():
    holdout = metrics['holdout_metrics']
    baseline_rows.append({
        # Strip the ".csv" suffix only when it is present; slicing [:-4]
        # unconditionally would truncate keys that have no extension.
        "dataset": dataset[:-4] if dataset.endswith(".csv") else dataset,
        # NOTE(review): sqrt is applied to the value stored under 'RMSE'. This is only
        # correct if the baseline file actually stores MSE under that key - confirm.
        "baseline_rmse": np.sqrt(holdout['RMSE']),
        "baseline_mae": holdout['MAE'],
        # Negative R2 is clipped to 0, matching the AutoML summary.
        "baseline_r2": max(holdout['R2'], 0)
    })
# Explicit columns keep the "dataset" merge key available even for empty input.
baseline_records = pd.DataFrame(
    baseline_rows,
    columns=["dataset", "baseline_rmse", "baseline_mae", "baseline_r2"],
)

In [36]:
# Attach baseline metrics to the train/test rows. The frame is rebuilt from
# summary_df rather than merged onto itself, so re-running this cell does not
# add duplicated baseline columns with _x/_y suffixes.
train_test_df = (
    summary_df[summary_df['stage'] == 'train_test']
    .merge(baseline_records, on="dataset", how="left")
)

In [37]:
# Persist the AutoML-vs-baseline comparison; the automl_results/ directory must already exist.
train_test_df.to_csv("automl_results/results_comparison.csv", index=False)