In [135]:
from utils import seed_everything
from utils import load_datasets
from fedot import Fedot
from fedot.core.pipelines.pipeline_builder import PipelineBuilder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
from typing import Tuple, Dict
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

# Single seed reused by the train/test split, the LightGBM parameters and the FEDOT constructor.
SEED = 42
# Project helper from utils; presumably seeds the global RNGs — TODO confirm in utils.
seed_everything(SEED)
# Loaded via the project helper with the "Gd_fps" key; consumed later through
# .items() as (dataset name, DataFrame) pairs.
datasets = load_datasets("Gd_fps")

In [136]:
# Fixed LightGBM hyper-parameters handed to the 'lgbmreg' pipeline node.
# Keys use LightGBM's native parameter names (e.g. min_data_in_leaf,
# feature_fraction) rather than the scikit-learn style aliases.
lgbm_kwargs = {
    "n_estimators": 600,
    "learning_rate": 0.03,
    "num_leaves": 15,
    "min_data_in_leaf": 5,
    "feature_fraction": 0.7,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "reg_alpha": 0.0,
    "reg_lambda": 1.0,
    "random_state": SEED,
}

In [137]:
def prepare_data(
    df: pd.DataFrame,
    target_col: str = "lgK",
    test_size: float = 0.2,
    random_state: int = 42
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Turn a dataset frame into train/test arrays with constant features removed.

    Every column other than ``target_col`` is used as a feature and cast to
    float32. The rows are split with ``train_test_split`` and a
    ``VarianceThreshold(threshold=0.0)`` selector is fitted on the training
    part only, then applied to both parts, so the test rows never influence
    which features are kept.

    Args:
        df: Frame holding the feature columns and the target column.
        target_col: Name of the column to predict.
        test_size: Passed through to ``train_test_split``.
        random_state: Seed passed through to ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    predictors = [col for col in df.columns if col != target_col]
    features = df[predictors].astype(np.float32).values
    target = df[target_col].values

    split = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    X_train, X_test, y_train, y_test = split

    # Fit the selector on the training rows only, then reuse it for the test rows.
    selector = VarianceThreshold(threshold=0.0)
    selector.fit(X_train)
    X_train = selector.transform(X_train)
    X_test = selector.transform(X_test)

    return X_train, X_test, y_train, y_test

def train_model(X_train, y_train):
    """Fit FEDOT on the training split with a fixed single-node LightGBM pipeline.

    The pipeline is supplied through ``predefined_model``; it consists of one
    'lgbmreg' node configured from the module-level ``lgbm_kwargs``. The FEDOT
    seed comes from the module-level ``SEED``.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``Fedot`` instance.
    """
    fedot_settings = {
        "problem": 'regression',
        "timeout": 1,
        "n_jobs": -1,
        "logging_level": 20,  # numeric value of logging.INFO
        "seed": SEED,
    }
    automl = Fedot(**fedot_settings)

    # Build the predefined pipeline: a single LightGBM regressor node.
    fixed_pipeline = PipelineBuilder().add_node('lgbmreg', params=lgbm_kwargs).build()
    automl.fit(X_train, y_train, predefined_model=fixed_pipeline)
    return automl


# -------------------------
# Evaluation (hold-out)
# -------------------------

def evaluate_model(model, X_test, y_test) -> Dict[str, float]:
    """Evaluate a trained model on hold-out data and print the metrics.

    Args:
        model: Fitted object exposing ``predict(X)``.
        X_test: Hold-out feature matrix.
        y_test: Hold-out target values.

    Returns:
        Dict with keys ``"RMSE"``, ``"MAE"`` and ``"R2"`` holding plain floats.
    """
    y_pred = model.predict(X_test)

    # RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
    # the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
    # so the old call raises TypeError on current releases. For a single target
    # the two forms give the same value.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    mae = float(mean_absolute_error(y_test, y_pred))
    r2 = float(r2_score(y_test, y_pred))

    print(f"RMSE: {rmse:.4f} | MAE: {mae:.4f} | R²: {r2:.4f}")
    return {"RMSE": rmse, "MAE": mae, "R2": r2}

In [138]:
def run_final_pipeline(df: pd.DataFrame, target_col: str = "lgK", random_state: int = 42):
    """Split one dataset, train the model on it and report hold-out metrics.

    ``random_state`` is forwarded to ``prepare_data`` only; ``train_model`` is
    called with just the training arrays.

    Args:
        df: Frame holding the feature columns and the target column.
        target_col: Name of the column to predict.
        random_state: Seed for the train/test split.

    Returns:
        ``(fitted_model, metrics)`` where ``metrics`` is the dict returned by
        ``evaluate_model``.
    """
    split = prepare_data(df, target_col=target_col, random_state=random_state)
    X_train, X_test, y_train, y_test = split

    fitted = train_model(X_train, y_train)

    print("\nHold-out performance:")
    return fitted, evaluate_model(fitted, X_test, y_test)

In [141]:
# Run the split/train/evaluate pipeline once per dataset and collect the
# hold-out metrics keyed by dataset name.
all_metrics = {}
for name, df in datasets.items():
    print(f"\nDataset: {name}")
    # The fitted model is discarded here; only the metrics dict is kept.
    _, metrics = run_final_pipeline(df, target_col="lgK", random_state=SEED)
    all_metrics[name] = metrics
# Outer dict keys become columns, so transpose to get one row per dataset
# and one column per metric.
metrics_df = pd.DataFrame(all_metrics).T


Dataset: Gd_ctopo_fp_cmplx
2025-10-10 16:38:09,867 - TableTypesCorrector - Preprocessing was unable to define the categorical columns
2025-10-10 16:38:12,608 - FEDOT logger - Final pipeline: {'depth': 1, 'length': 1, 'nodes': [lgbmreg]}
lgbmreg - {'boosting_type': 'gbdt', 'max_depth': -1, 'bagging_fraction': 0.8, 'extra_trees': False, 'enable_categorical': True, 'use_eval_set': True, 'early_stopping_rounds': 30, 'n_jobs': 1, 'verbose': -1, 'n_estimators': 600, 'learning_rate': 0.03, 'num_leaves': 15, 'min_data_in_leaf': 5, 'feature_fraction': 0.7, 'bagging_freq': 1, 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'random_state': 42}
2025-10-10 16:38:12,613 - MemoryAnalytics - Memory consumption for finish in main session: current 3.9 MiB, max: 8.0 MiB

Hold-out performance:
RMSE: 3.9088 | MAE: 3.2090 | R²: 0.2125

Dataset: Gd_ctopo_fp_cmplx_da
2025-10-10 16:38:14,094 - TableTypesCorrector - Preprocessing was unable to define the categorical columns
2025-10-10 16:38:16,626 - FEDOT logger - Final 

In [142]:
# Display the collected hold-out metrics, one row per dataset.
metrics_df

Unnamed: 0,RMSE,MAE,R2
Gd_ctopo_fp_cmplx,3.908762,3.209043,0.212458
Gd_ctopo_fp_cmplx_da,3.813934,3.118467,0.250206
Gd_ctopo_fp_cmplx_da_bonds,3.810822,3.112918,0.251429
Gd_ctopo_fp_cmplx_da_sub,3.780636,3.038336,0.263241
Gd_ctopo_fp_cmplx_da_sub_bonds,3.593826,2.960939,0.334252
Gd_ctopo_fp_cmplx_full,3.742468,3.073256,0.278042
Gd_ctopo_fp_ligand,3.900381,3.09807,0.215831
Gd_ctopo_fp_skl,3.978201,3.151416,0.184227
Gd_ctopo_fp_skl_da,4.044272,3.188086,0.156906
Gd_ctopo_fp_skl_da_bonds,3.901985,3.02111,0.215186
