In [None]:
!unzip Gd_fps.zip

In [None]:
import pandas as pd
import numpy as np
import os
import random
import warnings

# Suppress every warning of category UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
def seed_everything(seed=42):
    """Seed the global NumPy and `random` generators and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to both generators and written (as a
            string) to the PYTHONHASHSEED environment variable. Defaults to 42.

    Note:
        PYTHONHASHSEED is read when an interpreter starts, so assigning it
        here influences processes launched afterwards rather than the hashing
        of the already-running kernel.
    """
    for apply_seed in (np.random.seed, random.seed):
        apply_seed(seed)
    os.environ['PYTHONHASHSEED'] = f"{seed}"


# Fix the seeds once, before any stochastic step runs.
seed_everything(42)

In [None]:
def load_datasets(path: str) -> dict[str, pd.DataFrame]:
    """Read every CSV file located directly inside `path` into a DataFrame.

    Args:
        path: Directory to scan (sub-directories are not descended into).

    Returns:
        Dict mapping each CSV file name (including the ``.csv`` suffix) to the
        DataFrame parsed from it, inserted in sorted file-name order. Empty
        when the directory holds no ``.csv`` files.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the order in
    # which the datasets are later processed identical across runs and machines.
    csv_names = sorted(name for name in os.listdir(path) if name.endswith('.csv'))
    return {name: pd.read_csv(os.path.join(path, name)) for name in csv_names}

In [None]:
# Load every CSV from the current working directory, keyed by file name.
datasets = load_datasets(path=".")

In [None]:
from sklearn.feature_selection import VarianceThreshold


def filter_var(
    df: pd.DataFrame,
    target_col: str = "lgK",
    threshold: float = 0.01,
) -> tuple[np.ndarray, np.ndarray]:
    """Split `df` into features and target, dropping low-variance features.

    Args:
        df: Table holding the target column plus the feature columns; every
            non-target column must be castable to float32.
        target_col: Name of the target column. Defaults to "lgK".
        threshold: Variance cut-off handed to VarianceThreshold. Defaults
            to 0.01.

    Returns:
        (X, y): the float32 feature matrix restricted to the retained columns,
        and the values of `df[target_col]`.
    """
    feature_cols = [c for c in df.columns if c != target_col]
    X = df[feature_cols].astype(np.float32).values
    y = df[target_col].values
    # The threshold is an absolute variance, not a percentage: feature columns
    # whose variance does not exceed it are removed.
    vt = VarianceThreshold(threshold=threshold)
    X = vt.fit_transform(X)
    print(f"Reduced from {len(feature_cols)} to {X.shape[1]} features")
    return X, y

In [None]:
def cross_validate(X, y, model, n_splits=5):
    """Score `model` with shuffled K-fold cross-validation.

    The same `model` object is fitted on each fold's training part and then
    used to predict that fold's held-out part.

    Args:
        X: Feature matrix that can be indexed with integer position arrays
            (e.g. a NumPy array).
        y: Target values, indexable the same way as `X`.
        model: Object exposing `fit(X, y)` and `predict(X)`.
        n_splits: Number of folds. Defaults to 5.

    Returns:
        Dict mapping the fold index to ``{"rmse": ..., "r2": ...}`` computed
        on that fold's held-out part.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_results = {}
    for fold, (train_index, test_index) in enumerate(kf.split(X)):
        print(f"Fold {fold}")
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # mean_squared_error yields the MSE; take the square root so the value
        # stored and printed under "rmse" really is a root-mean-squared error.
        rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
        r2 = r2_score(y_test, y_pred)

        cv_results[fold] = {"rmse": rmse, "r2": r2}
        print(f"  RMSE: {rmse}, R2: {r2}")
    return cv_results

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold


def run_model(X, y, model):
    """Cross-validate `model`, then fit it on an 80/20 split and score the hold-out.

    Args:
        X: Feature matrix.
        y: Target values aligned with the rows of `X`.
        model: Object exposing `fit(X, y)` and `predict(X)`.

    Returns:
        Dict with the keys:
            "cv_scores": per-fold metrics returned by `cross_validate`,
            "test_rmse": root-mean-squared error on the 20% hold-out,
            "test_r2": R^2 on the 20% hold-out,
            "pipeline": whatever `model.fit` returned for the final fit.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # NOTE(review): cross-validation runs on the full data, i.e. it also sees
    # the rows used as hold-out below — confirm this is intended.
    cv_scores = cross_validate(X, y, model)
    results = {"cv_scores": cv_scores}
    # The dict holds both rmse and r2 per fold, so do not label it as R2 only.
    print("CV scores:", cv_scores)

    pipeline = model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # mean_squared_error yields the MSE; the square root gives the RMSE that
    # the result key and the printed label promise.
    test_rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    test_r2 = r2_score(y_test, y_pred)
    results.update({"test_rmse": test_rmse, "test_r2": test_r2})
    results.update({"pipeline": pipeline})
    print("Test RMSE:", test_rmse)
    print("Test R2:", test_r2)

    return results

In [None]:
import logging

# Settings passed to each Fedot(...) constructor call further down.
timeout = 3.0  # NOTE(review): presumably minutes of AutoML search per fit — confirm in FEDOT docs
n_jobs = -1  # NOTE(review): -1 conventionally means "use all cores" — confirm for FEDOT
logging_level = logging.FATAL  # highest stdlib severity level

In [None]:
# Results of run_model, keyed by dataset file name; re-running this cell resets it.
all_results = dict()

In [None]:
from fedot import Fedot

# Constructor arguments shared by every FEDOT model built in the loop.
fedot_settings = dict(
    problem='regression',
    timeout=timeout,
    n_jobs=n_jobs,
    logging_level=logging_level,
    seed=42,
)

# One fresh FEDOT model per dataset: filter its features, evaluate the model
# and store the metrics under the dataset's file name.
for dataset_name, df in datasets.items():
    model = Fedot(**fedot_settings)
    print(f"Running FEDOT on {dataset_name}")
    X, y = filter_var(df)
    results = run_model(X, y, model)
    all_results[dataset_name] = results