In [9]:
import json
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Read the run configuration from config.json
with open('config.json') as f:
    CONFIG = json.load(f)

# Make sure every output directory exists before anything is written to it
for _out_dir in (CONFIG["predictions_dir"], "models", "features", "metrics", "hyperparams"):
    os.makedirs(_out_dir, exist_ok=True)

def load_data():
    """Read the dataset named in CONFIG and split it into train and test parts.

    The "Target" column is used as the label; every other column is a feature.

    Returns:
        The result of ``train_test_split``: X_train, X_test, y_train, y_test.
    """
    frame = pd.read_csv(CONFIG["dataset_path"])
    target = frame["Target"]
    features = frame.drop(columns="Target")
    split_options = {
        "test_size": CONFIG["test_size"],
        "random_state": CONFIG["random_state"],
    }
    return train_test_split(features, target, **split_options)

def get_feature_rankings(X_train, y_train):
    """Rank the columns of X_train with four importance-based methods.

    Args:
        X_train: training feature DataFrame.
        y_train: training target values.

    Returns:
        dict mapping a method name ('RF', 'RFE_RF', 'XGB', 'RF_XGB') to the
        list of X_train column names, ordered from best to worst ranked.
    """
    seed = CONFIG["random_state"]
    columns = X_train.columns

    def by_descending(scores):
        # Column names ordered so that the largest score comes first
        return list(columns[np.argsort(scores)[::-1]])

    # feature_importances_ of a random forest with fixed hyperparameters
    forest = RandomForestRegressor(n_estimators=200, max_depth=15,
                                   min_samples_split=5, random_state=seed)
    forest.fit(X_train, y_train)
    forest_importance = forest.feature_importances_

    # Recursive elimination down to a single feature; ranking_ is sorted
    # ascending, so the feature with rank 1 comes first
    eliminator = RFE(estimator=RandomForestRegressor(random_state=seed),
                     n_features_to_select=1)
    eliminator.fit(X_train, y_train)
    elimination_rank = eliminator.ranking_

    # feature_importances_ of an XGBoost regressor with fixed hyperparameters
    booster = XGBRegressor(n_estimators=300, max_depth=5, learning_rate=0.1,
                           random_state=seed)
    booster.fit(X_train, y_train)
    booster_importance = booster.feature_importances_

    return {
        'RF': by_descending(forest_importance),
        'RFE_RF': list(columns[np.argsort(elimination_rank)]),
        'XGB': by_descending(booster_importance),
        # Hybrid: plain average of the two importance vectors
        'RF_XGB': by_descending((forest_importance + booster_importance) / 2),
    }

def get_model_config(model_name):
    """Return the estimator class and hyperparameter grid for a model family.

    Args:
        model_name: 'RF' for RandomForestRegressor or 'XGB' for XGBRegressor.

    Returns:
        (model_class, param_grid), where param_grid maps parameter names to
        lists of candidate values. 'random_state' has the single candidate
        CONFIG["random_state"].

    Raises:
        ValueError: if model_name is neither 'RF' nor 'XGB'.
    """
    if model_name == 'RF':
        model_class = RandomForestRegressor
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'random_state': [CONFIG["random_state"]]
        }
    elif model_name == 'XGB':
        model_class = XGBRegressor
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'random_state': [CONFIG["random_state"]]
        }
    else:
        # Fail with a clear message instead of an UnboundLocalError at return
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected 'RF' or 'XGB'"
        )
    return model_class, param_grid

def tune_hyperparameters(model, param_grid, X, y):
    """Grid-search param_grid for model with cross-validation, refitting on R2.

    Args:
        model: estimator instance passed to GridSearchCV.
        param_grid: dict of parameter name -> list of candidate values.
        X: feature matrix used for the search.
        y: target values used for the search.

    Returns:
        dict with keys:
            "best_model":  the estimator refit by GridSearchCV (refit="r2"),
            "best_params": the parameters of the selected candidate,
            "r2":          mean cross-validated R2 of the selected candidate,
            "mse", "mae":  its mean cross-validated MSE / MAE, sign-flipped
                           back to positive (the scorers are negated errors).
    """
    scoring = {
        'r2': 'r2',
        'mae': 'neg_mean_absolute_error',
        'mse': 'neg_mean_squared_error'
    }
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scoring,
        cv=CONFIG["cv_folds"],
        n_jobs=-1,
        verbose=0,
        refit="r2"
    )
    grid_search.fit(X, y)

    # Use the search's own winner index so the reported metrics always belong
    # to the same candidate as best_estimator_ / best_params_. np.argmax on
    # mean_test_r2 would pick a NaN entry if any candidate failed to fit.
    best = grid_search.best_index_
    results = grid_search.cv_results_
    return {
        "best_model": grid_search.best_estimator_,
        "best_params": grid_search.best_params_,
        "r2": results["mean_test_r2"][best],
        "mse": -1 * results["mean_test_mse"][best],
        "mae": -1 * results["mean_test_mae"][best]
    }

def _cv_default_model(Model, X_train, y_train, k_features):
    """K-fold cross-validate a default-parameter Model on one feature subset.

    Args:
        Model: estimator class, instantiated with only random_state set.
        X_train: training feature DataFrame.
        y_train: training target Series.
        k_features: list of column names to use.

    Returns:
        (oof_preds, mean_r2): out-of-fold predictions positioned like the
        rows of X_train, and the mean of the per-fold R2 scores.
    """
    X_subset = X_train[k_features]
    oof_preds = np.zeros(len(y_train))
    fold_r2 = []
    for train_idx, val_idx in KFold(CONFIG["cv_folds"]).split(X_subset):
        model = Model(random_state=CONFIG["random_state"])
        model.fit(X_subset.iloc[train_idx], y_train.iloc[train_idx])
        preds = model.predict(X_subset.iloc[val_idx])
        oof_preds[val_idx] = preds
        fold_r2.append(r2_score(y_train.iloc[val_idx], preds))
    return oof_preds, np.mean(fold_r2)


def run_pipeline():
    """Train, tune and evaluate RF/XGB models over ranked feature subsets.

    For every feature-ranking method and every model family:
      1. cross-validate a default model on the top-k features for each k and
         keep the k with the highest mean fold R2 (best_k);
      2. for k = 5, 10, ... up to best_k (plus best_k itself), grid-search the
         hyperparameters on the training set and score the refit model on the
         test set.

    Side effects: writes one model pickle, one feature list and one
    best-params JSON per evaluated configuration under models/, features/ and
    hyperparams/, the metrics table under metrics/, and the test / CV
    prediction tables under CONFIG["predictions_dir"].

    Raises:
        ValueError: if no feature subset yields a finite CV R2 for some
            ranking/model combination.
    """
    X_train, X_test, y_train, y_test = load_data()
    feature_rankings = get_feature_rankings(X_train, y_train)
    all_results = []
    combined_preds = pd.DataFrame({'true': y_test.reset_index(drop=True)})
    cv_train_preds = pd.DataFrame({'true': y_train.reset_index(drop=True)})

    for fs_name, features in feature_rankings.items():
        for model_name, Model in [('RF', RandomForestRegressor), ('XGB', XGBRegressor)]:
            print(f"\nEvaluating {model_name} with {fs_name} feature selection...")

            # Step 1: Find best k using CV. The CV output of every k is kept
            # so Step 2 does not have to refit the same folds again.
            cv_by_k = {}
            best_k = None
            best_cv_r2 = -np.inf
            for k in range(1, len(features) + 1):
                cv_by_k[k] = _cv_default_model(Model, X_train, y_train, features[:k])
                avg_r2 = cv_by_k[k][1]
                if avg_r2 > best_cv_r2:
                    best_cv_r2 = avg_r2
                    best_k = k

            if best_k is None:
                raise ValueError(
                    f"No feature subset produced a finite CV R2 for "
                    f"{model_name} with {fs_name} feature selection"
                )

            # Step 2: HPO for spaced subset sizes [5, 10, ..., best_k]
            subset_ks = list(range(5, best_k + 1, 5))
            if best_k not in subset_ks:
                subset_ks.append(best_k)

            model_class, param_grid = get_model_config(model_name)

            for k in subset_ks:
                k_features = features[:k]

                # NOTE(review): these CV metrics and out-of-fold predictions
                # come from the default-parameter model, while the test
                # predictions below come from the tuned model.
                fold_preds, cv_r2 = cv_by_k[k]
                cv_mse = mean_squared_error(y_train, fold_preds)
                cv_mae = mean_absolute_error(y_train, fold_preds)

                print(f"  → HPO for {model_name} with {fs_name} using top-{k} features")
                hpo_results = tune_hyperparameters(
                    model_class(random_state=CONFIG["random_state"]),
                    param_grid,
                    X_train[k_features],
                    y_train
                )

                test_preds = hpo_results["best_model"].predict(X_test[k_features])
                test_r2 = r2_score(y_test, test_preds)
                test_mse = mean_squared_error(y_test, test_preds)
                test_mae = mean_absolute_error(y_test, test_preds)

                model_id = f"{fs_name}_{model_name}_Top{k}"
                combined_preds[f'pred_{model_id}'] = test_preds
                cv_train_preds[f'pred_{model_id}'] = fold_preds

                joblib.dump(hpo_results["best_model"], f"models/Model_{model_id}.pkl")

                with open(f"features/Features_{model_id}.json", 'w') as f:
                    json.dump(k_features, f)

                with open(f"hyperparams/BestParams_{model_id}.json", 'w') as f:
                    json.dump(hpo_results["best_params"], f)

                all_results.append({
                    'FS_Method': fs_name,
                    'Model': model_name,
                    'Features_Used': k,
                    'Best_Params': hpo_results["best_params"],
                    'CV_R2': cv_r2,
                    'CV_MSE': cv_mse,
                    'CV_MAE': cv_mae,
                    'Test_R2': test_r2,
                    'Test_MSE': test_mse,
                    'Test_MAE': test_mae
                })

    # Rank configurations by test R2 (1 = best) and persist all tables
    metrics_df = pd.DataFrame(all_results)
    metrics_df['R2_Rank'] = metrics_df['Test_R2'].rank(ascending=False, method='min')
    metrics_df = metrics_df.sort_values('R2_Rank')
    metrics_df.to_csv(f"metrics/{CONFIG['results_csv_path']}", index=False)
    combined_preds.to_csv(f"{CONFIG['predictions_dir']}/all_predictions.csv", index=False)
    cv_train_preds.to_csv(f"{CONFIG['predictions_dir']}/cv_train_predictions.csv", index=False)

# Run the whole pipeline only when this code is executed as the main module
if __name__ == '__main__':
    run_pipeline()



Evaluating RF with RF feature selection...
  → HPO for RF with RF using top-5 features
  → HPO for RF with RF using top-10 features
  → HPO for RF with RF using top-15 features
  → HPO for RF with RF using top-16 features

Evaluating XGB with RF feature selection...
  → HPO for XGB with RF using top-5 features
  → HPO for XGB with RF using top-10 features
  → HPO for XGB with RF using top-15 features
  → HPO for XGB with RF using top-20 features
  → HPO for XGB with RF using top-25 features
  → HPO for XGB with RF using top-30 features
  → HPO for XGB with RF using top-35 features
  → HPO for XGB with RF using top-36 features

Evaluating RF with RFE_RF feature selection...
  → HPO for RF with RFE_RF using top-5 features
  → HPO for RF with RFE_RF using top-10 features
  → HPO for RF with RFE_RF using top-15 features
  → HPO for RF with RFE_RF using top-20 features

Evaluating XGB with RFE_RF feature selection...
  → HPO for XGB with RFE_RF using top-5 features
  → HPO for XGB with RFE

In [85]:
import pandas as pd
import numpy as np
import json
import random
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.feature_selection import RFE

# ---------------------
# Load config & data
# ---------------------
with open("config.json") as f:
    CONFIG = json.load(f)

# Per-model prediction tables (one "pred_*" column per model plus "true")
# and the per-model metrics table
cv_df = pd.read_csv(f"{CONFIG['predictions_dir']}/cv_train_predictions.csv")
test_df = pd.read_csv(f"{CONFIG['predictions_dir']}/all_predictions.csv")
metrics_df = pd.read_csv(f"metrics/{CONFIG['results_csv_path']}")

# The model predictions become the features of the ensemble; "true" is the label
y_train, y_test = cv_df["true"], test_df["true"]
X_train = cv_df.drop(columns="true")
X_test = test_df.drop(columns="true")

model_columns = list(X_train.columns)

# ---------------------
# Feature Selection Methods
# ---------------------
def fs_rf(X, y, n):
    """Return the n columns of X with the largest random-forest importances.

    The columns are returned from most to least important.
    """
    forest = RandomForestRegressor(random_state=42)
    forest.fit(X, y)
    descending = np.argsort(forest.feature_importances_)[::-1]
    return X.columns[descending[:n]]

def fs_rfe_rf(X, y, n):
    """Select n columns of X by recursive feature elimination with a random forest.

    The kept columns are returned in their original X order (they are taken
    through the boolean support_ mask), not ordered by importance.
    """
    selector = RFE(
        estimator=RandomForestRegressor(random_state=42),
        n_features_to_select=n,
    )
    selector.fit(X, y)
    kept_mask = selector.support_
    return X.columns[kept_mask]

def fs_xgb(X, y, n):
    """Return the n columns of X with the largest XGBoost importances.

    The columns are returned from most to least important.
    """
    booster = XGBRegressor(random_state=42)
    booster.fit(X, y)
    descending = np.argsort(booster.feature_importances_)[::-1]
    return X.columns[descending[:n]]

def fs_rf_xgb(X, y, n):
    """Return the n columns of X with the largest averaged RF/XGBoost importances.

    A random forest and an XGBoost regressor are fit on (X, y); their
    feature_importances_ vectors are averaged and the top n columns are
    returned from most to least important.
    """
    forest = RandomForestRegressor(random_state=42)
    booster = XGBRegressor(random_state=42)
    forest.fit(X, y)
    booster.fit(X, y)
    blended = (forest.feature_importances_ + booster.feature_importances_) / 2
    top_positions = np.argsort(blended)[::-1][:n]
    return X.columns[top_positions]

# Feature-selection functions keyed by the method name used in reports
fs_methods = dict(RF=fs_rf, RFE_RF=fs_rfe_rf, XGB=fs_xgb, RF_XGB=fs_rf_xgb)

# ---------------------------------------
# Meta-models
# ---------------------------------------
# Estimators fit on the base models' predictions, keyed by report name
meta_models = dict(
    LinearRegression=LinearRegression(),
    RandomForest=RandomForestRegressor(random_state=CONFIG["random_state"]),
    XGBoost=XGBRegressor(random_state=CONFIG["random_state"]),
)

# ---------------------------------------
# Shared Top-K Range
# ---------------------------------------
max_models = len(metrics_df)
step = 10
# range() never yields its stop value, so max_models is always appended as
# the final ensemble size: 10, 20, ..., max_models
k_values = [*range(10, max_models, step), max_models]

# ---------------------
# Method 1 & 2: Varying Top-k Model Ensemble
# ---------------------
print("\n=== Method 1 & 2: Ensemble Evaluation for k in [10, 20, ..., Max] ===")

# Dedicated generator seeded from the config so the random baseline is
# reproducible between runs (the module-level random.sample is unseeded).
rng = random.Random(CONFIG["random_state"])

# The CV_R2 ordering does not depend on k, so sort the metrics table once.
ranked_models = metrics_df.sort_values("CV_R2", ascending=False)

for k in k_values:
    # Method 1: average the predictions of k randomly chosen models
    random_cols = rng.sample(model_columns, k)
    y_pred_train_random = X_train[random_cols].mean(axis=1)
    y_pred_test_random = X_test[random_cols].mean(axis=1)

    print(f"\n--- Method 1: Random {k} Model Averaging ---")
    print("Train CV R2 :", r2_score(y_train, y_pred_train_random))
    print("Test     R2 :", r2_score(y_test, y_pred_test_random))
    print("Test     MSE:", mean_squared_error(y_test, y_pred_test_random))
    print("Test     MAE:", mean_absolute_error(y_test, y_pred_test_random))

    # Method 2: average the predictions of the k models with the highest CV_R2
    top_k_models = ranked_models.head(k)
    # Rebuild each model's prediction-column name from its metrics row
    top_k_cols = [
        f"pred_{row['FS_Method']}_{row['Model']}_Top{int(row['Features_Used'])}"
        for _, row in top_k_models.iterrows()
    ]

    y_pred_test_top = X_test[top_k_cols].mean(axis=1)

    print(f"\n--- Method 2: Top {k} Models (CV_R2) — Simple Average ---")
    print("Test     R2 :", r2_score(y_test, y_pred_test_top))
    print("Test     MSE:", mean_squared_error(y_test, y_pred_test_top))
    print("Test     MAE:", mean_absolute_error(y_test, y_pred_test_top))

# ---------------------
# Method 3: Feature Selection + Meta Model (on model predictions)
# ---------------------
print("\n=== Method 3: FS + Meta Models for Top-k Models ===")

all_method3_results = []

# The CV_R2 ordering is the same for every top_k, so it is computed once
cv_ranked_models = metrics_df.sort_values("CV_R2", ascending=False)

for top_k in k_values:
    print(f"\n--- Evaluating Method 3 with Top-{top_k} Models ---")

    # Prediction columns of the top_k base models by CV_R2
    top_k_models = cv_ranked_models.head(top_k)
    top_model_ids = [
        f"pred_{fs}_{mdl}_Top{int(n_feats)}"
        for fs, mdl, n_feats in zip(
            top_k_models["FS_Method"],
            top_k_models["Model"],
            top_k_models["Features_Used"],
        )
    ]

    X_train_top = X_train[top_model_ids]
    X_test_top = X_test[top_model_ids]

    # Never ask for more inputs than there are candidate columns
    largest_n = min(top_k, len(top_model_ids))

    for fs_name, fs_func in fs_methods.items():
        for model_name, model in meta_models.items():
            best_r2, best_n, best_metrics = -np.inf, None, {}

            for n in range(2, largest_n + 1):
                selected_feats = fs_func(X_train_top, y_train, n=n)
                X_train_fs = X_train_top[selected_feats]
                X_test_fs = X_test_top[selected_feats]

                model.fit(X_train_fs, y_train)
                y_pred = model.predict(X_test_fs)

                r2 = r2_score(y_test, y_pred)
                mse = mean_squared_error(y_test, y_pred)
                mae = mean_absolute_error(y_test, y_pred)

                # NOTE(review): best_n is chosen by R2 on the test set, so the
                # reported "Best" metrics are optimistic estimates.
                if r2 > best_r2:
                    best_r2, best_n = r2, n
                    best_metrics = dict(
                        R2=r2,
                        MSE=mse,
                        MAE=mae,
                        Selected_Features=list(selected_feats),
                    )

            print(f"Top-{top_k} → {fs_name} + {model_name}: Best R2 = {best_r2:.4f} (n={best_n})")

            all_method3_results.append({
                "Method": "FS_MetaModel",
                "Top_K": top_k,
                "Meta_Model": model_name,
                "FS_Method": fs_name,
                "Best_n": best_n,
                **best_metrics
            })

# Save Method 3 results
pd.DataFrame(all_method3_results).to_csv("metrics/best_meta_model_results.csv", index=False)
print("\nSaved Method 3 results to: metrics/best_meta_model_results.csv")



=== Method 1 & 2: Ensemble Evaluation for k in [10, 20, ..., Max] ===

--- Method 1: Random 10 Model Averaging ---
Train CV R2 : 0.7713080189463937
Test     R2 : 0.7756311450908048
Test     MSE: 4743.854971616518
Test     MAE: 48.87691787320144

--- Method 2: Top 10 Models (CV_R2) — Simple Average ---
Test     R2 : 0.7737461576423652
Test     MSE: 4783.709465157226
Test     MAE: 50.58740458016972

--- Method 1: Random 20 Model Averaging ---
Train CV R2 : 0.7760999320060629
Test     R2 : 0.7724348260457022
Test     MSE: 4811.435090965608
Test     MAE: 49.34603622716597

--- Method 2: Top 20 Models (CV_R2) — Simple Average ---
Test     R2 : 0.7742757121238147
Test     MSE: 4772.51303746862
Test     MAE: 50.236808263367756

--- Method 1: Random 30 Model Averaging ---
Train CV R2 : 0.7751716608227526
Test     R2 : 0.7754275894551974
Test     MSE: 4748.158770440857
Test     MAE: 49.03559251490448

--- Method 2: Top 30 Models (CV_R2) — Simple Average ---
Test     R2 : 0.7782642945333733
Tes

In [None]:
import json
import pandas as pd
import numpy as np
import joblib
import os
import re
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Load configuration
with open('config.json') as f:
    CONFIG = json.load(f)

# Dataset file name without its extension, with every run of non-word
# characters collapsed to "_"; used as a per-dataset folder name
dataset_slug = re.sub(
    r'\W+', '_',
    os.path.splitext(os.path.basename(CONFIG["dataset_path"]))[0],
)

# One sub-directory per dataset under each output root
base_dirs = {
    name: os.path.join(root, dataset_slug)
    for name, root in (
        ("predictions", CONFIG["predictions_dir"]),
        ("models", "models"),
        ("features", "features"),
        ("metrics", "metrics"),
        ("hyperparams", "hyperparams"),
    )
}

# Create all output directories
for path in base_dirs.values():
    os.makedirs(path, exist_ok=True)

def load_data():
    """Read the dataset named in CONFIG and split it into train and test parts.

    The "Target" column is used as the label; every other column is a feature.

    Returns:
        The result of ``train_test_split``: X_train, X_test, y_train, y_test.
    """
    frame = pd.read_csv(CONFIG["dataset_path"])
    target = frame["Target"]
    features = frame.drop(columns="Target")
    split_options = {
        "test_size": CONFIG["test_size"],
        "random_state": CONFIG["random_state"],
    }
    return train_test_split(features, target, **split_options)

def get_feature_rankings(X_train, y_train):
    """Rank the columns of X_train with four importance-based methods.

    Args:
        X_train: training feature DataFrame.
        y_train: training target values.

    Returns:
        dict mapping a method name ('RF', 'RFE_RF', 'XGB', 'RF_XGB') to the
        list of X_train column names, ordered from best to worst ranked.
    """
    seed = CONFIG["random_state"]
    columns = X_train.columns

    def by_descending(scores):
        # Column names ordered so that the largest score comes first
        return list(columns[np.argsort(scores)[::-1]])

    # feature_importances_ of a random forest with fixed hyperparameters
    forest = RandomForestRegressor(n_estimators=200, max_depth=15,
                                   min_samples_split=5, random_state=seed)
    forest.fit(X_train, y_train)
    forest_importance = forest.feature_importances_

    # Recursive elimination down to a single feature; ranking_ is sorted
    # ascending, so the feature with rank 1 comes first
    eliminator = RFE(estimator=RandomForestRegressor(random_state=seed),
                     n_features_to_select=1)
    eliminator.fit(X_train, y_train)
    elimination_rank = eliminator.ranking_

    # feature_importances_ of an XGBoost regressor with fixed hyperparameters
    booster = XGBRegressor(n_estimators=300, max_depth=5, learning_rate=0.1,
                           random_state=seed)
    booster.fit(X_train, y_train)
    booster_importance = booster.feature_importances_

    return {
        'RF': by_descending(forest_importance),
        'RFE_RF': list(columns[np.argsort(elimination_rank)]),
        'XGB': by_descending(booster_importance),
        # Hybrid: plain average of the two importance vectors
        'RF_XGB': by_descending((forest_importance + booster_importance) / 2),
    }

def get_model_config(model_name):
    """Return the estimator class and hyperparameter grid for a model family.

    Args:
        model_name: 'RF' for RandomForestRegressor or 'XGB' for XGBRegressor.

    Returns:
        (model_class, param_grid), where param_grid maps parameter names to
        lists of candidate values. 'random_state' has the single candidate
        CONFIG["random_state"].

    Raises:
        ValueError: if model_name is neither 'RF' nor 'XGB'.
    """
    if model_name == 'RF':
        model_class = RandomForestRegressor
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'random_state': [CONFIG["random_state"]]
        }
    elif model_name == 'XGB':
        model_class = XGBRegressor
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'random_state': [CONFIG["random_state"]]
        }
    else:
        # Fail with a clear message instead of an UnboundLocalError at return
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected 'RF' or 'XGB'"
        )
    return model_class, param_grid

def tune_hyperparameters(model, param_grid, X, y):
    """Grid-search param_grid for model with cross-validation, refitting on R2.

    Args:
        model: estimator instance passed to GridSearchCV.
        param_grid: dict of parameter name -> list of candidate values.
        X: feature matrix used for the search.
        y: target values used for the search.

    Returns:
        dict with keys:
            "best_model":  the estimator refit by GridSearchCV (refit="r2"),
            "best_params": the parameters of the selected candidate,
            "r2":          mean cross-validated R2 of the selected candidate,
            "mse", "mae":  its mean cross-validated MSE / MAE, sign-flipped
                           back to positive (the scorers are negated errors).
    """
    scoring = {
        'r2': 'r2',
        'mae': 'neg_mean_absolute_error',
        'mse': 'neg_mean_squared_error'
    }
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scoring,
        cv=CONFIG["cv_folds"],
        n_jobs=-1,
        verbose=0,
        refit="r2"
    )
    grid_search.fit(X, y)

    # Use the search's own winner index so the reported metrics always belong
    # to the same candidate as best_estimator_ / best_params_. np.argmax on
    # mean_test_r2 would pick a NaN entry if any candidate failed to fit.
    best = grid_search.best_index_
    results = grid_search.cv_results_
    return {
        "best_model": grid_search.best_estimator_,
        "best_params": grid_search.best_params_,
        "r2": results["mean_test_r2"][best],
        "mse": -1 * results["mean_test_mse"][best],
        "mae": -1 * results["mean_test_mae"][best]
    }

def _cv_default_model(Model, X_train, y_train, k_features):
    """K-fold cross-validate a default-parameter Model on one feature subset.

    Args:
        Model: estimator class, instantiated with only random_state set.
        X_train: training feature DataFrame.
        y_train: training target Series.
        k_features: list of column names to use.

    Returns:
        (oof_preds, mean_r2): out-of-fold predictions positioned like the
        rows of X_train, and the mean of the per-fold R2 scores.
    """
    X_subset = X_train[k_features]
    oof_preds = np.zeros(len(y_train))
    fold_r2 = []
    for train_idx, val_idx in KFold(CONFIG["cv_folds"]).split(X_subset):
        model = Model(random_state=CONFIG["random_state"])
        model.fit(X_subset.iloc[train_idx], y_train.iloc[train_idx])
        preds = model.predict(X_subset.iloc[val_idx])
        oof_preds[val_idx] = preds
        fold_r2.append(r2_score(y_train.iloc[val_idx], preds))
    return oof_preds, np.mean(fold_r2)


def run_pipeline():
    """Train, tune and evaluate RF/XGB models over ranked feature subsets.

    For every feature-ranking method and every model family:
      1. cross-validate a default model on the top-k features for each k and
         keep the k with the highest mean fold R2 (best_k);
      2. for the subset sizes subset_k_start, subset_k_start + subset_k_step,
         ... up to best_k (plus best_k itself), grid-search the
         hyperparameters on the training set and score the refit model on the
         test set.

    CONFIG keys "subset_k_start" and "subset_k_step" are optional; both
    default to 5 when missing or null.

    Side effects: writes one model pickle, one feature list and one
    best-params JSON per evaluated configuration, the metrics table, and the
    test / CV prediction tables into the dataset-specific directories listed
    in base_dirs.

    Raises:
        ValueError: if no feature subset yields a finite CV R2 for some
            ranking/model combination.
    """
    X_train, X_test, y_train, y_test = load_data()
    feature_rankings = get_feature_rankings(X_train, y_train)
    all_results = []
    combined_preds = pd.DataFrame({'true': y_test.reset_index(drop=True)})
    cv_train_preds = pd.DataFrame({'true': y_train.reset_index(drop=True)})

    # A missing or null setting would make range() fail with a TypeError
    subset_k_start = CONFIG.get("subset_k_start")
    if subset_k_start is None:
        subset_k_start = 5
    # A subset needs at least one feature
    subset_k_start = max(1, subset_k_start)
    subset_k_step = CONFIG.get("subset_k_step")
    if subset_k_step is None:
        subset_k_step = 5

    for fs_name, features in feature_rankings.items():
        for model_name, Model in [('RF', RandomForestRegressor), ('XGB', XGBRegressor)]:
            print(f"\nEvaluating {model_name} with {fs_name} feature selection...")

            # Step 1: Find best k using CV. The CV output of every k is kept
            # so Step 2 does not have to refit the same folds again.
            cv_by_k = {}
            best_k = None
            best_cv_r2 = -np.inf
            for k in range(1, len(features) + 1):
                cv_by_k[k] = _cv_default_model(Model, X_train, y_train, features[:k])
                avg_r2 = cv_by_k[k][1]
                if avg_r2 > best_cv_r2:
                    best_cv_r2 = avg_r2
                    best_k = k

            if best_k is None:
                raise ValueError(
                    f"No feature subset produced a finite CV R2 for "
                    f"{model_name} with {fs_name} feature selection"
                )

            # Step 2: HPO for spaced subset sizes
            subset_ks = list(range(subset_k_start, best_k + 1, subset_k_step))
            if best_k not in subset_ks:
                subset_ks.append(best_k)

            model_class, param_grid = get_model_config(model_name)

            for k in subset_ks:
                k_features = features[:k]

                # NOTE(review): these CV metrics and out-of-fold predictions
                # come from the default-parameter model, while the test
                # predictions below come from the tuned model.
                fold_preds, cv_r2 = cv_by_k[k]
                cv_mse = mean_squared_error(y_train, fold_preds)
                cv_mae = mean_absolute_error(y_train, fold_preds)

                print(f"  → HPO for {model_name} with {fs_name} using top-{k} features")
                hpo_results = tune_hyperparameters(
                    model_class(random_state=CONFIG["random_state"]),
                    param_grid,
                    X_train[k_features],
                    y_train
                )

                test_preds = hpo_results["best_model"].predict(X_test[k_features])
                test_r2 = r2_score(y_test, test_preds)
                test_mse = mean_squared_error(y_test, test_preds)
                test_mae = mean_absolute_error(y_test, test_preds)

                model_id = f"{fs_name}_{model_name}_Top{k}"
                combined_preds[f'pred_{model_id}'] = test_preds
                cv_train_preds[f'pred_{model_id}'] = fold_preds

                joblib.dump(hpo_results["best_model"], os.path.join(base_dirs["models"], f"Model_{model_id}.pkl"))

                with open(os.path.join(base_dirs["features"], f"Features_{model_id}.json"), 'w') as f:
                    json.dump(k_features, f)

                with open(os.path.join(base_dirs["hyperparams"], f"BestParams_{model_id}.json"), 'w') as f:
                    json.dump(hpo_results["best_params"], f)

                all_results.append({
                    'FS_Method': fs_name,
                    'Model': model_name,
                    'Features_Used': k,
                    'Best_Params': hpo_results["best_params"],
                    'CV_R2': cv_r2,
                    'CV_MSE': cv_mse,
                    'CV_MAE': cv_mae,
                    'Test_R2': test_r2,
                    'Test_MSE': test_mse,
                    'Test_MAE': test_mae
                })

    # Rank configurations by test R2 (1 = best) and persist all tables
    metrics_df = pd.DataFrame(all_results)
    metrics_df['R2_Rank'] = metrics_df['Test_R2'].rank(ascending=False, method='min')
    metrics_df = metrics_df.sort_values('R2_Rank')
    metrics_df.to_csv(os.path.join(base_dirs["metrics"], CONFIG['results_csv_path']), index=False)
    combined_preds.to_csv(os.path.join(base_dirs["predictions"], "all_predictions.csv"), index=False)
    cv_train_preds.to_csv(os.path.join(base_dirs["predictions"], "cv_train_predictions.csv"), index=False)

# Run the whole pipeline only when this code is executed as the main module
if __name__ == '__main__':
    run_pipeline()


Evaluating RF with RF feature selection...
  → HPO for RF with RF using top-1 features
  → HPO for RF with RF using top-2 features
  → HPO for RF with RF using top-3 features
  → HPO for RF with RF using top-4 features
  → HPO for RF with RF using top-5 features
  → HPO for RF with RF using top-6 features

Evaluating XGB with RF feature selection...
  → HPO for XGB with RF using top-1 features
  → HPO for XGB with RF using top-2 features
  → HPO for XGB with RF using top-3 features
  → HPO for XGB with RF using top-4 features
  → HPO for XGB with RF using top-5 features
  → HPO for XGB with RF using top-6 features
  → HPO for XGB with RF using top-7 features
  → HPO for XGB with RF using top-8 features

Evaluating RF with RFE_RF feature selection...
  → HPO for RF with RFE_RF using top-1 features
  → HPO for RF with RFE_RF using top-2 features
  → HPO for RF with RFE_RF using top-3 features
  → HPO for RF with RFE_RF using top-4 features
  → HPO for RF with RFE_RF using top-5 features

In [None]:
import pandas as pd
import numpy as np
import json
import random
import os
import re
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.feature_selection import RFE

# ---------------------
# Load config & setup output
# ---------------------
with open("config.json") as f:
    CONFIG = json.load(f)

# Dataset file name without its extension, with every run of non-word
# characters collapsed to "_"; used as a per-dataset folder name
dataset_file = os.path.basename(CONFIG["dataset_path"])
dataset_slug = re.sub(r'\W+', '_', os.path.splitext(dataset_file)[0])

# Results of this cell go to metrics/<dataset_slug>/
output_dir = os.path.join("metrics", dataset_slug)
os.makedirs(output_dir, exist_ok=True)

# ---------------------
# Load prediction data
# ---------------------
predictions_root = f"{CONFIG['predictions_dir']}/{dataset_slug}"
cv_df = pd.read_csv(f"{predictions_root}/cv_train_predictions.csv")
test_df = pd.read_csv(f"{predictions_root}/all_predictions.csv")
metrics_df = pd.read_csv(f"metrics/{dataset_slug}/{CONFIG['results_csv_path']}")

# The model predictions become the features of the ensemble; "true" is the label
y_train, y_test = cv_df["true"], test_df["true"]
X_train = cv_df.drop(columns="true")
X_test = test_df.drop(columns="true")
model_columns = list(X_train.columns)

# ---------------------
# Feature Selection Methods
# ---------------------
def fs_rf(X, y, n):
    """Return the n columns of X with the largest random-forest importances.

    The columns are returned from most to least important.
    """
    forest = RandomForestRegressor(random_state=42)
    forest.fit(X, y)
    descending = np.argsort(forest.feature_importances_)[::-1]
    return X.columns[descending[:n]]

def fs_rfe_rf(X, y, n):
    """Select n columns of X by recursive feature elimination with a random forest.

    The kept columns are returned in their original X order (they are taken
    through the boolean support_ mask), not ordered by importance.
    """
    selector = RFE(
        estimator=RandomForestRegressor(random_state=42),
        n_features_to_select=n,
    )
    selector.fit(X, y)
    kept_mask = selector.support_
    return X.columns[kept_mask]

def fs_xgb(X, y, n):
    """Return the n columns of X with the largest XGBoost importances.

    The columns are returned from most to least important.
    """
    booster = XGBRegressor(random_state=42)
    booster.fit(X, y)
    descending = np.argsort(booster.feature_importances_)[::-1]
    return X.columns[descending[:n]]

def fs_rf_xgb(X, y, n):
    """Return the n columns of X with the largest averaged RF/XGBoost importances.

    A random forest and an XGBoost regressor are fit on (X, y); their
    feature_importances_ vectors are averaged and the top n columns are
    returned from most to least important.
    """
    forest = RandomForestRegressor(random_state=42)
    booster = XGBRegressor(random_state=42)
    forest.fit(X, y)
    booster.fit(X, y)
    blended = (forest.feature_importances_ + booster.feature_importances_) / 2
    top_positions = np.argsort(blended)[::-1][:n]
    return X.columns[top_positions]

# Feature-selection functions keyed by the method name used in reports
fs_methods = dict(RF=fs_rf, RFE_RF=fs_rfe_rf, XGB=fs_xgb, RF_XGB=fs_rf_xgb)

# Meta-models: estimators fit on the base models' predictions
meta_models = dict(
    LinearRegression=LinearRegression(),
    RandomForest=RandomForestRegressor(random_state=CONFIG["random_state"]),
    XGBoost=XGBRegressor(random_state=CONFIG["random_state"]),
)

# ---------------------
# Loop over Top-K values
# ---------------------
max_models = len(metrics_df)
step = 10
# range() never yields its stop value, so max_models is always appended as
# the final ensemble size: 10, 20, ..., max_models
k_values = [*range(10, max_models, step), max_models]

best_results = []

print("\n=== Evaluating Methods 1, 2, and 3 across Top-K values ===")

# Dedicated generator seeded from the config so the random baseline is
# reproducible between runs (the module-level random.sample is unseeded).
rng = random.Random(CONFIG["random_state"])

# The CV_R2 ordering does not depend on k, so sort the metrics table once.
ranked_models = metrics_df.sort_values("CV_R2", ascending=False)

for k in k_values:
    print(f"\n========== Top-{k} Models ==========")

    # ---------------------
    # Method 1: Random Averaging
    # ---------------------
    random_cols = rng.sample(model_columns, k)
    y_pred_test_random = X_test[random_cols].mean(axis=1)
    r2_random = r2_score(y_test, y_pred_test_random)
    print(f"Method 1 - Random {k}: R2 = {r2_random:.4f}")

    best_results.append({
        "Dataset": dataset_slug,
        "Method": "RandomAverage",
        "Top_K": k,
        "Meta_Model": "None",
        "FS_Method": "None",
        "Best_n": k,
        "R2": r2_random,
        "MSE": mean_squared_error(y_test, y_pred_test_random),
        "MAE": mean_absolute_error(y_test, y_pred_test_random),
        "Selected_Features": random_cols
    })

    # ---------------------
    # Method 2: Top-K Averaging
    # ---------------------
    top_k_models = ranked_models.head(k)
    # Rebuild each model's prediction-column name from its metrics row
    top_k_cols = [
        f"pred_{row['FS_Method']}_{row['Model']}_Top{int(row['Features_Used'])}"
        for _, row in top_k_models.iterrows()
    ]

    y_pred_test_top = X_test[top_k_cols].mean(axis=1)
    r2_top = r2_score(y_test, y_pred_test_top)
    print(f"Method 2 - Top {k}: R2 = {r2_top:.4f}")

    best_results.append({
        "Dataset": dataset_slug,
        "Method": "TopKAverage",
        "Top_K": k,
        "Meta_Model": "None",
        "FS_Method": "None",
        "Best_n": k,
        "R2": r2_top,
        "MSE": mean_squared_error(y_test, y_pred_test_top),
        "MAE": mean_absolute_error(y_test, y_pred_test_top),
        "Selected_Features": top_k_cols
    })

    # ---------------------
    # Method 3: FS + Meta Models
    # ---------------------
    X_train_top = X_train[top_k_cols]
    X_test_top = X_test[top_k_cols]

    for fs_name, fs_func in fs_methods.items():
        for model_name, model in meta_models.items():
            best_r2 = -np.inf
            best_n = None
            best_metrics = {}

            # Never ask for more inputs than there are candidate columns
            for n in range(2, min(k, len(top_k_cols)) + 1):
                selected_feats = fs_func(X_train_top, y_train, n=n)
                X_train_fs = X_train_top[selected_feats]
                X_test_fs = X_test_top[selected_feats]

                model.fit(X_train_fs, y_train)
                y_pred = model.predict(X_test_fs)

                r2 = r2_score(y_test, y_pred)
                mse = mean_squared_error(y_test, y_pred)
                mae = mean_absolute_error(y_test, y_pred)

                # NOTE(review): best_n is chosen by R2 on the test set, so the
                # reported "Best" metrics are optimistic estimates.
                if r2 > best_r2:
                    best_r2 = r2
                    best_n = n
                    best_metrics = {
                        "R2": r2,
                        "MSE": mse,
                        "MAE": mae,
                        "Selected_Features": list(selected_feats)
                    }

            print(f"Method 3 - Top-{k} → {fs_name} + {model_name}: Best R2 = {best_r2:.4f} (n={best_n})")

            best_results.append({
                "Dataset": dataset_slug,
                "Method": "FS_MetaModel",
                "Top_K": k,
                "Meta_Model": model_name,
                "FS_Method": fs_name,
                "Best_n": best_n,
                **best_metrics
            })

# ---------------------
# Save Results
# ---------------------
# One row per evaluated method / Top-K combination
results_path = os.path.join(output_dir, "best_meta_model_results.csv")
results_df = pd.DataFrame(best_results)
results_df.to_csv(results_path, index=False)
print(f"\n Saved results to: {output_dir}/best_meta_model_results.csv")



=== Evaluating Methods 1, 2, and 3 across Top-K values ===

Method 1 - Random 10: R2 = 0.8529
Method 2 - Top 10: R2 = 0.8552
Method 3 - Top-10 → RF + LinearRegression: Best R2 = 0.8515 (n=9)
Method 3 - Top-10 → RF + RandomForest: Best R2 = 0.7411 (n=6)
Method 3 - Top-10 → RF + XGBoost: Best R2 = 0.7585 (n=10)
Method 3 - Top-10 → RFE_RF + LinearRegression: Best R2 = 0.8515 (n=9)
Method 3 - Top-10 → RFE_RF + RandomForest: Best R2 = 0.7386 (n=7)
Method 3 - Top-10 → RFE_RF + XGBoost: Best R2 = 0.7566 (n=9)
Method 3 - Top-10 → XGB + LinearRegression: Best R2 = 0.8592 (n=8)
Method 3 - Top-10 → XGB + RandomForest: Best R2 = 0.7995 (n=2)
Method 3 - Top-10 → XGB + XGBoost: Best R2 = 0.7864 (n=2)
Method 3 - Top-10 → RF_XGB + LinearRegression: Best R2 = 0.8535 (n=2)
Method 3 - Top-10 → RF_XGB + RandomForest: Best R2 = 0.7995 (n=2)
Method 3 - Top-10 → RF_XGB + XGBoost: Best R2 = 0.7864 (n=2)

Method 1 - Random 20: R2 = 0.8547
Method 2 - Top 20: R2 = 0.8522
Method 3 - Top-20 → RF + LinearRegressio