In [35]:
import json
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Read the pipeline settings from disk
with open('config.json') as config_file:
    CONFIG = json.load(config_file)

# Make sure every output directory exists before anything is written to it
for output_dir in (CONFIG["predictions_dir"], "models", "features", "metrics", "hyperparams"):
    os.makedirs(output_dir, exist_ok=True)

def load_data():
    """Read the dataset and split it into train and test partitions.

    The CSV at ``CONFIG["dataset_path"]`` is loaded, the ``"Target"`` column
    is used as the response, and every other column is used as a predictor.

    Returns:
        The result of ``train_test_split(X, y, ...)`` called with
        ``CONFIG["test_size"]`` and ``CONFIG["random_state"]``.
    """
    frame = pd.read_csv(CONFIG["dataset_path"])
    predictors = frame.drop(columns=["Target"])
    target = frame["Target"]
    split_options = {
        "test_size": CONFIG["test_size"],
        "random_state": CONFIG["random_state"],
    }
    return train_test_split(predictors, target, **split_options)

def get_feature_rankings(X_train, y_train):
    """Order the training columns from most to least important, four ways.

    Args:
        X_train: Training predictors; its ``columns`` supply the feature names.
        y_train: Training target values.

    Returns:
        dict mapping a strategy name to a list of column names, best first:
            'RF'      - random-forest ``feature_importances_``
            'RFE_RF'  - recursive feature elimination around a random forest
            'XGB'     - XGBoost ``feature_importances_``
            'RF_XGB'  - mean of the RF and XGB importances
    """
    seed = CONFIG["random_state"]
    columns = X_train.columns

    def by_descending_score(scores):
        # argsort is ascending, so reverse it to put the largest score first
        return list(columns[np.argsort(scores)[::-1]])

    forest = RandomForestRegressor(n_estimators=200, max_depth=15,
                                   min_samples_split=5, random_state=seed)
    forest.fit(X_train, y_train)
    forest_importance = forest.feature_importances_

    # Eliminating down to a single feature gives every column its own rank
    eliminator = RFE(estimator=RandomForestRegressor(random_state=seed),
                     n_features_to_select=1)
    eliminator.fit(X_train, y_train)

    booster = XGBRegressor(n_estimators=300, max_depth=5, learning_rate=0.1,
                           random_state=seed)
    booster.fit(X_train, y_train)
    booster_importance = booster.feature_importances_

    blended_importance = (forest_importance + booster_importance) / 2

    return {
        'RF': by_descending_score(forest_importance),
        # RFE ranks are sorted ascending (lowest rank number first)
        'RFE_RF': list(columns[np.argsort(eliminator.ranking_)]),
        'XGB': by_descending_score(booster_importance),
        'RF_XGB': by_descending_score(blended_importance),
    }

def get_model_config(model_name):
    """Return the estimator class and hyperparameter grid for a model name.

    Args:
        model_name: ``'RF'`` for ``RandomForestRegressor`` or ``'XGB'`` for
            ``XGBRegressor``.

    Returns:
        A ``(model_class, param_grid)`` tuple. ``param_grid`` is a dict of
        candidate value lists; ``random_state`` is pinned to the single value
        ``CONFIG["random_state"]``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == 'RF':
        model_class = RandomForestRegressor
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'random_state': [CONFIG["random_state"]]
        }
    elif model_name == 'XGB':
        model_class = XGBRegressor
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'random_state': [CONFIG["random_state"]]
        }
    else:
        # Fail with a clear message instead of an UnboundLocalError at the
        # return statement below.
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected 'RF' or 'XGB'"
        )
    return model_class, param_grid

def tune_hyperparameters(model, param_grid, X, y):
    """Grid-search ``param_grid`` for ``model`` using cross-validation.

    The search is scored with ``CONFIG["metric"]``, uses ``CONFIG["cv_folds"]``
    as its ``cv`` setting, and runs with ``n_jobs=-1``.

    Args:
        model: Unfitted estimator to tune.
        param_grid: Dict of parameter names to lists of candidate values.
        X: Training predictors.
        y: Training target.

    Returns:
        dict with:
            "best_model":  the refitted best estimator,
            "best_params": the winning parameter combination,
            "r2":          the mean cross-validated score of that combination.
                           The key name is kept for callers; the value is an
                           R^2 only when ``CONFIG["metric"]`` is an R^2 scorer.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=CONFIG["metric"],
        cv=CONFIG["cv_folds"],
        n_jobs=-1,
        verbose=0
    )
    grid_search.fit(X, y)
    return {
        "best_model": grid_search.best_estimator_,
        "best_params": grid_search.best_params_,
        # best_score_ is the mean CV score belonging to best_params_. Taking
        # np.max over every candidate's mean score instead yields NaN as soon
        # as one candidate's fit failed (failed fits are scored NaN by default).
        "r2": grid_search.best_score_
    }

def _cross_validate_default(model_cls, X_train, y_train, k_features):
    """Cross-validate an untuned ``model_cls`` on the columns ``k_features``.

    Uses ``KFold(CONFIG["cv_folds"])`` and builds each fold's model with only
    ``random_state=CONFIG["random_state"]`` set.

    Returns:
        ``(mean_r2, oof_preds)``: the mean of the per-fold R^2 scores, and an
        array with one out-of-fold prediction per row of ``y_train``.
    """
    oof_preds = np.zeros(len(y_train))
    fold_r2 = []
    for train_idx, val_idx in KFold(CONFIG["cv_folds"]).split(X_train):
        model = model_cls(random_state=CONFIG["random_state"])
        model.fit(X_train.iloc[train_idx][k_features], y_train.iloc[train_idx])
        preds = model.predict(X_train.iloc[val_idx][k_features])
        oof_preds[val_idx] = preds
        fold_r2.append(r2_score(y_train.iloc[val_idx], preds))
    return np.mean(fold_r2), oof_preds


def run_pipeline():
    """Run feature selection, subset-size search, tuning and evaluation.

    For every feature ranking from ``get_feature_rankings`` and each of the
    RF / XGB regressors:

    1. Find ``best_k``, the smallest top-k prefix of the ranking with the
       highest mean cross-validated R^2 for an untuned model.
    2. For k = 5, 10, ... up to ``best_k`` (plus ``best_k`` itself), tune
       hyperparameters with ``tune_hyperparameters``, evaluate the tuned model
       on the test split, and persist the model, feature list and best
       parameters.

    Files written:
        models/Model_<id>.pkl, features/Features_<id>.json,
        hyperparams/BestParams_<id>.json, metrics/<CONFIG['results_csv_path']>,
        and all_predictions.csv / cv_train_predictions.csv under
        ``CONFIG['predictions_dir']``.

    Note:
        ``CV_R2`` and the columns of cv_train_predictions.csv come from the
        untuned cross-validation models, while the test metrics and
        all_predictions.csv come from the tuned model. ``R2_Rank`` orders the
        results by ``Test_R2``.

    Raises:
        ValueError: If no subset size could be selected for a ranking.
    """
    X_train, X_test, y_train, y_test = load_data()
    feature_rankings = get_feature_rankings(X_train, y_train)
    all_results = []
    combined_preds = pd.DataFrame({'true': y_test.reset_index(drop=True)})
    cv_train_preds = pd.DataFrame({'true': y_train.reset_index(drop=True)})

    for fs_name, features in feature_rankings.items():
        for model_name, Model in [('RF', RandomForestRegressor), ('XGB', XGBRegressor)]:
            print(f"\nEvaluating {model_name} with {fs_name} feature selection...")

            # The estimator class and grid depend only on the model name, so
            # look them up once rather than once per subset size.
            model_class, param_grid = get_model_config(model_name)

            # Step 1: Find best k using CV
            best_k = None
            best_cv_r2 = -np.inf
            for k in range(1, len(features) + 1):
                avg_r2, _ = _cross_validate_default(Model, X_train, y_train, features[:k])
                # Strict comparison keeps the smallest k among equal scores
                if avg_r2 > best_cv_r2:
                    best_cv_r2 = avg_r2
                    best_k = k

            if best_k is None:
                raise ValueError(
                    f"No feature subset could be scored for {model_name} "
                    f"with {fs_name} feature selection"
                )

            # Step 2: HPO for spaced subset sizes [5, 10, ..., best_k]
            subset_ks = list(range(5, best_k + 1, 5))
            if best_k not in subset_ks:
                subset_ks.append(best_k)

            for k in subset_ks:
                k_features = features[:k]
                avg_r2, fold_preds = _cross_validate_default(
                    Model, X_train, y_train, k_features
                )

                print(f"  → HPO for {model_name} with {fs_name} using top-{k} features")
                hpo_results = tune_hyperparameters(
                    model_class(random_state=CONFIG["random_state"]),
                    param_grid,
                    X_train[k_features],
                    y_train
                )

                test_preds = hpo_results["best_model"].predict(X_test[k_features])
                test_r2 = r2_score(y_test, test_preds)
                test_mse = mean_squared_error(y_test, test_preds)
                test_mae = mean_absolute_error(y_test, test_preds)

                model_id = f"{fs_name}_{model_name}_Top{k}"
                combined_preds[f'pred_{model_id}'] = test_preds
                cv_train_preds[f'pred_{model_id}'] = fold_preds

                joblib.dump(hpo_results["best_model"], f"models/Model_{model_id}.pkl")

                with open(f"features/Features_{model_id}.json", 'w') as f:
                    json.dump(k_features, f)

                with open(f"hyperparams/BestParams_{model_id}.json", 'w') as f:
                    json.dump(hpo_results["best_params"], f)

                all_results.append({
                    'FS_Method': fs_name,
                    'Model': model_name,
                    'Features_Used': k,
                    'Best_Params': hpo_results["best_params"],
                    'CV_R2': avg_r2,
                    'Test_R2': test_r2,
                    'Test_MSE': test_mse,
                    'Test_MAE': test_mae
                })

    metrics_df = pd.DataFrame(all_results)
    metrics_df['R2_Rank'] = metrics_df['Test_R2'].rank(ascending=False, method='min')
    metrics_df = metrics_df.sort_values('R2_Rank')
    metrics_df.to_csv(f"metrics/{CONFIG['results_csv_path']}", index=False)
    combined_preds.to_csv(f"{CONFIG['predictions_dir']}/all_predictions.csv", index=False)
    cv_train_preds.to_csv(f"{CONFIG['predictions_dir']}/cv_train_predictions.csv", index=False)

# Run the full pipeline when this code is executed as the main module
if __name__ == '__main__':
    run_pipeline()



Evaluating RF with RF feature selection...
  → HPO for RF with RF using top-5 features
  → HPO for RF with RF using top-10 features
  → HPO for RF with RF using top-15 features
  → HPO for RF with RF using top-16 features

Evaluating XGB with RF feature selection...
  → HPO for XGB with RF using top-5 features
  → HPO for XGB with RF using top-10 features
  → HPO for XGB with RF using top-15 features
  → HPO for XGB with RF using top-20 features
  → HPO for XGB with RF using top-25 features
  → HPO for XGB with RF using top-30 features
  → HPO for XGB with RF using top-35 features
  → HPO for XGB with RF using top-36 features

Evaluating RF with RFE_RF feature selection...
  → HPO for RF with RFE_RF using top-5 features
  → HPO for RF with RFE_RF using top-10 features
  → HPO for RF with RFE_RF using top-15 features
  → HPO for RF with RFE_RF using top-20 features

Evaluating XGB with RFE_RF feature selection...
  → HPO for XGB with RFE_RF using top-5 features
  → HPO for XGB with RFE

In [41]:
# Simple ensemble baseline: average every model's test-set prediction per row
# and score that average against the true target.
stage2_test = pd.read_csv(f"{CONFIG['predictions_dir']}/all_predictions.csv")
Y_test = stage2_test["true"]
# Every column other than "true" holds one model's predictions; select them by
# name rather than by position.
X_test = stage2_test.drop(columns="true")
# Row-wise mean in one vectorized call instead of a Python loop over rows
Y_pred_test = X_test.mean(axis=1).tolist()

mae_test = mean_absolute_error(Y_test, Y_pred_test)
mse_test = mean_squared_error(Y_test, Y_pred_test)
r2_test = r2_score(Y_test, Y_pred_test)

print("TEST MAE:", mae_test)
print("TEST MSE:", mse_test)
print("TEST R2 Score:", r2_test)

TEST MAE: 49.00236150419841
TEST MSE: 4743.343350595725
TEST R2 Score: 0.7756553430950164
