In [19]:
import json
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Read the pipeline settings from the JSON config file
with open('config.json') as f:
    CONFIG = json.load(f)

# Create the output directories up front; already-existing ones are left alone
for _output_dir in (CONFIG["predictions_dir"], "models", "features",
                    "metrics", "hyperparams"):
    os.makedirs(_output_dir, exist_ok=True)

def load_data():
    """Read the configured CSV and split it into train and test partitions.

    The file at ``CONFIG["dataset_path"]`` is loaded, the ``"Target"`` column
    is separated from the remaining columns, and the pair is handed to
    ``train_test_split`` using ``CONFIG["test_size"]`` and
    ``CONFIG["random_state"]``.

    Returns:
        The result of ``train_test_split(features, target, ...)``, which the
        caller unpacks as ``X_train, X_test, y_train, y_test``.
    """
    frame = pd.read_csv(CONFIG["dataset_path"])
    features = frame.drop("Target", axis=1)
    target = frame["Target"]

    split_options = {
        "test_size": CONFIG["test_size"],
        "random_state": CONFIG["random_state"],
    }
    return train_test_split(features, target, **split_options)

def get_feature_rankings(X_train, y_train):
    """Score every feature with four feature-selection methods.

    All returned arrays follow the same convention: one value per column of
    ``X_train``, in column order, where a HIGHER value means a MORE important
    feature. Callers can therefore sort any entry in descending order to get
    the best features first.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        dict mapping method name to a score array:
            'RF'      - RandomForestRegressor ``feature_importances_``
            'RFE_RF'  - recursive feature elimination ranks, inverted so that
                        the best feature has the highest score
            'XGB'     - XGBRegressor ``feature_importances_``
            'RF_XGB'  - element-wise mean of the 'RF' and 'XGB' scores
    """
    rankings = {}

    # Random Forest
    rf = RandomForestRegressor(n_estimators=200,
                               max_depth=15,
                               min_samples_split=5,
                               random_state=CONFIG["random_state"])
    rf.fit(X_train, y_train)
    rankings['RF'] = rf.feature_importances_

    # RFE-RF
    rfe = RFE(estimator=RandomForestRegressor(random_state=CONFIG["random_state"]),
              n_features_to_select=1)
    rfe.fit(X_train, y_train)
    # RFE.ranking_ uses 1 for the best feature and larger numbers for worse
    # ones, the opposite of the importance arrays above. Invert it so the
    # best feature gets the largest score; otherwise a descending sort would
    # select the features RFE eliminated first.
    rankings['RFE_RF'] = rfe.ranking_.max() + 1 - rfe.ranking_

    # XGBoost
    xgb = XGBRegressor(n_estimators=300,
                       max_depth=5,
                       learning_rate=0.1,
                       random_state=CONFIG["random_state"])
    xgb.fit(X_train, y_train)
    rankings['XGB'] = xgb.feature_importances_

    # RF-XGB Hybrid: plain average of the two importance vectors
    hybrid = (rankings['RF'] + rankings['XGB']) / 2
    rankings['RF_XGB'] = hybrid

    return rankings

def get_model_config(model_name):
    """Return the estimator class and hyperparameter grid for a model name.

    Args:
        model_name: 'RF' for RandomForestRegressor or 'XGB' for XGBRegressor.

    Returns:
        Tuple ``(model_class, param_grid)`` where ``param_grid`` is a dict of
        parameter name -> list of candidate values, suitable for GridSearchCV.

    Raises:
        ValueError: If ``model_name`` is not a supported model name.
    """
    supported = ('RF', 'XGB')
    # Fail early with a clear message instead of an UnboundLocalError at the
    # return statement when neither branch below matches.
    if model_name not in supported:
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected one of {supported}"
        )

    if model_name == 'RF':
        model_class = RandomForestRegressor
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'random_state': [CONFIG["random_state"]]
        }
    else:  # 'XGB'
        model_class = XGBRegressor
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'random_state': [CONFIG["random_state"]]
        }
    return model_class, param_grid

def tune_hyperparameters(model, param_grid, X, y):
    """Grid-search hyperparameters with cross-validation and summarise the result.

    The search uses ``CONFIG["metric"]`` as the scoring function and
    ``CONFIG["cv_folds"]`` as the CV setting, and runs on all available cores.

    Args:
        model: Unfitted estimator to tune.
        param_grid: dict of parameter name -> list of candidate values.
        X: Feature matrix used for the search.
        y: Target values used for the search.

    Returns:
        dict with:
            'best_model'  - the refitted best estimator
            'best_params' - the winning parameter combination
            'cv_score'    - mean cross-validated score of the best
                            combination, in units of ``CONFIG["metric"]``
            'r2', 'mse', 'mae' - metrics of the best model's predictions on
                            the SAME ``X``/``y`` it was fitted on. These are
                            in-sample values and will generally be
                            optimistic; prefer 'cv_score' or a held-out set
                            when comparing models.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=CONFIG["metric"],
        cv=CONFIG["cv_folds"],
        n_jobs=-1,
        verbose=0
    )
    grid_search.fit(X, y)
    # Predictions on the fitting data: in-sample metrics only.
    preds = grid_search.predict(X)
    return {
        "best_model": grid_search.best_estimator_,
        "best_params": grid_search.best_params_,
        # Out-of-fold estimate that GridSearchCV already computed.
        "cv_score": grid_search.best_score_,
        "r2": r2_score(y, preds),
        "mse": mean_squared_error(y, preds),
        "mae": mean_absolute_error(y, preds)
    }

def run_pipeline():
    """Run feature selection, tuning, persistence and evaluation end to end.

    For every combination of feature-selection method, model type ('RF',
    'XGB') and subset size ``k`` (from ``CONFIG["subset_sizes"]``, default
    ``[5, 10, 15]``) this:

    1. takes the top-``k`` features by descending score,
    2. tunes the model on the training split,
    3. saves the best model, the selected feature names and the best
       parameters under ``models/``, ``features/`` and ``hyperparams/``,
    4. records training metrics (``R2``/``MSE``/``MAE``) and held-out test
       metrics (``Test_R2``/``Test_MSE``/``Test_MAE``).

    The collected rows are written to ``metrics/<CONFIG['results_csv_path']>``
    sorted by ``Test_R2_Rank``, so the first row is the model that does best
    on the held-out split. ``R2_Rank`` (rank by training R2) is still written.

    Subset sizes larger than the number of available features are skipped.
    If nothing was trained, a message is printed and no CSV is written.
    """
    X_train, X_test, y_train, y_test = load_data()
    feature_rankings = get_feature_rankings(X_train, y_train)
    subset_sizes = CONFIG.get("subset_sizes", [5, 10, 15])

    all_results = []

    for fs_name, scores in feature_rankings.items():
        # Highest score first.
        sorted_features = X_train.columns[np.argsort(scores)[::-1]]

        for model_name in ['RF', 'XGB']:
            model_class, param_grid = get_model_config(model_name)

            for k in subset_sizes:
                if k > len(sorted_features):
                    continue

                selected_features = sorted_features[:k]
                model_instance = model_class(random_state=CONFIG["random_state"])

                print(f"Tuning {model_name} with {fs_name} using top-{k} features...")
                results = tune_hyperparameters(model_instance, param_grid,
                                               X_train[selected_features],
                                               y_train)

                # Score on the held-out split, which the search never saw.
                test_preds = results["best_model"].predict(X_test[selected_features])

                model_id = f"{fs_name}_{model_name}_Top{k}"
                joblib.dump(results["best_model"], f"models/Model_{model_id}.pkl")

                with open(f"features/Features_{model_id}.json", 'w') as f:
                    json.dump(selected_features.tolist(), f)
                with open(f"hyperparams/BestParams_{model_id}.json", 'w') as f:
                    json.dump(results["best_params"], f)

                all_results.append({
                    'FS_Method': fs_name,
                    'Model': model_name,
                    'Features_Used': k,
                    'Best_Params': results["best_params"],
                    'R2': results["r2"],
                    'MSE': results["mse"],
                    'MAE': results["mae"],
                    'Test_R2': r2_score(y_test, test_preds),
                    'Test_MSE': mean_squared_error(y_test, test_preds),
                    'Test_MAE': mean_absolute_error(y_test, test_preds)
                })

    # With no rows the DataFrame has no 'R2' column and ranking would raise.
    if not all_results:
        print("No models were trained; check 'subset_sizes' against the "
              "number of available features.")
        return

    # Compile metrics; keep the training-R2 rank and add a held-out rank
    metrics_df = pd.DataFrame(all_results)
    metrics_df['R2_Rank'] = metrics_df['R2'].rank(ascending=False, method='min')
    metrics_df['Test_R2_Rank'] = metrics_df['Test_R2'].rank(ascending=False,
                                                            method='min')
    # Order by held-out rank: training R2 rewards overfitting.
    metrics_df = metrics_df.sort_values('Test_R2_Rank')
    metrics_df.to_csv(f"metrics/{CONFIG['results_csv_path']}", index=False)

# Run the full pipeline only when this module is executed directly,
# not when it is imported.
if __name__ == '__main__':
    run_pipeline()


Tuning RF with RF using top-5 features...
Tuning RF with RF using top-10 features...
Tuning RF with RF using top-15 features...
Tuning XGB with RF using top-5 features...
Tuning XGB with RF using top-10 features...
Tuning XGB with RF using top-15 features...
Tuning RF with RFE_RF using top-5 features...
Tuning RF with RFE_RF using top-10 features...
Tuning RF with RFE_RF using top-15 features...
Tuning XGB with RFE_RF using top-5 features...
Tuning XGB with RFE_RF using top-10 features...
Tuning XGB with RFE_RF using top-15 features...
Tuning RF with XGB using top-5 features...
Tuning RF with XGB using top-10 features...
Tuning RF with XGB using top-15 features...
Tuning XGB with XGB using top-5 features...
Tuning XGB with XGB using top-10 features...
Tuning XGB with XGB using top-15 features...
Tuning RF with RF_XGB using top-5 features...
Tuning RF with RF_XGB using top-10 features...
Tuning RF with RF_XGB using top-15 features...
Tuning XGB with RF_XGB using top-5 features...
Tuning