In [17]:
import json
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Load configuration
with open('config.json') as f:
    CONFIG = json.load(f)

# Create required directories
os.makedirs(CONFIG["predictions_dir"], exist_ok=True)
os.makedirs("models", exist_ok=True)
os.makedirs("features", exist_ok=True)
os.makedirs("metrics", exist_ok=True)

def load_data():
    """Load and preprocess data"""
    data = pd.read_csv(CONFIG["dataset_path"])
    X = data.drop("Target", axis=1)
    y = data["Target"]
    return train_test_split(X, y, 
                          test_size=CONFIG["test_size"],
                          random_state=CONFIG["random_state"])

def get_feature_rankings(X_train, y_train):
    """Get feature rankings using all 4 FS methods"""
    rankings = {}
    
    # Random Forest
    rf = RandomForestRegressor(n_estimators=200,
        max_depth=15,
        min_samples_split=5,
        random_state=CONFIG["random_state"])
    rf.fit(X_train, y_train)
    rankings['RF'] = rf.feature_importances_
    
    # RFE-RF
    rfe = RFE(estimator=RandomForestRegressor(random_state=CONFIG["random_state"]), 
             n_features_to_select=1)
    rfe.fit(X_train, y_train)
    rankings['RFE_RF'] = rfe.ranking_
    
    # XGBoost
    xgb = XGBRegressor(n_estimators=300,
        max_depth=5,
        learning_rate=0.1,
        random_state=CONFIG["random_state"])
    xgb.fit(X_train, y_train)
    rankings['XGB'] = xgb.feature_importances_
    
    # RF-XGB Hybrid
    hybrid = (rankings['RF'] + rankings['XGB']) / 2
    rankings['RF_XGB'] = hybrid
    
    return rankings

def run_pipeline():
    """Main execution pipeline"""
    X_train, X_test, y_train, y_test = load_data()
    feature_rankings = get_feature_rankings(X_train, y_train)
    results = []
    combined_preds = pd.DataFrame({'true': y_test.reset_index(drop=True)})
    
    # Initialize DataFrame to store CV predictions for training data
    cv_train_preds = pd.DataFrame({'true': y_train.reset_index(drop=True)})
    
    for fs_name, scores in feature_rankings.items():
        features = X_train.columns[np.argsort(scores)[::-1]]
        
        for model_name, Model in [('RF', RandomForestRegressor), 
                                 ('XGB', XGBRegressor)]:
            best_metrics = {'cv_r2': -np.inf, 'cv_mse': np.inf, 'cv_mae': np.inf}
            best_k = 0
            
            # Initialize array to store CV predictions for this model
            cv_preds = np.zeros(len(y_train))
            
            # Feature subset evaluation
            for k in range(1, len(features) + 1):
                cv_r2, cv_mse, cv_mae = [], [], []
                
                for train_idx, val_idx in KFold(CONFIG["cv_folds"]).split(X_train):
                    model = Model(random_state=CONFIG["random_state"])
                    model.fit(X_train.iloc[train_idx][features[:k]], y_train.iloc[train_idx])
                    preds = model.predict(X_train.iloc[val_idx][features[:k]])
                    
                    # Store predictions for this fold
                    cv_preds[val_idx] = preds
                    
                    # Calculate metrics
                    cv_r2.append(r2_score(y_train.iloc[val_idx], preds))
                    cv_mse.append(mean_squared_error(y_train.iloc[val_idx], preds))
                    cv_mae.append(mean_absolute_error(y_train.iloc[val_idx], preds))
                
                mean_r2 = np.mean(cv_r2)
                mean_mse = np.mean(cv_mse)
                mean_mae = np.mean(cv_mae)
                
                # Use configured metric for selection
                if CONFIG["metric"] == 'r2' and mean_r2 > best_metrics['cv_r2']:
                    best_metrics = {'cv_r2': mean_r2, 'cv_mse': mean_mse, 'cv_mae': mean_mae}
                    best_k = k
                elif CONFIG["metric"] in ['mse', 'neg_mean_squared_error'] and mean_mse < best_metrics['cv_mse']:
                    best_metrics = {'cv_r2': mean_r2, 'cv_mse': mean_mse, 'cv_mae': mean_mae}
                    best_k = k
                elif CONFIG["metric"] == 'mae' and mean_mae < best_metrics['cv_mae']:
                    best_metrics = {'cv_r2': mean_r2, 'cv_mse': mean_mse, 'cv_mae': mean_mae}
                    best_k = k
            
            # Save CV predictions for this model
            model_id = f"{fs_name}_{model_name}"
            cv_train_preds[f'pred_{model_id}'] = cv_preds
            
            # Final model training
            final_model = Model(random_state=CONFIG["random_state"])
            final_model.fit(X_train[features[:best_k]], y_train)
            
            # Generate predictions
            test_pred = final_model.predict(X_test[features[:best_k]])
            
            # Save artifacts
            joblib.dump(final_model, f'models/Model_{model_id}.pkl')
            with open(f'features/Features_{model_id}.json', 'w') as f:
                json.dump(features[:best_k].tolist(), f)
            
            # Store results and predictions
            combined_preds[f'pred_{model_id}'] = test_pred
            test_metrics = {
                'test_r2': r2_score(y_test, test_pred),
                'test_mse': mean_squared_error(y_test, test_pred),
                'test_mae': mean_absolute_error(y_test, test_pred)
            }
            
            results.append({
                'FS_Method': fs_name,
                'Model': model_name,
                'CV_R2': best_metrics['cv_r2'],
                'CV_MSE': best_metrics['cv_mse'],
                'CV_MAE': best_metrics['cv_mae'],
                **test_metrics,
                'Features_Used': best_k
            })
    
    # Save all outputs
    metrics_df = pd.DataFrame(results)
    metrics_df.to_csv(f"metrics/{CONFIG['results_csv_path']}", index=False)
    combined_preds.to_csv(f"{CONFIG['predictions_dir']}/all_predictions.csv", index=False)
    
    # Save CV predictions for training data
    cv_train_preds.to_csv(f"{CONFIG['predictions_dir']}/cv_train_predictions.csv", index=False)

if __name__ == '__main__':
    run_pipeline()

In [19]:
import json
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Load configuration
with open('config.json') as f:
    CONFIG = json.load(f)

# Create required directories
os.makedirs(CONFIG["predictions_dir"], exist_ok=True)
os.makedirs("models", exist_ok=True)
os.makedirs("features", exist_ok=True)
os.makedirs("metrics", exist_ok=True)
os.makedirs("hyperparams", exist_ok=True)

def load_data():
    """Load and preprocess data"""
    data = pd.read_csv(CONFIG["dataset_path"])
    X = data.drop("Target", axis=1)
    y = data["Target"]
    return train_test_split(X, y, 
                            test_size=CONFIG["test_size"],
                            random_state=CONFIG["random_state"])

def get_feature_rankings(X_train, y_train):
    """Get feature rankings using all 4 FS methods"""
    rankings = {}

    # Random Forest
    rf = RandomForestRegressor(n_estimators=200,
                                max_depth=15,
                                min_samples_split=5,
                                random_state=CONFIG["random_state"])
    rf.fit(X_train, y_train)
    rankings['RF'] = rf.feature_importances_

    # RFE-RF
    rfe = RFE(estimator=RandomForestRegressor(random_state=CONFIG["random_state"]), 
              n_features_to_select=1)
    rfe.fit(X_train, y_train)
    rankings['RFE_RF'] = rfe.ranking_

    # XGBoost
    xgb = XGBRegressor(n_estimators=300,
                       max_depth=5,
                       learning_rate=0.1,
                       random_state=CONFIG["random_state"])
    xgb.fit(X_train, y_train)
    rankings['XGB'] = xgb.feature_importances_

    # RF-XGB Hybrid
    hybrid = (rankings['RF'] + rankings['XGB']) / 2
    rankings['RF_XGB'] = hybrid

    return rankings

def get_model_config(model_name):
    """Return model-specific configuration"""
    if model_name == 'RF':
        model_class = RandomForestRegressor
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'random_state': [CONFIG["random_state"]]
        }
    elif model_name == 'XGB':
        model_class = XGBRegressor
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'random_state': [CONFIG["random_state"]]
        }
    return model_class, param_grid

def tune_hyperparameters(model, param_grid, X, y):
    """Perform hyperparameter tuning using GridSearchCV and return essential info only"""
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=CONFIG["metric"],
        cv=CONFIG["cv_folds"],
        n_jobs=-1,
        verbose=0
    )
    grid_search.fit(X, y)
    preds = grid_search.predict(X)
    return {
        "best_model": grid_search.best_estimator_,
        "best_params": grid_search.best_params_,
        "r2": r2_score(y, preds),
        "mse": mean_squared_error(y, preds),
        "mae": mean_absolute_error(y, preds)
    }

def run_pipeline():
    """Main execution pipeline"""
    X_train, X_test, y_train, y_test = load_data()
    feature_rankings = get_feature_rankings(X_train, y_train)
    subset_sizes = CONFIG.get("subset_sizes", [5, 10, 15])
    
    all_results = []

    for fs_name, scores in feature_rankings.items():
        sorted_features = X_train.columns[np.argsort(scores)[::-1]]

        for model_name in ['RF', 'XGB']:
            model_class, param_grid = get_model_config(model_name)

            for k in subset_sizes:
                if k > len(sorted_features):
                    continue
                
                selected_features = sorted_features[:k]
                model_instance = model_class(random_state=CONFIG["random_state"])

                print(f"Tuning {model_name} with {fs_name} using top-{k} features...")
                results = tune_hyperparameters(model_instance, param_grid,
                                               X_train[selected_features],
                                               y_train)

                model_id = f"{fs_name}_{model_name}_Top{k}"
                joblib.dump(results["best_model"], f"models/Model_{model_id}.pkl")

                with open(f"features/Features_{model_id}.json", 'w') as f:
                    json.dump(selected_features.tolist(), f)
                with open(f"hyperparams/BestParams_{model_id}.json", 'w') as f:
                    json.dump(results["best_params"], f)

                all_results.append({
                    'FS_Method': fs_name,
                    'Model': model_name,
                    'Features_Used': k,
                    'Best_Params': results["best_params"],
                    'R2': results["r2"],
                    'MSE': results["mse"],
                    'MAE': results["mae"]
                })

    # Compile metrics and rank by R2
    metrics_df = pd.DataFrame(all_results)
    metrics_df['R2_Rank'] = metrics_df['R2'].rank(ascending=False, method='min')
    metrics_df = metrics_df.sort_values('R2_Rank')
    metrics_df.to_csv(f"metrics/{CONFIG['results_csv_path']}", index=False)

if __name__ == '__main__':
    run_pipeline()


Tuning RF with RF using top-5 features...
Tuning RF with RF using top-10 features...
Tuning RF with RF using top-15 features...
Tuning XGB with RF using top-5 features...
Tuning XGB with RF using top-10 features...
Tuning XGB with RF using top-15 features...
Tuning RF with RFE_RF using top-5 features...
Tuning RF with RFE_RF using top-10 features...
Tuning RF with RFE_RF using top-15 features...
Tuning XGB with RFE_RF using top-5 features...
Tuning XGB with RFE_RF using top-10 features...
Tuning XGB with RFE_RF using top-15 features...
Tuning RF with XGB using top-5 features...
Tuning RF with XGB using top-10 features...
Tuning RF with XGB using top-15 features...
Tuning XGB with XGB using top-5 features...
Tuning XGB with XGB using top-10 features...
Tuning XGB with XGB using top-15 features...
Tuning RF with RF_XGB using top-5 features...
Tuning RF with RF_XGB using top-10 features...
Tuning RF with RF_XGB using top-15 features...
Tuning XGB with RF_XGB using top-5 features...
Tuning

In [4]:
import pandas as pd
import numpy as np
import json
import joblib
import os
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import cross_val_score, KFold, train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_selection import RFE, mutual_info_regression
from sklearn.preprocessing import StandardScaler

# Load configuration
with open('config.json') as f:
    CONFIG = json.load(f)

# Load second-stage training data (predictions from first-stage models)
cv_train_preds = pd.read_csv(f"{CONFIG['predictions_dir']}/cv_train_predictions.csv")

# Prepare features (X) and target (y)
X = cv_train_preds.drop(columns=['true'])  # Use model predictions as features
y = cv_train_preds['true']  # True target values

# Split into second-stage Training (80%), Validation (10%), and Test (10%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=CONFIG["random_state"], shuffle=True)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=CONFIG["random_state"], shuffle=True)

# Standardize features (fit only on train, transform val & test)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

### --- Feature Selection for Second Stage --- ###
def select_features_second_stage(X_train, y_train):
    selected_features = {}

    # 1. Random Forest Feature Selection
    rf_selector = RandomForestRegressor(n_estimators=100, random_state=CONFIG["random_state"])
    rf_selector.fit(X_train, y_train)
    feature_importances = rf_selector.feature_importances_
    selected_features["RF"] = X_train.columns[feature_importances > np.percentile(feature_importances, 50)]

    # 2. RFE with Random Forest
    rfe_selector = RFE(RandomForestRegressor(n_estimators=50, random_state=CONFIG["random_state"]), n_features_to_select=10)
    rfe_selector.fit(X_train, y_train)
    selected_features["RFE"] = X_train.columns[rfe_selector.support_]

    # 3. Mutual Information-based Selection
    mi_scores = mutual_info_regression(X_train, y_train)
    selected_features["Mutual_Info"] = X_train.columns[mi_scores > np.percentile(mi_scores, 50)]

    # 4. LASSO Feature Selection
    lasso = LassoCV(cv=5, random_state=CONFIG["random_state"])
    lasso.fit(X_train, y_train)
    selected_features["LASSO"] = X_train.columns[np.abs(lasso.coef_) > 0]

    # Save selected features
    with open("metrics/selected_features_second_stage.json", "w") as f:
        json.dump({k: list(v) for k, v in selected_features.items()}, f)

    return selected_features

# Select features using the best method (Random Forest in this case)
selected_features_second_stage = select_features_second_stage(X_train, y_train)

# Apply selected features to train, validation, and test sets
X_train_selected = X_train[selected_features_second_stage["RF"]]
X_val_selected = X_val[selected_features_second_stage["RF"]]
X_test_selected = X_test[selected_features_second_stage["RF"]]

### --- Model Training & Evaluation --- ###
def cross_val_metrics(model, X, y, kf):
    """Performs cross-validation and returns R2, MAE, and MSE scores."""
    r2_scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
    mae_scores = -cross_val_score(model, X, y, cv=kf, scoring='neg_mean_absolute_error')
    mse_scores = -cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
    return np.mean(r2_scores), np.mean(mae_scores), np.mean(mse_scores)

def train_and_evaluate_models(X_train, X_val, y_train, y_val):
    """Trains models, evaluates on validation set, and returns trained models with metrics."""
    models = {
        "Random Forest": RandomForestRegressor(random_state=CONFIG["random_state"]),
        "Gradient Boosting": GradientBoostingRegressor(random_state=CONFIG["random_state"]),
        "Linear Regression": LinearRegression()
    }

    # Hyperparameter tuning using GridSearchCV
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    }
    
    rf_model = GridSearchCV(RandomForestRegressor(random_state=CONFIG["random_state"]), param_grid, cv=5, scoring='r2')

    kf = KFold(n_splits=CONFIG["cv_folds"], shuffle=True, random_state=CONFIG["random_state"])
    results = []

    trained_models = {}
    for model_name, model in models.items():
        # Cross-validation scores
        cv_r2, cv_mae, cv_mse = cross_val_metrics(model, X_train, y_train, kf)

        # Train the model
        model.fit(X_train, y_train)
        trained_models[model_name] = model

        # Validation scores
        val_preds = model.predict(X_val)
        val_mae = mean_absolute_error(y_val, val_preds)
        val_mse = mean_squared_error(y_val, val_preds)
        val_r2 = r2_score(y_val, val_preds)

        # Store results
        results.append({
            'Model': model_name,
            'CV R2': cv_r2,
            'CV MAE': cv_mae,
            'CV MSE': cv_mse,
            'Validation R2': val_r2,
            'Validation MAE': val_mae,
            'Validation MSE': val_mse
        })

    return trained_models, pd.DataFrame(results)

# Train models and evaluate on validation set
trained_models, results_df = train_and_evaluate_models(X_train_selected, X_val_selected, y_train, y_val)

# Save validation results
results_df.to_csv("metrics/second_stage_model_results.csv", index=False)

### --- Test Set Evaluation --- ###
def evaluate_on_test(models, X_test, y_test):
    """Evaluates trained models on test data and saves results."""
    test_results = []
    for model_name, model in models.items():
        test_preds = model.predict(X_test)
        test_r2 = r2_score(y_test, test_preds)
        test_mae = mean_absolute_error(y_test, test_preds)
        test_mse = mean_squared_error(y_test, test_preds)

        test_results.append({
            'Model': model_name,
            'Test R2': test_r2,
            'Test MAE': test_mae,
            'Test MSE': test_mse
        })

    test_results_df = pd.DataFrame(test_results)
    test_results_df.to_csv("metrics/second_stage_test_results.csv", index=False)

# Evaluate models on test data
evaluate_on_test(trained_models, X_test_selected, y_test)

### --- Save Final Models --- ###
os.makedirs("models", exist_ok=True)
for model_name, model in trained_models.items():
    joblib.dump(model, f"models/{model_name.replace(' ', '_').lower()}_second_stage.pkl")

print("Second-stage feature selection, model training, and test evaluation completed successfully. Results saved.")


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


Second-stage feature selection, model training, and test evaluation completed successfully. Results saved.


In [7]:
import json
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score

# Load configuration
with open('config.json') as f:
    CONFIG = json.load(f)

kf=KFold(n_splits=5, random_state=42, shuffle=True)
stage2df = pd.read_csv(f"{CONFIG['predictions_dir']}/cv_train_predictions.csv")
Y2=stage2df["true"]
X2=stage2df.iloc[:,1:]
#lr=RandomForestRegressor(random_state=42)
Ypred = [np.mean(X2.iloc[i,:]) for i in range(X2.shape[0])]
# Calculate metrics
mae = mean_absolute_error(Y2, Ypred)
mse = mean_squared_error(Y2, Ypred)
r2 = r2_score(Y2, Ypred)
print("MAE:" ,mae)
print("mse:",mse)
print("R2 Score:",r2)

MAE: 57.51619644516473
mse: 5568.921245448621
R2 Score: 0.7051408310226894
