In [1]:
import json
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Load configuration
with open('config-lidar.json') as f:
    CONFIG = json.load(f)

# Create required directories
os.makedirs(CONFIG["predictions_dir"], exist_ok=True)
os.makedirs("models", exist_ok=True)
os.makedirs("features", exist_ok=True)
os.makedirs("metrics", exist_ok=True)

def load_data():
    """Load and preprocess data"""
    data = pd.read_csv(CONFIG["dataset_path"])
    X = data.drop("Target", axis=1)
    y = data["Target"]
    return train_test_split(X, y, 
                          test_size=CONFIG["test_size"],
                          random_state=CONFIG["random_state"])

def get_feature_rankings(X_train, y_train):
    """Get feature rankings using all 4 FS methods"""
    rankings = {}
    
    # Random Forest
    rf = RandomForestRegressor(n_estimators=200,
        max_depth=15,
        min_samples_split=5,
        random_state=CONFIG["random_state"])
    rf.fit(X_train, y_train)
    rankings['RF'] = rf.feature_importances_
    
    # RFE-RF
    rfe = RFE(estimator=RandomForestRegressor(random_state=CONFIG["random_state"]), 
             n_features_to_select=1)
    rfe.fit(X_train, y_train)
    rankings['RFE_RF'] = rfe.ranking_
    
    # XGBoost
    xgb = XGBRegressor(n_estimators=300,
        max_depth=5,
        learning_rate=0.1,
        random_state=CONFIG["random_state"])
    xgb.fit(X_train, y_train)
    rankings['XGB'] = xgb.feature_importances_
    
    # RF-XGB Hybrid
    hybrid = (rankings['RF'] + rankings['XGB']) / 2
    rankings['RF_XGB'] = hybrid
    
    return rankings

def run_pipeline():
    """Main execution pipeline"""
    X_train, X_test, y_train, y_test = load_data()
    feature_rankings = get_feature_rankings(X_train, y_train)
    results = []
    combined_preds = pd.DataFrame({'true': y_test.reset_index(drop=True)})
    
    # Initialize DataFrame to store CV predictions for training data
    cv_train_preds = pd.DataFrame({'true': y_train.reset_index(drop=True)})
    
    for fs_name, scores in feature_rankings.items():
        features = X_train.columns[np.argsort(scores)[::-1]]
        
        for model_name, Model in [('RF', RandomForestRegressor), 
                                 ('XGB', XGBRegressor)]:
            best_metrics = {'cv_r2': -np.inf, 'cv_mse': np.inf, 'cv_mae': np.inf}
            best_k = 0
            
            # Initialize array to store CV predictions for this model
            cv_preds = np.zeros(len(y_train))
            
            # Feature subset evaluation
            for k in range(1, len(features) + 1):
                cv_r2, cv_mse, cv_mae = [], [], []
                
                for train_idx, val_idx in KFold(CONFIG["cv_folds"]).split(X_train):
                    model = Model(random_state=CONFIG["random_state"])
                    model.fit(X_train.iloc[train_idx][features[:k]], y_train.iloc[train_idx])
                    preds = model.predict(X_train.iloc[val_idx][features[:k]])
                    
                    # Store predictions for this fold
                    cv_preds[val_idx] = preds
                    
                    # Calculate metrics
                    cv_r2.append(r2_score(y_train.iloc[val_idx], preds))
                    cv_mse.append(mean_squared_error(y_train.iloc[val_idx], preds))
                    cv_mae.append(mean_absolute_error(y_train.iloc[val_idx], preds))
                
                mean_r2 = np.mean(cv_r2)
                mean_mse = np.mean(cv_mse)
                mean_mae = np.mean(cv_mae)
                
                # Use configured metric for selection
                if CONFIG["metric"] == 'r2' and mean_r2 > best_metrics['cv_r2']:
                    best_metrics = {'cv_r2': mean_r2, 'cv_mse': mean_mse, 'cv_mae': mean_mae}
                    best_k = k
                elif CONFIG["metric"] in ['mse', 'neg_mean_squared_error'] and mean_mse < best_metrics['cv_mse']:
                    best_metrics = {'cv_r2': mean_r2, 'cv_mse': mean_mse, 'cv_mae': mean_mae}
                    best_k = k
                elif CONFIG["metric"] == 'mae' and mean_mae < best_metrics['cv_mae']:
                    best_metrics = {'cv_r2': mean_r2, 'cv_mse': mean_mse, 'cv_mae': mean_mae}
                    best_k = k
            
            # Save CV predictions for this model
            model_id = f"{fs_name}_{model_name}"
            cv_train_preds[f'dataset2_pred_{model_id}'] = cv_preds
            
            # Final model training
            final_model = Model(random_state=CONFIG["random_state"])
            final_model.fit(X_train[features[:best_k]], y_train)
            
            # Generate predictions
            test_pred = final_model.predict(X_test[features[:best_k]])
            
            # Save artifacts
            joblib.dump(final_model, f'models/Save_Model_{model_id}.pkl')
            with open(f'features/Model_Features_{model_id}.json', 'w') as f:
                json.dump(features[:best_k].tolist(), f)
            
            # Store results and predictions
            combined_preds[f'pred_{model_id}'] = test_pred
            test_metrics = {
                'test_r2': r2_score(y_test, test_pred),
                'test_mse': mean_squared_error(y_test, test_pred),
                'test_mae': mean_absolute_error(y_test, test_pred)
            }
            
            results.append({
                'FS_Method': fs_name,
                'Model': model_name,
                'CV_R2': best_metrics['cv_r2'],
                'CV_MSE': best_metrics['cv_mse'],
                'CV_MAE': best_metrics['cv_mae'],
                **test_metrics,
                'Features_Used': best_k
            })
    
    # Save all outputs
    metrics_df = pd.DataFrame(results)
    metrics_df.to_csv(f"metrics/{CONFIG['results_csv_path']}", index=False)
    combined_preds.to_csv(f"{CONFIG['predictions_dir']}/all_predictions.csv", index=False)
    
    # Save CV predictions for training data
    cv_train_preds.to_csv(f"{CONFIG['predictions_dir']}/cv_train_predictions.csv", index=False)

if __name__ == '__main__':
    run_pipeline()

In [14]:
import json
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score

# Load configuration
with open('config-lidar.json') as f:
    CONFIG = json.load(f)

kf=KFold(n_splits=5, random_state=42, shuffle=True)
stage2df = pd.read_csv(f"{CONFIG['predictions_dir']}/cv_train_predictions.csv")
Y2=stage2df["true"]
X2=stage2df.iloc[:,1:]
#lr=RandomForestRegressor(random_state=42)
Ypred = [np.mean(X2.iloc[i,:]) for i in range(X2.shape[0])]
# Calculate metrics
mae = mean_absolute_error(Y2, Ypred)
mse = mean_squared_error(Y2, Ypred)
r2 = r2_score(Y2, Ypred)
print("MAE:" ,mae)
print("mse:",mse)
print("R2 Score:",r2)

MAE: 61.73211398859467
mse: 7704.17608841006
R2 Score: 0.7392772179537377
