In [14]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from base_module import (
    get_preprocessed_data,
    train_and_evaluate,
    save_model_results,
    SVM_PARAMS
)

# Label used in the banners below and passed to save_model_results later on.
ALGORITHM_NAME = "Boruta"

# Location of the raw dataset. The default is the original author's local
# Windows path; set the DIABETIC_DATA_PATH environment variable to run the
# notebook on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "DIABETIC_DATA_PATH",
    r'D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv',
)

banner = "=" * 70
print(banner)
print(f"FEATURE SELECTION: {ALGORITHM_NAME}")
print(banner)

FEATURE SELECTION: Boruta


In [15]:
print("\n--- Loading and Preprocessing Data ---")

# get_preprocessed_data hands back seven values; unpack them inside
# parentheses so no backslash line continuations are needed.
(
    X_train_encoded,
    X_test_encoded,
    y_train,
    y_test,
    all_encoded_features,
    all_original_features,
    feature_mapping,
) = get_preprocessed_data(DATA_PATH)

encoded_feature_count = len(all_encoded_features)
print(f"\nStarting with {encoded_feature_count} encoded features")


--- Loading and Preprocessing Data ---
Loading data from D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv...


  df = pd.read_csv(file_path, na_values='?')


Cleaned data shape: (98053, 43)
Engineering features...
Engineered data shape: (98053, 41)

Preprocessing data...
Ordinal Encoding 31 features...
Encoded features: 40
  - Numeric: 9
  - Categorical (Ordinal): 31

Starting with 40 encoded features


In [16]:

print("\n--- Applying Boruta Algorithm ---")

# Base estimator whose feature importances Boruta compares against the
# importances of shuffled "shadow" copies of the features.
rf_estimator = RandomForestClassifier(
    n_estimators=100,         # placeholder: overridden every iteration because n_estimators='auto' below
    max_depth=5,              # shallow, pruned trees (the BorutaPy README recommends a depth of 3-7)
    random_state=42,
    n_jobs=-1,                # use all available cores
    class_weight='balanced',  # reweight classes to counter class imbalance
)

# Boruta configuration.
#   perc     - percentile of the shadow-feature importances used as the
#              acceptance threshold. 100 (the BorutaPy default) compares each
#              feature against the strongest shadow feature and is the most
#              stringent setting. The value used here, 25, is far more
#              permissive: more features get confirmed, at the cost of more
#              false positives.
#   alpha    - level at which the corrected p-values are rejected
#              (BorutaPy default: 0.05; 0.001 is stricter).
#   two_step - True keeps BorutaPy's two-step multiple-testing correction;
#              False falls back to Bonferroni only.
#   max_iter - upper bound on iterations; raise it if features stay tentative.
#   verbose  - 2 = per-iteration counts, 1 = iteration number only, 0 = silent.
feat_selector = BorutaPy(
    estimator=rf_estimator,
    n_estimators='auto',
    perc=25,
    alpha=0.001,
    two_step=True,
    max_iter=100,
    random_state=42,
    verbose=2,
)

print("\nRunning Boruta (this may take several minutes)...")
print("Boruta will iteratively compare real features against shadow features.\n")

# BorutaPy is fitted on plain arrays and the column selection below indexes
# with a boolean mask, so convert explicitly. np.asarray returns the same
# object when the input is already an ndarray.
X_train_array = np.asarray(X_train_encoded)
X_test_array = np.asarray(X_test_encoded)
y_train_array = np.asarray(y_train).ravel()

feat_selector.fit(X_train_array, y_train_array)

print("\n--- Boruta Results ---")

# support_      -> features Boruta confirmed.
# support_weak_ -> features still tentative when the run ended.
confirmed_mask = np.asarray(feat_selector.support_, dtype=bool)
tentative_mask = np.asarray(feat_selector.support_weak_, dtype=bool)

# The study keeps confirmed + tentative features (Boruta's "weak" selection).
selected_mask = confirmed_mask | tentative_mask

# The masks are positional, so they must line up one-to-one with the names.
if len(all_encoded_features) != selected_mask.shape[0]:
    raise ValueError(
        f"Boruta returned a mask of length {selected_mask.shape[0]} but "
        f"{len(all_encoded_features)} encoded feature names were provided"
    )


def _names_where(mask):
    """Return the encoded feature names whose entry in `mask` is True."""
    return [name for name, keep in zip(all_encoded_features, mask) if keep]


confirmed_features = _names_where(confirmed_mask)
tentative_features = _names_where(tentative_mask)
selected_features = _names_where(selected_mask)

print("\nFeature Selection Summary:")
print(f"  Confirmed features: {len(confirmed_features)}")
print(f"  Tentative features: {len(tentative_features)}")
print(f"  Total selected: {len(selected_features)}")
print(f"  Rejected features: {len(all_encoded_features) - len(selected_features)}")

# Fail here with a clear message instead of handing a zero-column matrix to
# the classifier in a later cell.
if not selected_features:
    raise ValueError(
        "Boruta rejected every feature; relax perc/alpha or raise max_iter"
    )

# Keep only the selected columns in both splits.
X_train_selected = X_train_array[:, selected_mask]
X_test_selected = X_test_array[:, selected_mask]


--- Applying Boruta Algorithm ---

Running Boruta (this may take several minutes)...
Boruta will iteratively compare real features against shadow features.

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	9 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	10 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	11 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	12 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	13 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	14 / 100
Confirmed: 

In [17]:
print("\n--- Boruta Feature Rankings ---")

# One row per encoded feature.
# Ranking: 1 = confirmed. When a run has tentative features they get rank 2
# and rejected features start at 3; when there are none (as in the recorded
# run: 0 tentative, first rejected feature at rank 2) rejected features start
# at 2. Use the boolean columns, not the rank value, to tell the groups apart.
# A stable sort keeps tied features in their original column order.
feature_rankings = pd.DataFrame({
    'feature': all_encoded_features,
    'ranking': feat_selector.ranking_,
    'confirmed': confirmed_mask,
    'tentative': tentative_mask,
    'selected': selected_mask
}).sort_values('ranking', kind='stable')

print("\nTop 20 features (confirmed + tentative):")
print(feature_rankings[feature_rankings['selected']].head(20)[['feature', 'ranking']].to_string(index=False))

# Rows are already ordered by rank, so head(20) yields the rejected features
# that came closest to being selected.
print("\n\nTop 20 rejected features (by importance):")
rejected = feature_rankings[~feature_rankings['selected']].head(20)
print(rejected[['feature', 'ranking']].to_string(index=False))

# Persist the full table (all features, not just the top 20) next to the notebook.
feature_rankings.to_csv('boruta_feature_rankings.csv', index=False)
print("\n✓ Saved detailed rankings to 'boruta_feature_rankings.csv'")


--- Boruta Feature Rankings ---

Top 20 features (confirmed + tentative):
                 feature  ranking
        time_in_hospital        1
      num_lab_procedures        1
          num_procedures        1
         num_medications        1
       number_outpatient        1
        number_emergency        1
        number_inpatient        1
        number_diagnoses        1
             age_encoded        1
                    race        1
                  gender        1
       admission_type_id        1
discharge_disposition_id        1
     admission_source_id        1
                  change        1
             diabetesMed        1
          diag_1_grouped        1
               metformin        1
             repaglinide        1
             nateglinide        1


Top 20 rejected features (by importance):
                 feature  ranking
          chlorpropamide        2
                miglitol        3
     glipizide-metformin        4
              tolazamide       

In [18]:
print("\n--- Training Model on Boruta-Selected Features ---")

# Fit and score the classifier on the Boruta-reduced matrices; SVM_PARAMS is
# forwarded unchanged as keyword arguments.
clf, y_pred, report = train_and_evaluate(
    X_train_selected,
    X_test_selected,
    y_train,
    y_test,
    **SVM_PARAMS,
)

print("\nModel Performance:")
print(f"  Accuracy: {report['accuracy']:.4f}")

# Same three metrics for each class, printed class by class.
metric_rows = (
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('F1-score', 'f1-score'),
)
for class_label in ('0', '1'):
    class_scores = report[class_label]
    for title, metric_key in metric_rows:
        print(f"  {title} (class {class_label}): {class_scores[metric_key]:.4f}")


--- Training Model on Boruta-Selected Features ---

Training SVM classifier...




Evaluating...

Model Performance:
  Accuracy: 0.4212
  Precision (class 0): 0.8820
  Recall (class 0): 0.4013
  F1-score (class 0): 0.5516
  Precision (class 1): 0.1094
  Recall (class 1): 0.5779
  F1-score (class 1): 0.1839


In [19]:
print("\n--- Saving Results ---")

# Record this run's selected features and classification report through the
# shared base_module helper.
save_model_results(
    algorithm_name=ALGORITHM_NAME,
    selected_encoded_features=selected_features,
    all_encoded_features=all_encoded_features,
    all_original_features=all_original_features,
    feature_mapping=feature_mapping,
    report=report,
    results_csv='study_results.csv'
)

banner = "=" * 70
print("\n" + banner)
print(f"{ALGORITHM_NAME} COMPLETE")
print(banner)

n_selected = len(selected_features)
n_total = len(all_encoded_features)
# Guard the percentage against an empty feature list.
selection_rate = 100 * n_selected / n_total if n_total else 0.0

# Summary for the paper. Hyperparameters are read back from the objects that
# were actually fitted so the reported configuration cannot drift from the
# run. feat_selector.n_estimators is used rather than the forest's own value
# because the forest's n_estimators was only a placeholder under 'auto'.
print("\n📊 Summary for Research Paper:")
print("   - Algorithm: Boruta (all-relevant feature selection)")
print(
    f"   - Base estimator: Random Forest "
    f"(n_estimators={feat_selector.n_estimators}, max_depth={rf_estimator.max_depth})"
)
print(
    f"   - Boruta settings: perc={feat_selector.perc}, "
    f"alpha={feat_selector.alpha}, max_iter={feat_selector.max_iter}"
)
print(f"   - Features confirmed: {len(confirmed_features)}")
print(f"   - Features tentative: {len(tentative_features)}")
print(f"   - Total selected: {n_selected} / {n_total}")
print(f"   - Selection rate: {selection_rate:.1f}%")
print(f"   - Model accuracy: {report['accuracy']:.4f}")
print(f"   - F1-score (class 1): {report['1']['f1-score']:.4f}")



--- Saving Results ---

Saving results for: Boruta
✓ Appended to D:\ACTUAL STUDY MATERIAL\IPD\src\Results\study_results.csv
✓ Saved to D:\ACTUAL STUDY MATERIAL\IPD\src\Results\boruta_results.json

  Original: 28/40 features
  Encoded:  28/40 features
  Accuracy: 0.4212
  F1 (class 1): 0.1839

Boruta COMPLETE

📊 Summary for Research Paper:
   - Algorithm: Boruta (all-relevant feature selection)
   - Base estimator: Random Forest (n_estimators=auto, max_depth=5)
   - Features confirmed: 28
   - Features tentative: 0
   - Total selected: 28 / 40
   - Selection rate: 70.0%
   - Model accuracy: 0.4212
   - F1-score (class 1): 0.1839
