In [2]:
import os
import time

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier

from base_module import (
    get_preprocessed_data,
    train_and_evaluate,
    save_model_results,
    SVM_PARAMS,
    RANDOM_STATE
)

# Label for this feature-selection strategy; reused in the banners below and
# passed to save_model_results when the run is recorded.
ALGORITHM_NAME = "XGBoost_Importance"

# Location of the raw dataset CSV.
# Set the DIABETIC_DATA_PATH environment variable to point at your own copy
# instead of editing the notebook. When the variable is unset (or empty) the
# original machine-specific path is used, so existing setups keep working.
DATA_PATH = os.environ.get("DIABETIC_DATA_PATH") or (
    r'D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv'
)

# Banner marking the start of this run in the cell output.
print("="*70)
print(f"FEATURE SELECTION: {ALGORITHM_NAME}")
print("="*70)

FEATURE SELECTION: XGBoost_Importance


In [3]:
print("\n--- Loading and Preprocessing Data ---")

# get_preprocessed_data hands back seven values in a fixed order: the encoded
# train/test feature matrices, the train/test labels, the encoded and original
# feature-name collections, and the mapping between the two.
(
    X_train_encoded,
    X_test_encoded,
    y_train,
    y_test,
    all_encoded_features,
    all_original_features,
    feature_mapping,
) = get_preprocessed_data(DATA_PATH)

# Report the size of the candidate pool before any selection is applied.
n_encoded = len(all_encoded_features)
print(f"\nStarting with {n_encoded} encoded features")


--- Loading and Preprocessing Data ---
Loading data from D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv...


  df = pd.read_csv(file_path, na_values='?')


Cleaned data shape: (98053, 43)
Engineering features...
Engineered data shape: (98053, 41)

Preprocessing data...
Ordinal Encoding 31 features...
Encoded features: 40
  - Numeric: 9
  - Categorical (Ordinal): 31

Starting with 40 encoded features


In [4]:
print(f"\n--- Applying {ALGORITHM_NAME} (SelectFromModel) ---")
start_time = time.time()

# Importance cut-off handed to SelectFromModel. It is named once so that the
# selector and the summary line printed at the end of this cell cannot drift
# apart. Without an explicit threshold SelectFromModel falls back to "mean"
# for this estimator, which contradicts the "median" the summary reports.
importance_threshold = "median"
# Upper bound on how many features may survive the cut-off.
max_selected_features = 28

# --- 1. Calculate scale_pos_weight for imbalanced data ---
# Ratio of negative (0) to positive (1) training labels. This helps XGBoost
# pay more attention to the rare positive class (1).
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"Using scale_pos_weight for imbalance: {scale_pos_weight:.2f}")

# --- 2. Define the Estimator ---
# We use XGBClassifier for this classification problem
estimator = XGBClassifier(
    n_estimators=100,
    random_state=RANDOM_STATE,
    scale_pos_weight=scale_pos_weight,
    n_jobs=-1,
    eval_metric='logloss'
)

# --- 3. Fit the model ---
print("Fitting XGBoost model to find feature importances...")
estimator.fit(X_train_encoded, y_train)

# --- 4. Initialize SelectFromModel ---
# Keep features whose importance is at or above the median importance,
# capped at max_selected_features. prefit=True reuses the estimator fitted
# above instead of refitting it inside the selector.
selector = SelectFromModel(
    estimator,
    prefit=True,
    threshold=importance_threshold,
    max_features=max_selected_features,
)

elapsed = (time.time() - start_time) / 60
print(f"✓ XGBoost fitting complete in {elapsed:.2f} minutes")

# --- 5. Get Results ---
# Apply the same column mask to both splits so they stay aligned.
X_train_selected = selector.transform(X_train_encoded)
X_test_selected = selector.transform(X_test_encoded)

# Boolean mask over the encoded columns; pair it with the feature names to
# recover the names of the surviving columns.
selected_mask = selector.get_support()
selected_features = [
    name for name, keep in zip(all_encoded_features, selected_mask) if keep
]

print("\nFeature Selection Results:")
print(f"  Selected: {len(selected_features)} features (Threshold: '{importance_threshold}')")
print(f"  Eliminated: {len(all_encoded_features) - len(selected_features)} features")


--- Applying XGBoost_Importance (SelectFromModel) ---
Using scale_pos_weight for imbalance: 7.86
Fitting XGBoost model to find feature importances...
✓ XGBoost fitting complete in 0.03 minutes

Feature Selection Results:
  Selected: 20 features (Threshold: 'median')
  Eliminated: 20 features


In [5]:
print(f"\n--- Training Model on {ALGORITHM_NAME}-Selected Features ---")

# Train and score the downstream classifier on the reduced feature matrices.
# SVM_PARAMS is forwarded as keyword arguments; the call returns the fitted
# classifier, the test-set predictions and a report dictionary.
clf, y_pred, report = train_and_evaluate(
    X_train_selected,
    X_test_selected,
    y_train,
    y_test,
    **SVM_PARAMS,
)

print("\nModel Performance:")
print(f"  Accuracy: {report['accuracy']:.4f}")

# (display heading, report key) pairs, printed for class 0 first and then
# class 1 so the output order matches the per-class layout of the report.
metric_rows = (
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1-score", "f1-score"),
)
for class_label in ("0", "1"):
    class_scores = report[class_label]
    for heading, metric_key in metric_rows:
        print(f"  {heading} (class {class_label}): {class_scores[metric_key]:.4f}")


--- Training Model on XGBoost_Importance-Selected Features ---

Training SVM classifier...




Evaluating...

Model Performance:
  Accuracy: 0.4118
  Precision (class 0): 0.8572
  Recall (class 0): 0.4044
  F1-score (class 0): 0.5495
  Precision (class 1): 0.0913
  Recall (class 1): 0.4704
  F1-score (class 1): 0.1529


In [6]:
print("\n--- Saving Results ---")

# Everything save_model_results needs to record this run, gathered in one
# place: the algorithm label, the surviving and full feature lists, the
# encoded-to-original mapping, the evaluation report and the target CSV name.
results_payload = {
    "algorithm_name": ALGORITHM_NAME,
    "selected_encoded_features": selected_features,
    "all_encoded_features": all_encoded_features,
    "all_original_features": all_original_features,
    "feature_mapping": feature_mapping,
    "report": report,
    "results_csv": 'study_results.csv',
}
save_model_results(**results_payload)

# Closing banner for the run.
banner = "=" * 70
print("\n" + banner)
print(f"{ALGORITHM_NAME} COMPLETE")
print(banner)


--- Saving Results ---

Saving results for: XGBoost_Importance
✓ Appended to D:\ACTUAL STUDY MATERIAL\IPD\src\Results\study_results.csv
✓ Saved to D:\ACTUAL STUDY MATERIAL\IPD\src\Results\xgboost_importance_results.json

  Original: 20/40 features
  Encoded:  20/40 features
  Accuracy: 0.4118
  F1 (class 1): 0.1529

XGBoost_Importance COMPLETE
