In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from base_module import (
    get_preprocessed_data,
    train_and_evaluate,
    save_model_results,
    SVM_PARAMS
)
import time

ALGORITHM_NAME = "SBS_Forward"
DATA_PATH = r'D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv'

print("="*70)
print(f"FEATURE SELECTION: {ALGORITHM_NAME}")
print("="*70)

FEATURE SELECTION: SBS_Forward


In [2]:
print("\n--- Loading and Preprocessing Data ---")

X_train_encoded, X_test_encoded, y_train, y_test, \
    all_encoded_features, all_original_features, feature_mapping = \
    get_preprocessed_data(DATA_PATH)

print(f"\nStarting with {len(all_encoded_features)} encoded features")


--- Loading and Preprocessing Data ---
Loading data from D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv...


  df = pd.read_csv(file_path, na_values='?')


Cleaned data shape: (98053, 43)
Engineering features...
Engineered data shape: (98053, 41)

Preprocessing data...
Ordinal Encoding 31 features...
Encoded features: 40
  - Numeric: 9
  - Categorical (Ordinal): 31

Starting with 40 encoded features


In [3]:
print(f"\n--- Applying Sequential Forward Selection (Sklearn) ---")

# CRITICAL: Your FIFA notebook used LinearRegression (regression)
# For diabetes classification, use LogisticRegression

# Determine number of features to select
n_features_to_select = int(0.7 * len(all_encoded_features))

print(f"\nTarget: Select {n_features_to_select} out of {len(all_encoded_features)} features")
print("Direction: Forward (start with 0, add best feature iteratively)")

# Use a fast estimator for SBS
# LogisticRegression is much faster than SVC for large datasets
estimator = LogisticRegression(
    random_state=42,
    max_iter=1000,
    solver='lbfgs'  # Fast solver
)

# Initialize Sequential Feature Selector
# direction='forward' means start with 0 features, add best one at a time
# cv=3 means use 3-fold cross-validation to evaluate each feature addition
selector = SequentialFeatureSelector(
    estimator=estimator,
    n_features_to_select=n_features_to_select,
    direction='backward',  # Can also use 'backward'
    scoring='accuracy',   # Metric for classification
    cv=3,                 # Cross-validation folds
    n_jobs=-1             # Use all CPU cores
)

print(f"\nRunning SBS...")
print("This will evaluate ~{} feature combinations...".format(
    n_features_to_select * (len(all_encoded_features) - n_features_to_select//2)
))
print("⚠️  SBS is SLOW. This may take 10-30 minutes...\n")

start_time = time.time()
selector.fit(X_train_encoded, y_train)
elapsed = time.time() - start_time

print(f"\n✓ SBS completed in {elapsed/60:.2f} minutes")

# Transform data
X_train_selected = selector.transform(X_train_encoded)
X_test_selected = selector.transform(X_test_encoded)

# Get selected features
selected_mask = selector.get_support()
selected_features = [all_encoded_features[i] for i, selected in enumerate(selected_mask) if selected]

print(f"\nFeature Selection Results:")
print(f"  Selected: {len(selected_features)} features")
print(f"  Not selected: {len(all_encoded_features) - len(selected_features)} features")



--- Applying Sequential Forward Selection (Sklearn) ---

Target: Select 28 out of 40 features
Direction: Forward (start with 0, add best feature iteratively)

Running SBS...
This will evaluate ~728 feature combinations...
⚠️  SBS is SLOW. This may take 10-30 minutes...


✓ SBS completed in 5.37 minutes

Feature Selection Results:
  Selected: 28 features
  Not selected: 12 features


In [4]:
print(f"\n--- SBS Feature Analysis ---")

# Create DataFrame
feature_selection = pd.DataFrame({
    'feature': all_encoded_features,
    'selected': selected_mask
})

selected_df = feature_selection[feature_selection['selected']]
rejected_df = feature_selection[~feature_selection['selected']]

print(f"\nFirst 30 selected features:")
print(selected_df.head(30)['feature'].to_string(index=False))

print(f"\nFirst 20 rejected features:")
print(rejected_df.head(20)['feature'].to_string(index=False))

# Save
feature_selection.to_csv('SBS_feature_selection.csv', index=False)
print("\n✓ Saved selection results to 'SBS_feature_selection.csv'")


--- SBS Feature Analysis ---

First 30 selected features:
        time_in_hospital
      num_lab_procedures
          num_procedures
         num_medications
       number_outpatient
        number_emergency
        number_inpatient
        number_diagnoses
                    race
                  gender
       admission_type_id
discharge_disposition_id
     admission_source_id
             diabetesMed
          diag_1_grouped
               metformin
             nateglinide
               glyburide
            pioglitazone
                miglitol
              tolazamide
                 examide
             citoglipton
     glyburide-metformin
     glipizide-metformin
glimepiride-pioglitazone
 metformin-rosiglitazone
  metformin-pioglitazone

First 20 rejected features:
   age_encoded
        change
   repaglinide
chlorpropamide
   glimepiride
 acetohexamide
     glipizide
   tolbutamide
 rosiglitazone
      acarbose
  troglitazone
       insulin

✓ Saved selection results to 'S

In [5]:
print(f"\n--- Training Model on SBS-Selected Features ---")

clf, y_pred, report = train_and_evaluate(
    X_train_selected, X_test_selected,
    y_train, y_test,
    **SVM_PARAMS
)

print(f"\nModel Performance:")
print(f"  Accuracy: {report['accuracy']:.4f}")
print(f"  Precision (class 0): {report['0']['precision']:.4f}")
print(f"  Recall (class 0): {report['0']['recall']:.4f}")
print(f"  F1-score (class 0): {report['0']['f1-score']:.4f}")
print(f"  Precision (class 1): {report['1']['precision']:.4f}")
print(f"  Recall (class 1): {report['1']['recall']:.4f}")
print(f"  F1-score (class 1): {report['1']['f1-score']:.4f}")



--- Training Model on SBS-Selected Features ---

Training SVM classifier...




Evaluating...

Model Performance:
  Accuracy: 0.4037
  Precision (class 0): 0.8599
  Recall (class 0): 0.3916
  F1-score (class 0): 0.5381
  Precision (class 1): 0.0944
  Recall (class 1): 0.4984
  F1-score (class 1): 0.1587


In [6]:

print(f"\n--- Saving Results ---")

save_model_results(
    algorithm_name=ALGORITHM_NAME,
    selected_encoded_features=selected_features,
    all_encoded_features=all_encoded_features,
    all_original_features=all_original_features,
    feature_mapping=feature_mapping,
    report=report,
    results_csv='study_results.csv'
)

print("\n" + "="*70)
print(f"{ALGORITHM_NAME} COMPLETE")
print("="*70)


--- Saving Results ---

Saving results for: SBS_Forward
✓ Appended to D:\ACTUAL STUDY MATERIAL\IPD\src\Results\study_results.csv
✓ Saved to D:\ACTUAL STUDY MATERIAL\IPD\src\Results\sbs_forward_results.json

  Original: 28/40 features
  Encoded:  28/40 features
  Accuracy: 0.4037
  F1 (class 1): 0.1587

SBS_Forward COMPLETE
