In [1]:
import os
import time
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif, RFECV
from sklearn.model_selection import StratifiedKFold

from base_module import (
    get_preprocessed_data,
    train_and_evaluate,
    save_model_results,
    SVM_PARAMS,
    RANDOM_STATE
)

# Label used in console banners and passed to save_model_results.
ALGORITHM_NAME = "MSFS_Selector"

# Location of the raw CSV handed to get_preprocessed_data.
# Set the MSFS_DATA_PATH environment variable to point at your own copy of the
# data; when it is unset the original author's local path is used, so existing
# runs behave exactly as before.
DATA_PATH = os.environ.get(
    "MSFS_DATA_PATH",
    r'D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv',
)

# Three-line banner: rule, title, rule.
print("=" * 70, f"FEATURE SELECTION: {ALGORITHM_NAME}", "=" * 70, sep="\n")

FEATURE SELECTION: MSFS_Selector


In [2]:
print("\n--- Loading and Preprocessing Data ---")

# get_preprocessed_data (from base_module) returns seven values; they are
# unpacked one per line so each downstream name is easy to spot.
(
    X_train_encoded,
    X_test_encoded,
    y_train,
    y_test,
    all_encoded_features,
    all_original_features,
    feature_mapping,
) = get_preprocessed_data(DATA_PATH)

n_encoded = len(all_encoded_features)
print(f"\nStarting with {n_encoded} encoded features")


--- Loading and Preprocessing Data ---
Loading data from D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv...


  df = pd.read_csv(file_path, na_values='?')


Cleaned data shape: (98053, 43)
Engineering features...
Engineered data shape: (98053, 41)

Preprocessing data...
Ordinal Encoding 31 features...
Encoded features: 40
  - Numeric: 9
  - Categorical (Ordinal): 31

Starting with 40 encoded features


In [3]:
# Multi-stage feature selection (MSFS) by ensemble voting.
#
# Each iteration runs three selectors on the training data:
#   1. mutual information (filter),
#   2. RFECV wrapped around a random forest (wrapper),
#   3. ExtraTrees feature importances (embedded).
# A feature earns one "iteration vote" when at least two of the three selectors
# pick it; features with votes in a majority of iterations are kept.
#
# Reads:  X_train_encoded, X_test_encoded, y_train, all_encoded_features,
#         RANDOM_STATE, ALGORITHM_NAME (defined in earlier cells / base_module).
# Writes: selected_features, X_train_selected, X_test_selected (used by later
#         cells), plus the intermediate names defined below.
print(f"\n--- Applying {ALGORITHM_NAME} (Ensemble Voting) ---")
print("WARNING: This is very slow and may take 30+ minutes.")

# Wrap the encoded arrays in DataFrames so the selectors can report column names.
X_train_scaled_df = pd.DataFrame(X_train_encoded, columns=all_encoded_features)
X_test_scaled_df = pd.DataFrame(X_test_encoded, columns=all_encoded_features)

feature_votes = defaultdict(int)

# 3 iterations is a minimum. Increase to 5 or 7 for a more stable result.
n_iterations = 3
n_total_features = X_train_scaled_df.shape[1]

# The MI and ExtraTrees stages keep every feature whose score is at or above
# this percentile of all scores. 25 drops roughly the weakest quarter, i.e. it
# keeps about the top 75% (30 of 40 features in the recorded run) -- not the
# "top 50%" the earlier comments claimed. Raise it to 50 for a true top-half cut.
keep_percentile = 25

# Number of selectors (out of 3) that must agree for an iteration vote.
min_method_votes = 2

# KFold for RFECV (using stratified for classification)
cv_splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

start_time = time.time()
for i in range(n_iterations):
    print(f"\n  -> Iteration {i+1}/{n_iterations}...")

    # --- 1. Mutual Information Gain (MIG) ---
    print("     Running MIG...")
    # The seed is shifted per iteration so repeated runs are not identical.
    mig = mutual_info_classif(X_train_scaled_df, y_train, random_state=RANDOM_STATE+i)
    mig_threshold = np.percentile(mig, keep_percentile)
    mig_selected = X_train_scaled_df.columns[mig >= mig_threshold]
    print(f"     ...MIG selected {len(mig_selected)} features")

    # --- 2. RFECV with RandomForest ---
    print("     Running RFECV (this is the slowest part)...")
    rf = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE+i)
    # Use 'f1' scoring for imbalanced classification
    rfecv = RFECV(estimator=rf, step=1, cv=cv_splitter, scoring='f1', n_jobs=-1)
    rfecv.fit(X_train_scaled_df, y_train)
    rfecv_selected = X_train_scaled_df.columns[rfecv.support_]
    print(f"     ...RFECV selected {len(rfecv_selected)} features")

    # --- 3. Embedded Selection (Extra Trees) ---
    print("     Running ExtraTrees...")
    etc = ExtraTreesClassifier(n_jobs=-1, random_state=RANDOM_STATE+i)
    etc.fit(X_train_scaled_df, y_train)
    etc_threshold = np.percentile(etc.feature_importances_, keep_percentile)
    etc_selected = X_train_scaled_df.columns[etc.feature_importances_ >= etc_threshold]
    print(f"     ...ExtraTrees selected {len(etc_selected)} features")

    # --- Tally votes (must appear in at least 2 of 3 methods) ---
    # Walk the columns in their original order rather than iterating a set:
    # set iteration order for strings varies between interpreter runs, which
    # previously made the resulting feature order non-reproducible.
    method_selections = (set(mig_selected), set(rfecv_selected), set(etc_selected))
    for feature in X_train_scaled_df.columns:
        votes = sum(feature in chosen for chosen in method_selections)
        if votes >= min_method_votes:
            feature_votes[feature] += 1

# Final selection: features selected in a majority of the iterations
# (e.g., in 2 out of 3 iterations). Built in original column order so the
# selected matrix and the saved feature list are identical on every run.
# .get() is used so that looking up an unvoted feature does not insert it
# into the defaultdict.
final_vote_threshold = n_iterations // 2 + 1
selected_features = [
    feature for feature in X_train_scaled_df.columns
    if feature_votes.get(feature, 0) >= final_vote_threshold
]

elapsed = (time.time() - start_time) / 60
print(f"\n✓ MSFS completed in {elapsed:.2f} minutes")

# Fail here with a clear message instead of handing a zero-column matrix to
# the classifier in the next cell.
if not selected_features:
    raise ValueError(
        f"{ALGORITHM_NAME} selected no features; lower keep_percentile or "
        "min_method_votes, or increase n_iterations."
    )

# --- Filter final datasets ---
X_train_selected = X_train_scaled_df[selected_features].values
X_test_selected = X_test_scaled_df[selected_features].values

print("\nFeature Selection Results:")
print(f"  Selected: {len(selected_features)} features")
print(f"  Eliminated: {n_total_features - len(selected_features)} features")


--- Applying MSFS_Selector (Ensemble Voting) ---

  -> Iteration 1/3...
     Running MIG...
     ...MIG selected 30 features
     Running RFECV (this is the slowest part)...
     ...RFECV selected 4 features
     Running ExtraTrees...
     ...ExtraTrees selected 30 features

  -> Iteration 2/3...
     Running MIG...
     ...MIG selected 30 features
     Running RFECV (this is the slowest part)...
     ...RFECV selected 4 features
     Running ExtraTrees...
     ...ExtraTrees selected 30 features

  -> Iteration 3/3...
     Running MIG...
     ...MIG selected 30 features
     Running RFECV (this is the slowest part)...
     ...RFECV selected 4 features
     Running ExtraTrees...
     ...ExtraTrees selected 30 features

✓ MSFS completed in 6.05 minutes

Feature Selection Results:
  Selected: 30 features
  Eliminated: 10 features


In [4]:
print(f"\n--- Training Model on {ALGORITHM_NAME}-Selected Features ---")

# Fit and score the classifier on the reduced feature matrices; SVM_PARAMS is
# forwarded as keyword arguments to train_and_evaluate (from base_module).
clf, y_pred, report = train_and_evaluate(
    X_train_selected,
    X_test_selected,
    y_train,
    y_test,
    **SVM_PARAMS,
)

print("\nModel Performance:")
print(f"  Accuracy: {report['accuracy']:.4f}")

# Per-class metrics, printed in the same order for each label key of `report`.
for class_label in ('0', '1'):
    class_scores = report[class_label]
    print(f"  Precision (class {class_label}): {class_scores['precision']:.4f}")
    print(f"  Recall (class {class_label}): {class_scores['recall']:.4f}")
    print(f"  F1-score (class {class_label}): {class_scores['f1-score']:.4f}")


--- Training Model on MSFS_Selector-Selected Features ---

Training SVM classifier...




Evaluating...

Model Performance:
  Accuracy: 0.6203
  Precision (class 0): 0.9091
  Recall (class 0): 0.6355
  F1-score (class 0): 0.7481
  Precision (class 1): 0.1487
  Recall (class 1): 0.5007
  F1-score (class 1): 0.2293


In [5]:
print("\n--- Saving Results ---")

# Hand the selection outcome and the evaluation report to the shared
# persistence helper from base_module.
save_model_results(
    algorithm_name=ALGORITHM_NAME,
    selected_encoded_features=selected_features,
    all_encoded_features=all_encoded_features,
    all_original_features=all_original_features,
    feature_mapping=feature_mapping,
    report=report,
    results_csv='study_results.csv',
)

# Closing banner: blank line, rule, title, rule.
print("\n" + "=" * 70, f"{ALGORITHM_NAME} COMPLETE", "=" * 70, sep="\n")


--- Saving Results ---

Saving results for: MSFS_Selector
✓ Appended to D:\ACTUAL STUDY MATERIAL\IPD\src\Results\study_results.csv
✓ Saved to D:\ACTUAL STUDY MATERIAL\IPD\src\Results\msfs_selector_results.json

  Original: 30/40 features
  Encoded:  30/40 features
  Accuracy: 0.6203
  F1 (class 1): 0.2293

MSFS_Selector COMPLETE
