In [1]:
import os
import time

import numpy as np
import pandas as pd
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC

from base_module import (
    get_preprocessed_data,
    train_and_evaluate,
    save_model_results,
    SVM_PARAMS,
    RANDOM_STATE
)

# Label used in the console banners and passed to save_model_results.
ALGORITHM_NAME = "RFECV_SVM"

# Location of the raw CSV. Set the DIABETIC_DATA_PATH environment variable to
# point at your own copy instead of editing the notebook; when it is unset the
# original hard-coded location is used, so existing setups behave as before.
DATA_PATH = os.environ.get(
    "DIABETIC_DATA_PATH",
    r'D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv',
)

# Banner announcing which feature-selection run this notebook performs.
print("="*70)
print(f"FEATURE SELECTION: {ALGORITHM_NAME}")
print("="*70)

FEATURE SELECTION: RFECV_SVM


In [2]:
# Run the shared preprocessing helper on the CSV at DATA_PATH and unpack the
# seven values it returns into notebook-level names used by the later cells.
print("\n--- Loading and Preprocessing Data ---")

(
    X_train_encoded,
    X_test_encoded,
    y_train,
    y_test,
    all_encoded_features,
    all_original_features,
    feature_mapping,
) = get_preprocessed_data(DATA_PATH)

# Report how many encoded feature names the selector will start from.
print("\nStarting with {} encoded features".format(len(all_encoded_features)))


--- Loading and Preprocessing Data ---
Loading data from D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv...


  df = pd.read_csv(file_path, na_values='?')


Cleaned data shape: (98053, 43)
Engineering features...
Engineered data shape: (98053, 41)

Preprocessing data...
Ordinal Encoding 31 features...
Encoded features: 40
  - Numeric: 9
  - Categorical (Ordinal): 31

Starting with 40 encoded features


In [3]:
print(f"\n--- Applying {ALGORITHM_NAME} (Sklearn Implementation) ---")
print("WARNING: This will be very slow. It runs RFE multiple times.")

# --- 1. Define the Estimator ---
# RFE ranks features from the fitted model's coefficients, so a linear SVM
# (which exposes .coef_) is used. dual=False selects the primal solver.
# NOTE(review): no class_weight is set here; if SVM_PARAMS applies class
# weighting to the final model, selection and final training use differently
# weighted estimators -- confirm this is intended.
estimator = LinearSVC(dual=False, max_iter=5000, random_state=RANDOM_STATE)

# --- 2. Define the Cross-validation Strategy ---
# Shuffled, seeded 5-fold stratified splits keep the class ratio in each fold.
# Lowering n_splits to 3 trades evaluation stability for speed.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# --- 3. Initialize RFECV ---
# Eliminates one feature per step and keeps the feature count with the best
# cross-validated F1 score; folds are evaluated on all CPU cores and progress
# is printed as the elimination proceeds.
selector = RFECV(
    estimator=estimator,
    cv=cv_splitter,
    step=1,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)

print("\nRunning RFECV... (This may take 30-60 minutes)\n")

# Time the fit with perf_counter: it is monotonic, so the reported duration
# cannot be distorted by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
selector.fit(X_train_encoded, y_train)
elapsed = (time.perf_counter() - start_time) / 60  # seconds -> minutes

print(f"\n✓ RFECV completed in {elapsed:.2f} minutes")

# --- 4. Get Results ---
# Reduce both splits to the columns the fitted selector kept.
X_train_selected = selector.transform(X_train_encoded)
X_test_selected = selector.transform(X_test_encoded)

# Boolean mask over the input columns: True where the feature was retained.
selected_mask = selector.get_support()

# The mask is positional, so it only maps onto names correctly when the name
# list has exactly one entry per input column. Fail loudly otherwise instead
# of raising an opaque IndexError or silently mislabelling features.
if len(selected_mask) != len(all_encoded_features):
    raise ValueError(
        f"Selector mask has {len(selected_mask)} entries but "
        f"{len(all_encoded_features)} encoded feature names were provided"
    )

selected_features = [
    name for name, keep in zip(all_encoded_features, selected_mask) if keep
]

print("\nFeature Selection Results:")
print(f"  Optimal number found: {selector.n_features_}")
print(f"  Selected: {len(selected_features)} features")
print(f"  Eliminated: {len(all_encoded_features) - len(selected_features)} features")


--- Applying RFECV_SVM (Sklearn Implementation) ---

Running RFECV... (This may take 30-60 minutes)

Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.

✓ RFECV completed in 0.72 minutes

Feature Selection Results:
  Optimal number found: 33
  Selected: 33 features
  Eliminated: 7 features


In [4]:
print(f"\n--- Training Model on {ALGORITHM_NAME}-Selected Features ---")

# Fit and score the final classifier on the reduced feature matrices, using
# the shared SVM_PARAMS keyword arguments from base_module.
clf, y_pred, report = train_and_evaluate(
    X_train_selected,
    X_test_selected,
    y_train,
    y_test,
    **SVM_PARAMS,
)

# Summarise overall accuracy, then precision / recall / F1 for each class.
# The report is indexed by the string labels '0' and '1'.
print("\nModel Performance:")
print(f"  Accuracy: {report['accuracy']:.4f}")
for label in ("0", "1"):
    for title, key in (
        ("Precision", "precision"),
        ("Recall", "recall"),
        ("F1-score", "f1-score"),
    ):
        print(f"  {title} (class {label}): {report[label][key]:.4f}")


--- Training Model on RFECV_SVM-Selected Features ---

Training SVM classifier...




Evaluating...

Model Performance:
  Accuracy: 0.3807
  Precision (class 0): 0.8525
  Recall (class 0): 0.3651
  F1-score (class 0): 0.5112
  Precision (class 1): 0.0916
  Recall (class 1): 0.5034
  F1-score (class 1): 0.1550


In [5]:
print("\n--- Saving Results ---")

# Hand the selected feature names, the full feature inventories, and the
# evaluation report to the shared persistence helper. The recorded output
# shows it appending to study_results.csv and writing a JSON results file.
save_model_results(
    algorithm_name=ALGORITHM_NAME,
    selected_encoded_features=selected_features,
    all_encoded_features=all_encoded_features,
    all_original_features=all_original_features,
    feature_mapping=feature_mapping,
    report=report,
    results_csv='study_results.csv',
)

# Closing banner marking the end of this run.
print(f"\n{'=' * 70}")
print(ALGORITHM_NAME + " COMPLETE")
print('=' * 70)


--- Saving Results ---

Saving results for: RFECV_SVM
✓ Appended to study_results.csv
✓ Saved to rfecv_svm_results.json

  Original: 33/40 features
  Encoded:  33/40 features
  Accuracy: 0.3807
  F1 (class 1): 0.1550

RFECV_SVM COMPLETE
