In [1]:
import os
import time

import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC

from base_module import (
    get_preprocessed_data,
    train_and_evaluate,
    save_model_results,
    SVM_PARAMS
)

# Label used in the banner below and passed on when results are saved.
ALGORITHM_NAME = "RFE_SVM"

# Fallback dataset location, used when no override is configured.
DEFAULT_DATA_PATH = r'D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv'
# Environment variable that lets other machines point at their own copy of the CSV
# without editing the notebook.
DATA_PATH_ENV_VAR = "DIABETIC_DATA_PATH"


def resolve_data_path(env=None):
    """Return the dataset path, preferring the environment override.

    Args:
        env: Mapping in which to look up the override. Defaults to
            ``os.environ`` when omitted.

    Returns:
        The value stored under ``DATA_PATH_ENV_VAR`` when it is present and
        non-empty, otherwise ``DEFAULT_DATA_PATH``.
    """
    if env is None:
        env = os.environ
    # `or` also covers an override that is set but empty.
    return env.get(DATA_PATH_ENV_VAR) or DEFAULT_DATA_PATH


DATA_PATH = resolve_data_path()

print("="*70)
print(f"FEATURE SELECTION: {ALGORITHM_NAME}")
print("="*70)

FEATURE SELECTION: RFE_SVM


In [2]:
print("\n--- Loading and Preprocessing Data ---")

# get_preprocessed_data hands back seven values; unpack them by position.
(
    X_train_encoded,
    X_test_encoded,
    y_train,
    y_test,
    all_encoded_features,
    all_original_features,
    feature_mapping,
) = get_preprocessed_data(DATA_PATH)

# Report how many encoded features feature selection starts from.
print(f"\nStarting with {len(all_encoded_features)} encoded features")


--- Loading and Preprocessing Data ---
Loading data from D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv...


  df = pd.read_csv(file_path, na_values='?')


Cleaned data shape: (98053, 43)
Engineering features...
Engineered data shape: (98053, 41)

Preprocessing data...
Ordinal Encoding 31 features...
Encoded features: 40
  - Numeric: 9
  - Categorical (Ordinal): 31

Starting with 40 encoded features


In [3]:
print("\n--- Applying RFE (Sklearn Implementation) ---")

# Fraction of the encoded features that RFE should keep.
FEATURE_KEEP_FRACTION = 0.7

# Determine number of features to select.
# int() truncates, so clamp to at least 1: RFE needs a positive target.
n_features_to_select = max(1, int(FEATURE_KEEP_FRACTION * len(all_encoded_features)))

print(f"\nTarget: Select {n_features_to_select} out of {len(all_encoded_features)} features")

# Use LinearSVC because this is a classification task.
# LinearSVC is faster than SVC(kernel='linear') for large datasets.
# NOTE(review): RFE ranks features by the fitted coefficient magnitudes, which
# presumes the encoded columns are on comparable scales — TODO confirm that
# get_preprocessed_data scales them.
estimator = LinearSVC(
    random_state=42,
    max_iter=5000,  # Increase if you get convergence warnings
    dual=False      # dual=False is preferred when n_samples > n_features
)

# Initialize RFE
# step=1 means remove 1 feature per iteration (slow but accurate)
# step=0.1 would instead remove 10% of the features per iteration (faster)
selector = RFE(
    estimator=estimator,
    n_features_to_select=n_features_to_select,
    step=1,  # Remove 1 feature at a time (most accurate, but slow)
    verbose=1  # Show progress
)

print(f"\nRunning RFE (this will train ~{len(all_encoded_features) - n_features_to_select} models)...")
print("This may take several minutes...\n")

# perf_counter is monotonic, so the measured duration cannot be skewed by
# system clock adjustments during the fit.
start_time = time.perf_counter()
selector.fit(X_train_encoded, y_train)
elapsed = time.perf_counter() - start_time

print(f"\n✓ RFE completed in {elapsed/60:.2f} minutes")

# Transform data: keep only the columns RFE retained, in both splits.
X_train_selected = selector.transform(X_train_encoded)
X_test_selected = selector.transform(X_test_encoded)

# Get selected features by pairing each feature name with its keep/drop flag.
selected_mask = selector.get_support()
# Guard against silently mislabelled features if the name list and the fitted
# matrix ever disagree on the number of columns.
assert len(selected_mask) == len(all_encoded_features), (
    "feature-name list does not match the number of columns RFE was fitted on"
)
selected_features = [name for name, keep in zip(all_encoded_features, selected_mask) if keep]

print("\nFeature Selection Results:")
print(f"  Selected: {len(selected_features)} features")
print(f"  Eliminated: {len(all_encoded_features) - len(selected_features)} features")


--- Applying RFE (Sklearn Implementation) ---

Target: Select 28 out of 40 features

Running RFE (this will train ~12 models)...
This may take several minutes...

Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.

✓ RFE completed in 0.22 minutes

Feature Selection Results:
  Selected: 28 features
  Eliminated: 12 features


In [4]:
print("\n--- Training Model on RFE-Selected Features ---")

# Fit and score the classifier on the RFE-reduced train/test matrices.
clf, y_pred, report = train_and_evaluate(
    X_train_selected,
    X_test_selected,
    y_train,
    y_test,
    **SVM_PARAMS
)

print("\nModel Performance:")
print(f"  Accuracy: {report['accuracy']:.4f}")

# Print precision / recall / F1 for class "0" first, then class "1".
for class_label in ("0", "1"):
    class_scores = report[class_label]
    for metric_title, metric_key in (
        ("Precision", "precision"),
        ("Recall", "recall"),
        ("F1-score", "f1-score"),
    ):
        print(f"  {metric_title} (class {class_label}): {class_scores[metric_key]:.4f}")


--- Training Model on RFE-Selected Features ---

Training SVM classifier...




Evaluating...

Model Performance:
  Accuracy: 0.5010
  Precision (class 0): 0.8963
  Recall (class 0): 0.4948
  F1-score (class 0): 0.6376
  Precision (class 1): 0.1216
  Recall (class 1): 0.5499
  F1-score (class 1): 0.1992


In [5]:
print("\n--- Saving Results ---")

# Collect everything save_model_results needs in one place, then pass it by keyword.
save_kwargs = {
    "algorithm_name": ALGORITHM_NAME,
    "selected_encoded_features": selected_features,
    "all_encoded_features": all_encoded_features,
    "all_original_features": all_original_features,
    "feature_mapping": feature_mapping,
    "report": report,
    "results_csv": 'study_results.csv',
}
save_model_results(**save_kwargs)

# Closing banner, same width as the opening one.
closing_rule = "=" * 70
print("\n" + closing_rule)
print(f"{ALGORITHM_NAME} COMPLETE")
print(closing_rule)


--- Saving Results ---

Saving results for: RFE_SVM
✓ Appended to D:\ACTUAL STUDY MATERIAL\IPD\src\Results\study_results.csv
✓ Saved to D:\ACTUAL STUDY MATERIAL\IPD\src\Results\rfe_svm_results.json

  Original: 28/40 features
  Encoded:  28/40 features
  Accuracy: 0.5010
  F1 (class 1): 0.1992

RFE_SVM COMPLETE
