In [1]:
import os
import time

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.tree import DecisionTreeClassifier

from base_module import (
    get_preprocessed_data,
    train_and_evaluate,
    save_model_results,
    SVM_PARAMS,
    RANDOM_STATE
)

# Label used in the printed banners and passed along when results are saved.
ALGORITHM_NAME = "MIDTR_Selector"

# Location of the raw dataset CSV.
# Set the DIABETIC_DATA_PATH environment variable to point at your own copy
# instead of editing this notebook; when the variable is unset, the original
# hard-coded location below is used, so existing setups keep working.
DATA_PATH = os.environ.get(
    "DIABETIC_DATA_PATH",
    r'D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv',
)

# Banner so this run is easy to spot in the notebook output.
print("="*70)
print(f"FEATURE SELECTION: {ALGORITHM_NAME}")
print("="*70)

FEATURE SELECTION: MIDTR_Selector


In [2]:
print("\n--- Loading and Preprocessing Data ---")

# get_preprocessed_data hands back seven values; unpack them in the order
# the helper returns them.
(
    X_train_encoded,
    X_test_encoded,
    y_train,
    y_test,
    all_encoded_features,
    all_original_features,
    feature_mapping,
) = get_preprocessed_data(DATA_PATH)

print(f"\nStarting with {len(all_encoded_features)} encoded features")


--- Loading and Preprocessing Data ---
Loading data from D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv...


  df = pd.read_csv(file_path, na_values='?')


Cleaned data shape: (98053, 43)
Engineering features...
Engineered data shape: (98053, 41)

Preprocessing data...
Ordinal Encoding 31 features...
Encoded features: 40
  - Numeric: 9
  - Categorical (Ordinal): 31

Starting with 40 encoded features


In [3]:
print(f"\n--- Applying {ALGORITHM_NAME} (MI + DecisionTree) ---")

# Fraction of the incoming features that each stage keeps.
MI_KEEP_FRACTION = 0.7
DT_KEEP_FRACTION = 0.7


def _seeded_mutual_info(X, y):
    """Score features with mutual information using the shared RANDOM_STATE.

    mutual_info_classif uses random numbers while estimating the scores, so
    leaving it unseeded can rank features differently from one run to the
    next. Pinning the seed keeps the selection reproducible.

    NOTE(review): the default discrete_features='auto' treats a dense matrix
    as all-continuous, while the preprocessing log reports ordinal-encoded
    categorical columns. Consider passing an explicit discrete mask - confirm
    against base_module before changing.
    """
    return mutual_info_classif(X, y, random_state=RANDOM_STATE)


# --- Step 1: Mutual Information Selection (k1) ---
n_total = X_train_encoded.shape[1]
# int() truncates; the max() guard avoids asking for zero features on a
# very narrow matrix.
k1 = max(1, int(MI_KEEP_FRACTION * n_total))

print(f"Step 1: Selecting top {k1} of {n_total} features using Mutual Info...")
start_time = time.perf_counter()

# Fit the selector on the training split only, then apply the same column
# mask to the test split.
selector_mi = SelectKBest(score_func=_seeded_mutual_info, k=k1)
X_train_k1 = selector_mi.fit_transform(X_train_encoded, y_train)
X_test_k1 = selector_mi.transform(X_test_encoded)

# Column positions (within the full encoded matrix) and names of the
# features that survived the MI stage.
mi_selected_indices = selector_mi.get_support(indices=True)
mi_selected_features = [all_encoded_features[i] for i in mi_selected_indices]

elapsed_mi = time.perf_counter() - start_time
print(f"✓ MI selection complete in {elapsed_mi:.2f}s")


# --- Step 2: Decision Tree Importance Selection (k2) ---
k2 = max(1, int(DT_KEEP_FRACTION * k1))
print(f"\nStep 2: Selecting top {k2} of {k1} features using Decision Tree...")
start_time_dt = time.perf_counter()

tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
tree.fit(X_train_k1, y_train)

# Rank the MI survivors by tree importance, highest first, and keep the
# top k2. These positions index into the k1-column matrix, not the
# original one.
feature_importances = tree.feature_importances_
sorted_indices_dt = np.argsort(feature_importances)[::-1]
final_selected_indices_in_k1 = sorted_indices_dt[:k2]

elapsed_dt = time.perf_counter() - start_time_dt
print(f"✓ Decision Tree selection complete in {elapsed_dt:.2f}s")


# --- Step 3: Filter final datasets ---

# Translate positions in the k1-column matrix back to positions in the full
# encoded matrix by composing the two index arrays. This avoids a
# name -> index lookup, which would pick the wrong column if a feature name
# were ever repeated.
final_selected_original_indices = [
    int(mi_selected_indices[i]) for i in final_selected_indices_in_k1
]
selected_features = [
    all_encoded_features[i] for i in final_selected_original_indices
]

X_train_selected = X_train_encoded[:, final_selected_original_indices]
X_test_selected = X_test_encoded[:, final_selected_original_indices]

# Sanity checks: both splits must expose exactly the selected columns.
assert X_train_selected.shape[1] == len(selected_features)
assert X_test_selected.shape[1] == len(selected_features)

print(f"\nFeature Selection Results:")
print(f"  Selected: {len(selected_features)} features")
print(f"  Eliminated: {len(all_encoded_features) - len(selected_features)} features")


--- Applying MIDTR_Selector (MI + DecisionTree) ---
Step 1: Selecting top 28 of 40 features using Mutual Info...
✓ MI selection complete in 7.62s

Step 2: Selecting top 19 of 28 features using Decision Tree...
✓ Decision Tree selection complete in 0.30s

Feature Selection Results:
  Selected: 19 features
  Eliminated: 21 features


In [4]:
print(f"\n--- Training Model on {ALGORITHM_NAME}-Selected Features ---")

# Training and evaluation are delegated to the shared helper; `report` is
# indexed below as a nested mapping of metrics keyed by class label.
clf, y_pred, report = train_and_evaluate(
    X_train_selected,
    X_test_selected,
    y_train,
    y_test,
    **SVM_PARAMS
)

print("\nModel Performance:")
print(f"  Accuracy: {report['accuracy']:.4f}")

# Emit precision / recall / F1 for class 0 first, then class 1.
metric_titles = (
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1-score", "f1-score"),
)
for class_label in ("0", "1"):
    for metric_title, metric_key in metric_titles:
        metric_value = report[class_label][metric_key]
        print(f"  {metric_title} (class {class_label}): {metric_value:.4f}")


--- Training Model on MIDTR_Selector-Selected Features ---

Training SVM classifier...




Evaluating...

Model Performance:
  Accuracy: 0.3960
  Precision (class 0): 0.8564
  Recall (class 0): 0.3834
  F1-score (class 0): 0.5297
  Precision (class 1): 0.0926
  Recall (class 1): 0.4948
  F1-score (class 1): 0.1560


In [5]:
print("\n--- Saving Results ---")

# Gather everything the shared persistence helper needs, then hand it over
# in a single call.
results_payload = dict(
    algorithm_name=ALGORITHM_NAME,
    selected_encoded_features=selected_features,
    all_encoded_features=all_encoded_features,
    all_original_features=all_original_features,
    feature_mapping=feature_mapping,
    report=report,
    results_csv='study_results.csv',
)
save_model_results(**results_payload)

# Closing banner marking the end of this run.
closing_rule = "=" * 70
print("\n" + closing_rule)
print(f"{ALGORITHM_NAME} COMPLETE")
print(closing_rule)


--- Saving Results ---

Saving results for: MIDTR_Selector
✓ Appended to study_results.csv
✓ Saved to midtr_selector_results.json

  Original: 19/40 features
  Encoded:  19/40 features
  Accuracy: 0.3960
  F1 (class 1): 0.1560

MIDTR_Selector COMPLETE
