In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from base_module import (
    get_preprocessed_data,
    train_and_evaluate,
    save_model_results,
    SVM_PARAMS
)

ALGORITHM_NAME = "Mutual_Information"
DATA_PATH = r'D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv'

print("="*70)
print(f"FEATURE SELECTION: {ALGORITHM_NAME}")
print("="*70)

FEATURE SELECTION: Mutual_Information


In [3]:
print("\n--- Loading and Preprocessing Data ---")

X_train_encoded, X_test_encoded, y_train, y_test, \
    all_encoded_features, all_original_features, feature_mapping = \
    get_preprocessed_data(DATA_PATH)

print(f"\nStarting with {len(all_encoded_features)} encoded features")


--- Loading and Preprocessing Data ---
Loading data from D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv...


  df = pd.read_csv(file_path, na_values='?')


Cleaned data shape: (98053, 43)
Engineering features...
Engineered data shape: (98053, 41)

Preprocessing data...
Ordinal Encoding 31 features...
Encoded features: 40
  - Numeric: 9
  - Categorical (Ordinal): 31

Starting with 40 encoded features


In [4]:
print(f"\n--- Applying Mutual Information ---")

# IMPORTANT: Your FIFA notebook used mutual_info_regression (for continuous targets)
# For diabetes classification, we MUST use mutual_info_classif

# Select top k features
# You can adjust this percentage (60-70% is typical)
k = int(0.7 * len(all_encoded_features))

print(f"\nSelecting top {k} features out of {len(all_encoded_features)}...")

# Mutual Information for classification
# random_state ensures reproducibility
selector = SelectKBest(
    score_func=mutual_info_classif,
    k=k
)

# Fit on training data
# MI can handle both continuous and discrete features naturally
selector.fit(X_train_encoded, y_train)

# Transform data
X_train_selected = selector.transform(X_train_encoded)
X_test_selected = selector.transform(X_test_encoded)

# Get selected feature indices and names
selected_indices = selector.get_support(indices=True)
selected_features = [all_encoded_features[i] for i in selected_indices]

print(f"\nFeature Selection Results:")
print(f"  Selected: {len(selected_features)} encoded features")
print(f"  Discarded: {len(all_encoded_features) - len(selected_features)} encoded features")


--- Applying Mutual Information ---

Selecting top 28 features out of 40...

Feature Selection Results:
  Selected: 28 encoded features
  Discarded: 12 encoded features


In [5]:
print(f"\n--- Mutual Information Score Analysis ---")

# Get MI scores for all features
mi_scores = selector.scores_

# Create DataFrame for analysis
feature_scores = pd.DataFrame({
    'feature': all_encoded_features,
    'mi_score': mi_scores,
    'selected': selector.get_support()
}).sort_values('mi_score', ascending=False)

print("\nTop 20 features by Mutual Information score:")
print(feature_scores.head(20)[['feature', 'mi_score', 'selected']].to_string(index=False))

print("\nBottom 10 features by Mutual Information score:")
print(feature_scores.tail(10)[['feature', 'mi_score', 'selected']].to_string(index=False))

# Features with MI score of 0 have no relationship with target
zero_mi_features = feature_scores[feature_scores['mi_score'] == 0]
if len(zero_mi_features) > 0:
    print(f"\n  {len(zero_mi_features)} features have MI score = 0 (completely independent of target)")
    print("These features should definitely be discarded.")

# Save feature scores to CSV
feature_scores.to_csv('mutual_information_feature_scores.csv', index=False)
print("\nâœ“ Saved detailed MI scores to 'mutual_information_feature_scores.csv'")



--- Mutual Information Score Analysis ---

Top 20 features by Mutual Information score:
                 feature  mi_score  selected
             glimepiride  0.018180      True
                acarbose  0.016027      True
        number_inpatient  0.015878      True
             nateglinide  0.014893      True
             diabetesMed  0.014338      True
     glyburide-metformin  0.013791      True
                    race  0.013715      True
                miglitol  0.011398      True
                  change  0.011369      True
               metformin  0.010533      True
             repaglinide  0.009842      True
               glyburide  0.009676      True
            pioglitazone  0.009591      True
               glipizide  0.009419      True
          chlorpropamide  0.009048      True
     admission_source_id  0.008755      True
discharge_disposition_id  0.008510      True
                  gender  0.008427      True
           rosiglitazone  0.008357      True
        num

In [6]:
print(f"\n--- MI Score Statistics ---")

selected_scores = feature_scores[feature_scores['selected']]['mi_score']
rejected_scores = feature_scores[~feature_scores['selected']]['mi_score']

print(f"\nSelected features:")
print(f"  Mean MI score: {selected_scores.mean():.6f}")
print(f"  Median MI score: {selected_scores.median():.6f}")
print(f"  Min MI score: {selected_scores.min():.6f}")
print(f"  Max MI score: {selected_scores.max():.6f}")

print(f"\nRejected features:")
print(f"  Mean MI score: {rejected_scores.mean():.6f}")
print(f"  Median MI score: {rejected_scores.median():.6f}")
print(f"  Min MI score: {rejected_scores.min():.6f}")
print(f"  Max MI score: {rejected_scores.max():.6f}")

# The threshold used for selection
threshold = selected_scores.min()
print(f"\nSelection threshold (min selected MI score): {threshold:.6f}")



--- MI Score Statistics ---

Selected features:
  Mean MI score: 0.008963
  Median MI score: 0.009233
  Min MI score: 0.001760
  Max MI score: 0.018180

Rejected features:
  Mean MI score: 0.000434
  Median MI score: 0.000404
  Min MI score: 0.000000
  Max MI score: 0.001044

Selection threshold (min selected MI score): 0.001760


In [7]:
print(f"\n--- Training Model on MI-Selected Features ---")

clf, y_pred, report = train_and_evaluate(
    X_train_selected, X_test_selected,
    y_train, y_test,
    **SVM_PARAMS
)

print(f"\nModel Performance:")
print(f"  Accuracy: {report['accuracy']:.4f}")
print(f"  Precision (class 0): {report['0']['precision']:.4f}")
print(f"  Recall (class 0): {report['0']['recall']:.4f}")
print(f"  F1-score (class 0): {report['0']['f1-score']:.4f}")
print(f"  Precision (class 1): {report['1']['precision']:.4f}")
print(f"  Recall (class 1): {report['1']['recall']:.4f}")
print(f"  F1-score (class 1): {report['1']['f1-score']:.4f}")



--- Training Model on MI-Selected Features ---

Training SVM classifier...




Evaluating...

Model Performance:
  Accuracy: 0.3851
  Precision (class 0): 0.8805
  Recall (class 0): 0.3550
  F1-score (class 0): 0.5060
  Precision (class 1): 0.1092
  Recall (class 1): 0.6213
  F1-score (class 1): 0.1857


In [8]:
print(f"\n--- Saving Results ---")

save_model_results(
    algorithm_name=ALGORITHM_NAME,
    selected_encoded_features=selected_features,
    all_encoded_features=all_encoded_features,
    all_original_features=all_original_features,
    feature_mapping=feature_mapping,
    report=report,
    results_csv='study_results.csv'
)

print("\n" + "="*70)
print(f"{ALGORITHM_NAME} COMPLETE")
print("="*70)



--- Saving Results ---

Saving results for: Mutual_Information
âœ“ Appended to D:\ACTUAL STUDY MATERIAL\IPD\src\Results\study_results.csv
âœ“ Saved to D:\ACTUAL STUDY MATERIAL\IPD\src\Results\mutual_information_results.json

  Original: 28/40 features
  Encoded:  28/40 features
  Accuracy: 0.3851
  F1 (class 1): 0.1857

Mutual_Information COMPLETE


In [9]:
print(f"\nðŸ“Š Summary for Research Paper:")
print(f"   - Algorithm: Mutual Information (classification)")
print(f"   - Features evaluated: {len(all_encoded_features)} (post-encoding)")
print(f"   - Features selected: {len(selected_features)} ({100*len(selected_features)/len(all_encoded_features):.1f}%)")
print(f"   - Selection method: SelectKBest with k={k}")
print(f"   - MI score range: [{mi_scores.min():.6f}, {mi_scores.max():.6f}]")
print(f"   - Features with MI=0: {len(zero_mi_features)}")
print(f"   - Model accuracy: {report['accuracy']:.4f}")
print(f"   - F1-score (class 1): {report['1']['f1-score']:.4f}")


ðŸ“Š Summary for Research Paper:
   - Algorithm: Mutual Information (classification)
   - Features evaluated: 40 (post-encoding)
   - Features selected: 28 (70.0%)
   - Selection method: SelectKBest with k=28
   - MI score range: [0.000000, 0.018180]
   - Features with MI=0: 4
   - Model accuracy: 0.3851
   - F1-score (class 1): 0.1857
