In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

from base_module import (
    get_preprocessed_data,
    train_and_evaluate,
    save_model_results,
    SVM_PARAMS
)

# Label used in the banner below and passed to save_model_results later on.
ALGORITHM_NAME = "Chi-Squared"
# Absolute, machine-specific Windows path to the dataset CSV.
DATA_PATH = r'D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv'

# Three-line banner: rule, title, rule (each on its own line).
print(
    "="*70,
    f"FEATURE SELECTION: {ALGORITHM_NAME}",
    "="*70,
    sep="\n",
)

FEATURE SELECTION: Chi-Squared


In [2]:
print("\n--- Loading and Preprocessing Data ---")

# get_preprocessed_data(DATA_PATH) returns seven values, unpacked here in order:
# train/test feature matrices, train/test labels, the encoded and original
# feature-name lists, and a feature mapping.
(
    X_train_encoded,
    X_test_encoded,
    y_train,
    y_test,
    all_encoded_features,
    all_original_features,
    feature_mapping,
) = get_preprocessed_data(DATA_PATH)

print(f"\nStarting with {len(all_encoded_features)} encoded features")


--- Loading and Preprocessing Data ---
Loading data from D:\ACTUAL STUDY MATERIAL\IPD\Data\diabetic_data.csv...


  df = pd.read_csv(file_path, na_values='?')


Cleaned data shape: (98053, 43)
Engineering features...
Engineered data shape: (98053, 41)

Preprocessing data...
Ordinal Encoding 31 features...
Encoded features: 40
  - Numeric: 9
  - Categorical (Ordinal): 31

Starting with 40 encoded features


In [3]:
print("\n--- Applying Chi-Squared Test ---")

# sklearn's chi2 rejects negative inputs. The numeric columns are presumably
# standardised upstream (so they can be negative) -- TODO confirm against
# base_module -- and are therefore re-scaled to [0, 1] here.
#
# NOTE(review): the preprocessing log reports the categorical columns as
# *ordinal*-encoded, so they hold integer category codes rather than 0/1
# indicators. chi2 is designed for booleans/frequencies, so the magnitude of an
# arbitrary code influences its score; consider one-hot encoding before chi2.

# Fraction of encoded features to keep (rule of thumb: 50-70%).
SELECT_FRACTION = 0.7

print("\nRe-scaling features to [0,1] range for Chi-Squared...")

# Number of leading numeric columns; matches "Numeric: 9" in the preprocessing
# log. Assumes the numeric columns come first in the matrix -- TODO confirm.
n_numeric = 9

# Fit the scaler on the training split only, then apply it to the test split.
# Test values outside the training min/max can land slightly outside [0, 1].
minmax_scaler = MinMaxScaler()
X_train_numeric = X_train_encoded[:, :n_numeric]
X_test_numeric = X_test_encoded[:, :n_numeric]

X_train_numeric_scaled = minmax_scaler.fit_transform(X_train_numeric)
X_test_numeric_scaled = minmax_scaler.transform(X_test_numeric)

# Remaining (categorical) columns are passed through unchanged.
X_train_categorical = X_train_encoded[:, n_numeric:]
X_test_categorical = X_test_encoded[:, n_numeric:]

# Re-assemble in the original column order so indices still line up with
# all_encoded_features.
X_train_nonneg = np.hstack([X_train_numeric_scaled, X_train_categorical])
X_test_nonneg = np.hstack([X_test_numeric_scaled, X_test_categorical])

# Fail early with a clear message instead of relying on chi2's own error.
if (X_train_nonneg < 0).any():
    raise ValueError(
        "Chi-Squared requires non-negative features, but the training "
        "matrix still contains negative values"
    )

print("Numeric features re-scaled to [0,1]; all training features are "
      "non-negative, suitable for Chi-Squared test")


# Keep the top-k features ranked by Chi-Squared score.
# int() truncates, so k is rounded down.
k = int(SELECT_FRACTION * len(all_encoded_features))

print(f"\nSelecting top {k} features out of {len(all_encoded_features)}...")

# Scores are fitted on the training split only; the test split is merely
# reduced to the same columns.
selector = SelectKBest(score_func=chi2, k=k)
X_train_selected = selector.fit_transform(X_train_nonneg, y_train)
X_test_selected = selector.transform(X_test_nonneg)

# Map the kept column indices back to encoded feature names.
selected_indices = selector.get_support(indices=True)
selected_features = [all_encoded_features[i] for i in selected_indices]

print("\nFeature Selection Results:")
print(f"  Selected: {len(selected_features)} encoded features")
print(f"  Discarded: {len(all_encoded_features) - len(selected_features)} encoded features")


--- Applying Chi-Squared Test ---

Re-scaling features to [0,1] range for Chi-Squared...
Features are now in [0,1] range, suitable for Chi-Squared test

Selecting top 28 features out of 40...

Feature Selection Results:
  Selected: 28 encoded features
  Discarded: 12 encoded features


In [4]:
print("\n--- Chi-Squared Score Analysis ---")

# Per-feature Chi-Squared statistics from the fitted selector.
chi2_scores = selector.scores_

# One row per encoded feature: name, score, and whether SelectKBest kept it.
feature_scores = pd.DataFrame({
    'feature': all_encoded_features,
    'chi2_score': chi2_scores,
    'selected': selector.get_support(),
})
# Highest score first.
feature_scores = feature_scores.sort_values('chi2_score', ascending=False)

# Print the head and tail of the ranking as plain-text tables (no index).
for heading, rows in (
    ("\nTop 20 features by Chi-Squared score:", feature_scores.head(20)),
    ("\nBottom 10 features by Chi-Squared score (least relevant):", feature_scores.tail(10)),
):
    print(heading)
    print(rows[['feature', 'chi2_score', 'selected']].to_string(index=False))

# Persist the full ranking for later analysis.
feature_scores.to_csv('chi_squared_feature_scores.csv', index=False)


--- Chi-Squared Score Analysis ---

Top 20 features by Chi-Squared score:
                 feature  chi2_score  selected
discharge_disposition_id 5073.995571      True
        number_inpatient  249.820828      True
          diag_1_grouped   84.464633      True
     admission_source_id   60.635297      True
        time_in_hospital   30.877288      True
        number_emergency   19.441988      True
             diabetesMed   15.215758      True
                  change   13.951422      True
       admission_type_id   10.931185      True
        number_diagnoses   10.215756      True
               metformin    6.766354      True
         num_medications    6.133392      True
          num_procedures    4.484126      True
       number_outpatient    2.564075      True
      num_lab_procedures    2.368612      True
             age_encoded    0.554017      True
             glimepiride    0.406368      True
             tolbutamide    0.405290      True
                  gender    0.36

In [5]:
print("\n--- Training Model on Selected Features ---")

# Train and evaluate on the reduced feature matrices; SVM_PARAMS is expanded
# into keyword arguments.
clf, y_pred, report = train_and_evaluate(
    X_train_selected,
    X_test_selected,
    y_train,
    y_test,
    **SVM_PARAMS,
)

print("\nModel Performance:")
print(f"  Accuracy: {report['accuracy']:.4f}")

# Per-class metrics, printed as precision / recall / F1 for class 0, then class 1.
for class_label in ('0', '1'):
    for metric_title, metric_key in (
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1-score', 'f1-score'),
    ):
        print(f"  {metric_title} (class {class_label}): {report[class_label][metric_key]:.4f}")


--- Training Model on Selected Features ---

Training SVM classifier...




Evaluating...

Model Performance:
  Accuracy: 0.5919
  Precision (class 0): 0.8820
  Recall (class 0): 0.6233
  F1-score (class 0): 0.7305
  Precision (class 1): 0.1042
  Recall (class 1): 0.3443
  F1-score (class 1): 0.1599


In [6]:
print("\n--- Saving Results ---")

# Hand the selection outcome and the evaluation report to the shared
# persistence helper.
save_model_results(
    algorithm_name=ALGORITHM_NAME,
    selected_encoded_features=selected_features,
    all_encoded_features=all_encoded_features,
    all_original_features=all_original_features,
    feature_mapping=feature_mapping,
    report=report,
    results_csv='study_results.csv',
)

# Closing banner: blank line, rule, completion message, rule.
print(
    "\n" + "="*70,
    f"{ALGORITHM_NAME} COMPLETE",
    "="*70,
    sep="\n",
)



--- Saving Results ---

Saving results for: Chi-Squared
✓ Appended to D:\ACTUAL STUDY MATERIAL\IPD\src\Results\study_results.csv
✓ Saved to D:\ACTUAL STUDY MATERIAL\IPD\src\Results\chi-squared_results.json

  Original: 28/40 features
  Encoded:  28/40 features
  Accuracy: 0.5919
  F1 (class 1): 0.1599

Chi-Squared COMPLETE
