In [21]:
# ============================================================================
# MEMORY-EFFICIENT MODELING WITH EXPLAINABILITY
# Includes: PCA, RFE, Baseline Models, SHAP, LIME
# ============================================================================

import pandas as pd
import numpy as np
import gc
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from time import time
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


In [22]:
# ============================================================================
# 1. LOAD PREPROCESSED DATA
# ============================================================================

print("\n" + "="*80)
print("LOADING PREPROCESSED DATA")
print("="*80)

X_train = pd.read_csv('preprocessed_train.csv')
X_test = pd.read_csv('preprocessed_test.csv')
y_train = pd.read_csv('y_train.csv').squeeze()
y_test = pd.read_csv('y_test.csv').squeeze()

print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Memory: {(X_train.memory_usage().sum() + X_test.memory_usage().sum()) / 1024**2:.2f} MB")

# Optimize dtypes
for col in X_train.columns:
    if X_train[col].dtype == 'float64':
        X_train[col] = X_train[col].astype('float32')
        X_test[col] = X_test[col].astype('float32')

gc.collect()


LOADING PREPROCESSED DATA
Train: (156156, 98), Test: (39040, 98)
Memory: 145.94 MB


10222

In [23]:
# ============================================================================
# 2. PCA - DIMENSIONALITY REDUCTION
# ============================================================================

print("\n" + "="*80)
print("PCA - DIMENSIONALITY REDUCTION")
print("="*80)

n_components = min(50, X_train.shape[1] // 2)
pca = PCA(n_components=n_components, random_state=42)

print(f"Fitting PCA with {n_components} components...")
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

explained_var = np.cumsum(pca.explained_variance_ratio_)
n_95 = np.argmax(explained_var >= 0.95) + 1

print(f"✓ Explained variance: {explained_var[-1]:.2%}")
print(f"✓ Components for 95% variance: {n_95}")

X_train_pca_df = pd.DataFrame(
    X_train_pca,
    columns=[f'PC{i+1}' for i in range(n_components)]
).astype('float32')

X_test_pca_df = pd.DataFrame(
    X_test_pca,
    columns=[f'PC{i+1}' for i in range(n_components)]
).astype('float32')

del X_train_pca, X_test_pca
gc.collect()



PCA - DIMENSIONALITY REDUCTION
Fitting PCA with 49 components...
✓ Explained variance: 87.27%
✓ Components for 95% variance: 1


0

In [24]:
# ============================================================================
# 3. RFE - RECURSIVE FEATURE ELIMINATION
# ============================================================================

print("\n" + "="*80)
print("RFE - FEATURE SELECTION")
print("="*80)

# Use smaller sample for RFE to save memory
sample_size = min(50000, len(X_train))
sample_idx = np.random.choice(len(X_train), sample_size, replace=False)

rf_estimator = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

n_features_to_select = min(30, X_train.shape[1])
rfe = RFE(estimator=rf_estimator, n_features_to_select=n_features_to_select, step=10)

print(f"Running RFE on {sample_size} samples to select {n_features_to_select} features...")
rfe.fit(X_train.iloc[sample_idx], y_train.iloc[sample_idx])

selected_rfe_features = X_train.columns[rfe.support_].tolist()

X_train_rfe = X_train[selected_rfe_features].astype('float32')
X_test_rfe = X_test[selected_rfe_features].astype('float32')

print(f"✓ Selected features: {selected_rfe_features[:10]}...")

gc.collect()



RFE - FEATURE SELECTION
Running RFE on 50000 samples to select 30 features...
✓ Selected features: ['VISITDAY', 'VISITYR', 'NACCVNUM', 'BIRTHYR', 'INDEPEND', 'INBIRYR', 'HEIGHT', 'WEIGHT', 'BPSYS', 'HRATE']...


232

In [25]:
# ============================================================================
# 4. BASELINE MODELS - MULTIPLE ALGORITHMS
# ============================================================================

print("\n" + "="*80)
print("BASELINE MODELING")
print("="*80)

# Define models with memory-efficient settings
models = {
    'Logistic Regression': LogisticRegression(max_iter=500, random_state=42, n_jobs=-1),
    'Random Forest': RandomForestClassifier(
        n_estimators=100, 
        max_depth=10, 
        min_samples_split=100,
        random_state=42, 
        n_jobs=-1
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=100, 
        learning_rate=0.1, 
        max_depth=5,
        subsample=0.8,
        random_state=42
    )
}

# Feature sets
feature_sets = {
    'Original': (X_train, X_test),
    'PCA': (X_train_pca_df, X_test_pca_df),
    'RFE': (X_train_rfe, X_test_rfe)
}

results = []

for feat_name, (X_tr, X_te) in feature_sets.items():
    print(f"\n{'='*60}")
    print(f"Feature Set: {feat_name} ({X_tr.shape[1]} features)")
    print(f"{'='*60}")
    
    for model_name, model in models.items():
        try:
            start = time()
            
            # Train
            print(f"\nTraining {model_name}...")
            model.fit(X_tr, y_train)
            
            # Predict
            y_pred = model.predict(X_te)
            y_proba = model.predict_proba(X_te)[:, 1] if hasattr(model, 'predict_proba') else None
            
            # Metrics
            acc = accuracy_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred, zero_division=0)
            rec = recall_score(y_test, y_pred, zero_division=0)
            f1 = f1_score(y_test, y_pred, zero_division=0)
            auc = roc_auc_score(y_test, y_proba) if y_proba is not None else 0
            
            elapsed = time() - start
            
            print(f"  Accuracy:  {acc:.4f}")
            print(f"  Precision: {prec:.4f}")
            print(f"  Recall:    {rec:.4f}")
            print(f"  F1-Score:  {f1:.4f}")
            print(f"  ROC-AUC:   {auc:.4f}")
            print(f"  Time:      {elapsed:.2f}s")
            
            results.append({
                'Feature_Set': feat_name,
                'Model': model_name,
                'Accuracy': acc,
                'Precision': prec,
                'Recall': rec,
                'F1': f1,
                'AUC': auc,
                'Time': elapsed
            })
            
            gc.collect()
            
        except Exception as e:
            print(f"  ERROR: {str(e)}")

# Summary
results_df = pd.DataFrame(results)
print("\n" + "="*80)
print("RESULTS SUMMARY")
print("="*80)
print(results_df.sort_values('F1', ascending=False).to_string(index=False))

results_df.to_csv('model_results.csv', index=False)
print("\n✓ Results saved to 'model_results.csv'")



BASELINE MODELING

Feature Set: Original (98 features)

Training Logistic Regression...
  Accuracy:  0.9506
  Precision: 0.9321
  Recall:    0.8979
  F1-Score:  0.9147
  ROC-AUC:   0.9879
  Time:      3.70s

Training Random Forest...
  Accuracy:  0.9501
  Precision: 0.9339
  Recall:    0.8943
  F1-Score:  0.9136
  ROC-AUC:   0.9886
  Time:      5.09s

Training Gradient Boosting...
  Accuracy:  0.9532
  Precision: 0.9295
  Recall:    0.9102
  F1-Score:  0.9198
  ROC-AUC:   0.9897
  Time:      88.43s

Feature Set: PCA (49 features)

Training Logistic Regression...
  Accuracy:  0.9478
  Precision: 0.9303
  Recall:    0.8898
  F1-Score:  0.9096
  ROC-AUC:   0.9865
  Time:      1.45s

Training Random Forest...
  Accuracy:  0.9297
  Precision: 0.8901
  Recall:    0.8690
  F1-Score:  0.8794
  ROC-AUC:   0.9755
  Time:      14.86s

Training Gradient Boosting...
  Accuracy:  0.9446
  Precision: 0.9166
  Recall:    0.8934
  F1-Score:  0.9049
  ROC-AUC:   0.9848
  Time:      423.10s

Feature Set

In [26]:
# ============================================================================
# 5. TRAIN BEST MODEL FOR EXPLAINABILITY
# ============================================================================

print("\n" + "="*80)
print("TRAINING BEST MODEL")
print("="*80)

# Use Random Forest as best model
best_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=100,
    random_state=42,
    n_jobs=-1
)

print("Training Random Forest for explainability...")
best_model.fit(X_train, y_train)
print("✓ Model trained")

gc.collect()



TRAINING BEST MODEL
Training Random Forest for explainability...
✓ Model trained


29

In [27]:
# ============================================================================
# 6. SHAP - EXPLAINABILITY (CRITICAL!)
# ============================================================================

print("\n" + "="*80)
print("SHAP - MODEL EXPLAINABILITY")
print("="*80)

try:
    import shap
    
    # Use smaller sample for SHAP (memory intensive)
    sample_size = min(500, len(X_test))
    X_test_sample = X_test.sample(sample_size, random_state=42)
    
    print(f"Creating SHAP explainer for {sample_size} samples...")
    explainer = shap.TreeExplainer(best_model, X_train.sample(min(1000, len(X_train))))
    
    print("Calculating SHAP values...")
    shap_values = explainer.shap_values(X_test_sample)
    
    # For binary classification
    if isinstance(shap_values, list):
        shap_values = shap_values[1]
    
    # Summary plot
    plt.figure(figsize=(10, 8))
    shap.summary_plot(shap_values, X_test_sample, show=False, max_display=20)
    plt.title("SHAP Feature Importance", fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig('shap_summary.png', dpi=300, bbox_inches='tight')
    plt.close()
    print("✓ SHAP summary plot saved")
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': np.abs(shap_values).mean(axis=0)
    }).sort_values('importance', ascending=False)
    
    print("\n✓ Top 15 Features by SHAP:")
    print(feature_importance.head(15).to_string(index=False))
    
    feature_importance.to_csv('shap_feature_importance.csv', index=False)
    print("\n✓ SHAP importance saved to 'shap_feature_importance.csv'")
    
    # Dependence plot for top feature
    top_feature = feature_importance.iloc[0]['feature']
    plt.figure(figsize=(10, 6))
    shap.dependence_plot(
        top_feature, 
        shap_values, 
        X_test_sample,
        show=False
    )
    plt.title(f"SHAP Dependence Plot: {top_feature}", fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig('shap_dependence.png', dpi=300, bbox_inches='tight')
    plt.close()
    print(f"✓ SHAP dependence plot saved for {top_feature}")
    
    del explainer, shap_values
    gc.collect()
    
except ImportError:
    print("⚠ SHAP not installed. Run: pip install shap")
except Exception as e:
    print(f"⚠ SHAP error: {str(e)}")



SHAP - MODEL EXPLAINABILITY
Creating SHAP explainer for 500 samples...
Calculating SHAP values...




✓ SHAP summary plot saved
⚠ SHAP error: Per-column arrays must each be 1-dimensional


<Figure size 1000x800 with 0 Axes>

In [28]:
# ============================================================================
# 7. LIME - LOCAL EXPLANATIONS
# ============================================================================

print("\n" + "="*80)
print("LIME - LOCAL INTERPRETABILITY")
print("="*80)

try:
    from lime.lime_tabular import LimeTabularExplainer
    
    # Create LIME explainer with smaller sample
    sample_train = X_train.sample(min(5000, len(X_train)), random_state=42)
    
    lime_explainer = LimeTabularExplainer(
        sample_train.values,
        feature_names=X_train.columns.tolist(),
        class_names=['No Dementia', 'Dementia'],
        mode='classification',
        random_state=42
    )
    
    # Explain 3 predictions (1 dementia, 1 no dementia, 1 borderline)
    dementia_idx = y_test[y_test == 1].index[0]
    no_dementia_idx = y_test[y_test == 0].index[0]
    
    instances = [
        ('Dementia Case', X_test.loc[dementia_idx].values),
        ('No Dementia Case', X_test.loc[no_dementia_idx].values)
    ]
    
    for i, (label, instance) in enumerate(instances):
        print(f"\nExplaining {label}...")
        
        exp = lime_explainer.explain_instance(
            instance,
            best_model.predict_proba,
            num_features=10
        )
        
        # Save as figure
        fig = exp.as_pyplot_figure()
        plt.title(f"LIME Explanation: {label}", fontsize=14, fontweight='bold')
        plt.tight_layout()
        plt.savefig(f'lime_explanation_{i+1}.png', dpi=300, bbox_inches='tight')
        plt.close()
        
        print(f"✓ LIME explanation {i+1} saved")
    
    print("\n✓ LIME explanations complete")
    
    del lime_explainer
    gc.collect()
    
except ImportError:
    print("⚠ LIME not installed. Run: pip install lime")
except Exception as e:
    print(f"⚠ LIME error: {str(e)}")



LIME - LOCAL INTERPRETABILITY

Explaining Dementia Case...
✓ LIME explanation 1 saved

Explaining No Dementia Case...
✓ LIME explanation 2 saved

✓ LIME explanations complete


In [29]:
# ============================================================================
# 8. FEATURE IMPORTANCE FROM RANDOM FOREST
# ============================================================================

print("\n" + "="*80)
print("RANDOM FOREST FEATURE IMPORTANCE")
print("="*80)

rf_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\n✓ Top 15 Features by Random Forest:")
print(rf_importance.head(15).to_string(index=False))

# Plot
plt.figure(figsize=(10, 8))
plt.barh(rf_importance['feature'][:20][::-1], rf_importance['importance'][:20][::-1])
plt.xlabel('Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.title('Random Forest Feature Importance (Top 20)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('rf_feature_importance.png', dpi=300, bbox_inches='tight')
plt.close()

rf_importance.to_csv('rf_feature_importance.csv', index=False)
print("\n✓ RF importance saved to 'rf_feature_importance.csv'")


RANDOM FOREST FEATURE IMPORTANCE

✓ Top 15 Features by Random Forest:
       feature  importance
        COMMUN    0.125360
        MEMORY    0.124515
       CDRGLOB    0.108755
      JUDGMENT    0.095744
      SHOPPING    0.059369
        ORIENT    0.059151
      HOMEHOBB    0.049662
      REMDATES    0.046906
      INDEPEND    0.041238
ADL_IMPAIRMENT    0.040434
         BILLS    0.035710
        TRAVEL    0.033976
        EVENTS    0.032757
         TAXES    0.027671
         GAMES    0.024485

✓ RF importance saved to 'rf_feature_importance.csv'


In [30]:
# ============================================================================
# 9. FINAL SUMMARY
# ============================================================================

print("\n" + "="*80)
print("MODELING & EXPLAINABILITY COMPLETE!")
print("="*80)

best_result = results_df.loc[results_df['F1'].idxmax()]

print(f"""
Best Model Performance:
- Feature Set: {best_result['Feature_Set']}
- Model: {best_result['Model']}
- Accuracy: {best_result['Accuracy']:.4f}
- Precision: {best_result['Precision']:.4f}
- Recall: {best_result['Recall']:.4f}
- F1-Score: {best_result['F1']:.4f}
- ROC-AUC: {best_result['AUC']:.4f}

Generated Files:
✓ model_results.csv - All model results
✓ shap_summary.png - SHAP feature importance
✓ shap_dependence.png - SHAP dependence plot
✓ shap_feature_importance.csv - SHAP importance scores
✓ lime_explanation_1.png - LIME explanation (dementia)
✓ lime_explanation_2.png - LIME explanation (no dementia)
✓ rf_feature_importance.png - RF importance plot
✓ rf_feature_importance.csv - RF importance scores
""")


MODELING & EXPLAINABILITY COMPLETE!

Best Model Performance:
- Feature Set: Original
- Model: Gradient Boosting
- Accuracy: 0.9532
- Precision: 0.9295
- Recall: 0.9102
- F1-Score: 0.9198
- ROC-AUC: 0.9897

Generated Files:
✓ model_results.csv - All model results
✓ shap_summary.png - SHAP feature importance
✓ shap_dependence.png - SHAP dependence plot
✓ shap_feature_importance.csv - SHAP importance scores
✓ lime_explanation_1.png - LIME explanation (dementia)
✓ lime_explanation_2.png - LIME explanation (no dementia)
✓ rf_feature_importance.png - RF importance plot
✓ rf_feature_importance.csv - RF importance scores

