# Feature Selection Methods Tutorial
## Filter Methods, Wrapper Methods & Train-Test Evaluation

In [None]:
# Import libraries
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

## 1. Load and Prepare Data

In [None]:
# Pull scikit-learn's bundled breast cancer dataset into plain arrays
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names

# Summarize the dimensions and the label values before any preprocessing
print(
    f"Dataset shape: {X.shape}",
    f"Features: {len(feature_names)}",
    f"Classes: {np.unique(y)}",
    sep="\n",
)

In [None]:
# Hold out 30% of the samples for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training partition only, then apply
# that same fitted scaler to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set: {X_train_scaled.shape}")
print(f"Test set: {X_test_scaled.shape}")

## 2. Filter Method - F-test

In [None]:
# Score every feature with the F-test on the training data and keep the
# 10 highest-scoring ones; the fitted selector is then reused on the test set
selector_f = SelectKBest(score_func=f_classif, k=10).fit(X_train_scaled, y_train)
X_train_f = selector_f.transform(X_train_scaled)
X_test_f = selector_f.transform(X_test_scaled)

# List the names of the features that passed the filter
selected_features = feature_names[selector_f.get_support()]
print(f"Selected features ({len(selected_features)}):")
for position, name in enumerate(selected_features, start=1):
    print(f"{position}. {name}")

## 3. Wrapper Method - RFE

In [None]:
# Recursive Feature Elimination: a 50-tree random forest ranks the features
# and RFE prunes them until 10 remain
estimator = RandomForestClassifier(n_estimators=50, random_state=42)
selector_rfe = RFE(estimator=estimator, n_features_to_select=10)
selector_rfe.fit(X_train_scaled, y_train)

# Reduce both partitions with the selector fitted on the training data
X_train_rfe = selector_rfe.transform(X_train_scaled)
X_test_rfe = selector_rfe.transform(X_test_scaled)

# List the names of the features RFE kept
rfe_mask = selector_rfe.get_support()
selected_features_rfe = feature_names[rfe_mask]
print(f"RFE selected features ({len(selected_features_rfe)}):")
for position, name in enumerate(selected_features_rfe, start=1):
    print(f"{position}. {name}")

## 4. Performance Evaluation

In [None]:
def evaluate_method(X_train, X_test, method_name):
    """Fit a random forest on one feature set and score it on both splits.

    The labels are not passed in: the function reads the module-level
    ``y_train`` and ``y_test``, so those must exist before it is called.

    Args:
        X_train: Training feature matrix (rows aligned with ``y_train``).
        X_test: Test feature matrix (rows aligned with ``y_test``).
        method_name: Label stored under ``'method'`` in the result.

    Returns:
        dict with keys ``'method'``, ``'features'`` (number of columns in
        ``X_train``), ``'train_acc'``, ``'test_acc'`` and ``'gap'``
        (train accuracy minus test accuracy).
    """
    # Fixed seed so repeated calls on the same data give the same forest
    forest = RandomForestClassifier(random_state=42).fit(X_train, y_train)

    acc_on_train = accuracy_score(y_train, forest.predict(X_train))
    acc_on_test = accuracy_score(y_test, forest.predict(X_test))

    return dict(
        method=method_name,
        features=X_train.shape[1],
        train_acc=acc_on_train,
        test_acc=acc_on_test,
        gap=acc_on_train - acc_on_test,
    )

# One entry per feature set to compare: (label, train matrix, test matrix)
feature_sets = [
    ('Baseline', X_train_scaled, X_test_scaled),
    ('Filter (F-test)', X_train_f, X_test_f),
    ('Wrapper (RFE)', X_train_rfe, X_test_rfe),
]

# Score each feature set in order with the shared evaluation routine
results = [
    evaluate_method(train_matrix, test_matrix, label)
    for label, train_matrix, test_matrix in feature_sets
]

# Tabulate the scores, rounded to four decimals for display
df_results = pd.DataFrame(results)
print(df_results.round(4))

## 5. Visualization

In [None]:
import matplotlib.pyplot as plt

# Grouped bar chart: one train/test pair of bars per method
positions = np.arange(len(df_results))
bar_width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))

# Shift each series half a bar width so the pair sits centred on its tick
ax.bar(
    positions - bar_width / 2,
    df_results['train_acc'],
    bar_width,
    label='Train Accuracy',
    alpha=0.8,
)
ax.bar(
    positions + bar_width / 2,
    df_results['test_acc'],
    bar_width,
    label='Test Accuracy',
    alpha=0.8,
)

ax.set_xlabel('Feature Selection Method')
ax.set_ylabel('Accuracy')
ax.set_title('Feature Selection Methods Comparison')
ax.set_xticks(positions)
ax.set_xticklabels(df_results['method'])
ax.legend()
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

## 6. Feature Importance Analysis

In [None]:
# Column positions of the features the F-test selector kept, plus the
# selector's score for every original feature
selected_indices = selector_f.get_support(indices=True)
f_scores = selector_f.scores_

# Pair each kept feature name with its score, then order best-first
unsorted_importance = pd.DataFrame(
    {
        'feature': feature_names[selected_indices],
        'f_score': f_scores[selected_indices],
    }
)
feature_importance = unsorted_importance.sort_values(by='f_score', ascending=False)

print("Top 10 Features by F-score:")
print(feature_importance)

In [None]:
# Horizontal bar chart of the F-scores, one bar per selected feature
row_positions = range(len(feature_importance))

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(row_positions, feature_importance['f_score'])
ax.set_yticks(row_positions)
ax.set_yticklabels(feature_importance['feature'])
ax.set_xlabel('F-score')
ax.set_title('Feature Importance (F-test Scores)')

# Flip the y-axis so the first (highest-scoring) row is drawn at the top
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## 7. Key Takeaways

1. **Filter methods** are fast but may miss feature interactions
2. **Wrapper methods** use a model to judge which features to keep, but are slower because the model is refitted repeatedly
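3. **Always split the data before feature selection**, and fit scalers and selectors on the training set only, so no information from the test set leaks into the selection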
4. **Monitor overfitting** by comparing train vs test accuracy
5. **Feature selection** can maintain performance with fewer features