In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer, make_classification
from sklearn.feature_selection import (
    SelectKBest, f_classif, chi2, mutual_info_classif,
    RFE, RFECV, SelectFromModel
)
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report
import warnings
# Silence every Python warning so the cell outputs below stay compact.
# NOTE(review): this filter is global, so warnings raised by later cells are hidden too.
warnings.filterwarnings(action='ignore')

# Seed NumPy's global random generator so repeated runs draw the same numbers
np.random.seed(seed=42)

print("Libraries imported successfully!")


In [None]:
# Load the breast cancer dataset and unpack the feature matrix, labels and column names
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names

print(f"Original dataset shape: {X.shape}")
print(f"Number of classes: {len(np.unique(y))}")
print(f"Class distribution: {np.bincount(y)}")

# Append columns of standard-normal noise so feature selection has something to reject.
# The seed is reset right before drawing so this cell yields the same noise on every run.
n_noise_features = 20
np.random.seed(42)
noise_features = np.random.randn(X.shape[0], n_noise_features)
X_with_noise = np.hstack((X, noise_features))

# Column names: the original names first, followed by one generated name per noise column
noise_names = [f'noise_feature_{col}' for col in range(n_noise_features)]
all_feature_names = [*feature_names, *noise_names]

print(f"Dataset with noise shape: {X_with_noise.shape}")

# Stratified 80/20 train/test split with a fixed random_state
split_parts = train_test_split(
    X_with_noise,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")


In [None]:
# Standardize features for statistical tests.
# The scaler is fitted on the training split only and then reused unchanged on the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 1.1 ANOVA F-test: keep the 15 features with the highest F-score
print("=== ANOVA F-test (Filter Method) ===")
f_selector = SelectKBest(f_classif, k=15)
X_train_f = f_selector.fit_transform(X_train_scaled, y_train)
X_test_f = f_selector.transform(X_test_scaled)

# Per-feature scores (all features) and the column indices of the kept features
f_scores = f_selector.scores_
f_selected_features = f_selector.get_support(indices=True)

print(f"Selected {len(f_selected_features)} features using F-test")
print("Top 10 features by F-score:")
# (name, score) pairs ordered from highest to lowest score
feature_scores_f = sorted(
    zip(all_feature_names, f_scores),
    key=lambda pair: pair[1],
    reverse=True,
)
for name, score in feature_scores_f[:10]:
    print(f"  {name}: {score:.2f}")


In [None]:
# 1.2 Mutual Information
print("\n=== Mutual Information (Filter Method) ===")


def mi_score_func(X, y):
    """Score features with mutual_info_classif using a fixed random_state.

    mutual_info_classif adds a small amount of random noise to continuous
    features before estimating. Without an explicit random_state that noise
    comes from NumPy's global generator, so the scores would depend on how
    many random numbers earlier cells consumed and would change when this
    cell is re-run on its own. Pinning the seed makes the cell idempotent.
    """
    return mutual_info_classif(X, y, random_state=42)


# Keep the 15 features with the highest estimated mutual information
mi_selector = SelectKBest(score_func=mi_score_func, k=15)
X_train_mi = mi_selector.fit_transform(X_train_scaled, y_train)
X_test_mi = mi_selector.transform(X_test_scaled)

# Column indices of the kept features and the per-feature scores (all features)
mi_selected_features = mi_selector.get_support(indices=True)
mi_scores = mi_selector.scores_

print(f"Selected {len(mi_selected_features)} features using Mutual Information")
print("Top 10 features by MI score:")
# (name, score) pairs ordered from highest to lowest score
feature_scores_mi = list(zip(all_feature_names, mi_scores))
feature_scores_mi.sort(key=lambda x: x[1], reverse=True)
for name, score in feature_scores_mi[:10]:
    print(f"  {name}: {score:.4f}")


In [None]:
# Visualize filter method results: one bar chart per scoring method, stacked vertically
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))

# Each entry: (axes, scores, title, y-axis label, legend label of the mean line, extra bar styling)
score_panels = [
    (axes[0], f_scores, 'ANOVA F-test Scores for All Features',
     'F-score', 'Mean F-score', {}),
    (axes[1], mi_scores, 'Mutual Information Scores for All Features',
     'MI Score', 'Mean MI score', {'color': 'orange'}),
]

for panel_ax, panel_scores, panel_title, y_label, mean_label, bar_style in score_panels:
    panel_ax.bar(range(len(panel_scores)), panel_scores, alpha=0.7, **bar_style)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Feature Index')
    panel_ax.set_ylabel(y_label)
    # Dashed red line marks the mean score across all features
    panel_ax.axhline(y=np.mean(panel_scores), color='r', linestyle='--', label=mean_label)
    panel_ax.legend()

plt.tight_layout()
plt.show()

# Reminder for reading the charts: the appended noise columns sit at indices 30-49
print("\nNote: Features 30-49 are artificially added noise features")
print("Good feature selection should rank them lower!")


In [None]:
# 2.1 Recursive Feature Elimination (RFE)
print("=== Recursive Feature Elimination (Wrapper Method) ===")

# Logistic regression is the model RFE refits while eliminating features down to 15
estimator = LogisticRegression(max_iter=1000, random_state=42)
rfe_selector = RFE(estimator, n_features_to_select=15)
X_train_rfe = rfe_selector.fit_transform(X_train_scaled, y_train)
X_test_rfe = rfe_selector.transform(X_test_scaled)

# ranking_ holds the fitted selector's rank for every feature;
# get_support(indices=True) gives the column indices that were kept
rfe_ranking = rfe_selector.ranking_
rfe_selected_features = rfe_selector.get_support(indices=True)

print(f"Selected {len(rfe_selected_features)} features using RFE")
print("Selected features:")
for idx in rfe_selected_features:
    selected_name = all_feature_names[idx]
    print(f"  {selected_name}")


In [None]:
# 2.2 Recursive Feature Elimination with Cross-Validation (RFECV)
print("\n=== RFE with Cross-Validation (Wrapper Method) ===")

# Drop one feature per step and score every subset size with 5-fold CV accuracy
rfecv_estimator = LogisticRegression(random_state=42, max_iter=1000)
rfecv_selector = RFECV(estimator=rfecv_estimator, step=1, cv=5, scoring='accuracy')
X_train_rfecv = rfecv_selector.fit_transform(X_train_scaled, y_train)
X_test_rfecv = rfecv_selector.transform(X_test_scaled)

rfecv_selected_features = rfecv_selector.get_support(indices=True)

print(f"Optimal number of features: {rfecv_selector.n_features_}")
print(f"Selected {len(rfecv_selected_features)} features using RFECV")

# Plot RFECV results: mean CV accuracy against the number of features kept.
# The x-axis counts 1..len(scores), one position per recorded score.
rfecv_mean_scores = rfecv_selector.cv_results_['mean_test_score']
rfecv_feature_counts = range(1, len(rfecv_mean_scores) + 1)

rfecv_fig, rfecv_ax = plt.subplots(figsize=(10, 6))
rfecv_ax.plot(rfecv_feature_counts, rfecv_mean_scores)
rfecv_ax.set_xlabel('Number of Features Selected')
rfecv_ax.set_ylabel('Cross-Validation Accuracy')
rfecv_ax.set_title('RFECV: Optimal Number of Features')
# Dashed red line marks the subset size RFECV settled on
rfecv_ax.axvline(
    x=rfecv_selector.n_features_,
    color='r',
    linestyle='--',
    label=f'Optimal: {rfecv_selector.n_features_} features',
)
rfecv_ax.legend()
rfecv_ax.grid(True, alpha=0.3)
plt.show()


In [None]:
# 3.1 LASSO Regularization (L1)
print("=== LASSO Regularization (Embedded Method) ===")

# NOTE(review): Lasso is a linear regressor and is fitted here directly on the class labels;
# confirm that is intended rather than an L1-penalised classifier.

# Sweep the regularisation strength and record how many coefficients stay non-zero
alphas = [0.001, 0.01, 0.1, 1.0]
lasso_results = {}

for alpha in alphas:
    lasso = Lasso(alpha=alpha, max_iter=1000, random_state=42).fit(X_train_scaled, y_train)

    # A feature counts as selected when its coefficient is not exactly zero
    n_selected = (lasso.coef_ != 0).sum()
    lasso_results[alpha] = dict(n_features=n_selected, coefficients=lasso.coef_)

    print(f"Alpha={alpha}: {n_selected} features selected")

# Use SelectFromModel with LASSO at alpha=0.01 to produce the reduced matrices
lasso_selector = SelectFromModel(
    Lasso(alpha=0.01, random_state=42, max_iter=1000)
)
X_train_lasso = lasso_selector.fit_transform(X_train_scaled, y_train)
X_test_lasso = lasso_selector.transform(X_test_scaled)

lasso_selected_features = lasso_selector.get_support(indices=True)
print(f"\nSelected {len(lasso_selected_features)} features using LASSO (alpha=0.01)")


In [None]:
# 3.2 Random Forest Feature Importance
print("\n=== Random Forest Feature Importance (Embedded Method) ===")

# Fit a 100-tree forest and read one importance value per feature from it
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)
feature_importances = rf.feature_importances_

# prefit=True: SelectFromModel reuses the forest fitted above instead of fitting again
rf_selector = SelectFromModel(rf, prefit=True)
X_train_rf, X_test_rf = (
    rf_selector.transform(X_train_scaled),
    rf_selector.transform(X_test_scaled),
)

rf_selected_features = rf_selector.get_support(indices=True)
print(f"Selected {len(rf_selected_features)} features using Random Forest importance")

# Show top features by importance, highest first
print("Top 10 features by Random Forest importance:")
feature_importance_pairs = sorted(
    zip(all_feature_names, feature_importances),
    key=lambda pair: pair[1],
    reverse=True,
)
for name, importance in feature_importance_pairs[:10]:
    print(f"  {name}: {importance:.4f}")


In [None]:
# Apply PCA
print("=== Principal Component Analysis (PCA) ===")

# Project the standardized features onto the first 15 principal components.
# The projection is fitted on the training split and reused on the test split.
pca = PCA(n_components=15)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total explained variance: {np.sum(pca.explained_variance_ratio_):.4f}")

# Plot explained variance
plt.figure(figsize=(12, 5))

# Left panel: variance share of each individual component
plt.subplot(1, 2, 1)
plt.bar(range(1, len(pca.explained_variance_ratio_) + 1), 
        pca.explained_variance_ratio_)
plt.title('Explained Variance by Principal Component')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')

# Right panel: running total of the variance share, with the 95% target marked
plt.subplot(1, 2, 2)
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), 
         np.cumsum(pca.explained_variance_ratio_), 'bo-')
plt.title('Cumulative Explained Variance')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.axhline(y=0.95, color='r', linestyle='--', label='95% variance')
plt.legend()

plt.tight_layout()
plt.show()

# Cumulative variance of the 15 retained components (same values as the right-hand plot)
cumsum_variance = np.cumsum(pca.explained_variance_ratio_)

# Find number of components for 95% variance.
# This must not be read off the truncated 15-component fit: if those components
# never reach 0.95 the boolean mask is all False, np.argmax returns 0 and the
# answer would be reported as "1". A PCA that keeps every component always
# crosses the threshold, so the first index at or above 0.95 is well defined.
pca_full = PCA().fit(X_train_scaled)
cumsum_variance_full = np.cumsum(pca_full.explained_variance_ratio_)
n_components_95 = np.argmax(cumsum_variance_full >= 0.95) + 1
print(f"\nNumber of components needed for 95% variance: {n_components_95}")
