## 5. IRRELEVANT FEATURES

### Definition
**Garbage IN → Garbage OUT**: Including irrelevant features hurts model performance.

### Why Irrelevant Features Are Bad:

```
1. Noise introduction: Model confuses signal with noise
2. Overfitting risk: Model learns random patterns
3. Computational cost: Training slower, higher memory
4. Reduced interpretability: Predictions become harder to explain
5. Curse of dimensionality: Performance degrades in high dimensions
```

### Example: Irrelevant Feature Problem


In [None]:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

# Experiment: measure how cross-validated R² of a random forest changes as
# pure-noise columns are appended to a synthetic regression dataset.

# One fixed seed drives data generation, the noise columns and the model, so
# the printed scores and the plot are the same on every run.
SEED = 42
rng = np.random.default_rng(SEED)

# Create synthetic data
X, y = make_regression(
    n_samples=200,
    n_features=10,
    n_informative=5,
    noise=10,
    random_state=SEED,
)

# Add completely irrelevant (random noise) features
n_irrelevant = [0, 5, 10, 20, 50]
scores = []

for n_irrel in n_irrelevant:
    # Add random noise features; the row count is taken from X rather than
    # hard-coded so it stays correct if n_samples above is changed.
    noise_features = rng.standard_normal((X.shape[0], n_irrel))
    X_with_noise = np.column_stack([X, noise_features])

    # Train model (seeded so score differences come from the added columns,
    # not from a different random forest on each iteration)
    model = RandomForestRegressor(n_estimators=50, random_state=SEED)
    score = cross_val_score(model, X_with_noise, y, cv=5).mean()
    scores.append(score)

    print(f"Features: {X_with_noise.shape[1]}, Score: {score:.3f}")

# Plot: More irrelevant features = worse performance!
# The x-axis counts only the noise columns added on top of the 10 base features.
plt.figure(figsize=(10, 6))
plt.plot(n_irrelevant, scores, marker='o', linewidth=2)
plt.xlabel('Number of Irrelevant Features')
plt.ylabel('Model R² Score')
plt.title('Effect of Irrelevant Features on Model Performance')
plt.grid(True, alpha=0.3)
plt.show()


### Feature Selection Methods:

#### 1. **Filter Methods** (Fast, independent of model)


In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.datasets import load_breast_cancer

# Filter methods: score each feature against the target independently of any
# downstream model, then keep the top-k.
from functools import partial

# Load data
cancer = load_breast_cancer()
X = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y = cancer.target

print(f"Original features: {X.shape[1]}")

# Method 1: Statistical tests (ANOVA F-score)
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)

print(f"After SelectKBest: {X_selected.shape[1]}")

# See which features were selected
selected_features = X.columns[selector.get_support()]
print("\nSelected features:")
print(selected_features.tolist())

# Method 2: Mutual information
# mutual_info_classif is randomized, so the seed is pinned to make the
# selection repeatable between runs.
selector_mi = SelectKBest(
    score_func=partial(mutual_info_classif, random_state=42),
    k=10,
)
X_selected_mi = selector_mi.fit_transform(X, y)

# Method 3: Correlation with target
# DataFrame.corrwith needs a Series/DataFrame, not a NumPy array, so the
# target is wrapped in a Series that shares X's index.
target = pd.Series(y, index=X.index, name='target')
correlations = X.corrwith(target).abs().sort_values(ascending=False)
print("\nTop 10 correlated features:")
print(correlations.head(10))

top_features = correlations.head(10).index
X_filtered = X[top_features]


#### 2. **Wrapper Methods** (Uses model performance to select)


In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Wrapper methods: repeatedly fit a model on candidate feature subsets and
# keep the subset the model itself ranks or scores best.
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Recursive Feature Elimination
# The forest is seeded so the selected feature lists are the same on each run.
model = RandomForestClassifier(n_estimators=50, random_state=42)
rfe = RFE(model, n_features_to_select=10)
X_rfe = rfe.fit_transform(X, y)

print("RFE Selected features:")
selected = X.columns[rfe.support_]
print(selected.tolist())

# Forward/Backward selection
sfs = SFS(
    model,
    k_features=10,
    forward=True,  # Forward (start with empty), False for backward
    verbose=1,
    n_jobs=-1
)

sfs.fit(X, y)
print("\nSequential selection features:")
print(list(sfs.k_feature_names_))


#### 3. **Embedded Methods** (Features selected during training)


In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Embedded methods: the feature ranking falls out of fitting the model itself
# (tree importances, or coefficients driven to zero by an L1 penalty).

# Tree-based feature importance
# Seeded so the importance ranking and the median threshold are repeatable.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# Get feature importances
importances = pd.DataFrame({
    'feature': cancer.feature_names,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 10 important features:")
print(importances.head(10))

# Select features above threshold
# prefit=True reuses the forest fitted above instead of refitting it.
selector = SelectFromModel(model, prefit=True, threshold='median')
X_selected = selector.transform(X)

print(f"Features selected: {X_selected.shape[1]}/{X.shape[1]}")

# L1 regularization (Lasso)
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# The L1 penalty acts on coefficient magnitude, which depends on each
# feature's units. Standardize first so which coefficients hit zero reflects
# relevance rather than scale.
X_scaled = StandardScaler().fit_transform(X)

l1_model = LogisticRegression(penalty='l1', solver='liblinear', C=1.0)
l1_model.fit(X_scaled, y)

# Non-zero coefficients = selected features
# Scaling keeps column order, so the coefficient mask still lines up with X.columns.
selected_l1 = X.columns[l1_model.coef_[0] != 0]
print(f"\nL1 selected: {len(selected_l1)} features")


#### 4. **Domain Expertise**


In [None]:
# Sometimes simple domain knowledge beats automated selection!

# Example: Predicting house prices
all_features = [
    'square_feet',      # ✅ Obviously important
    'bedrooms',         # ✅ Obviously important
    'bathrooms',        # ✅ Obviously important
    'location',         # ✅ Obviously important
    'color',            # ❌ Irrelevant
    'owner_height',     # ❌ Irrelevant
    'owner_age',        # ❌ Irrelevant (generally)
    'year_built',       # ✅ Important
]

# Use domain expertise to select
important_features = [
    'square_feet',
    'bedrooms',
    'bathrooms',
    'location',
    'year_built'
]

# Small made-up housing table, one column per entry in all_features, so the
# selection below runs. Indexing the breast-cancer frame with these names
# would raise KeyError.
houses = pd.DataFrame(
    [
        [1400, 3, 2, 'suburb', 'white', 180, 45, 1995],
        [2100, 4, 3, 'city', 'blue', 165, 38, 2008],
        [950, 2, 1, 'rural', 'grey', 172, 61, 1978],
    ],
    columns=all_features,
)

# Keep only the columns judged relevant
X_selected = houses[important_features]


---
