# Feature Engineering: The Art of Creating Better Features

> **"Feature engineering is where domain expertise meets machine learning."**

## Learning Objectives
- Master advanced feature engineering techniques
- Learn about feature selection and dimensionality reduction
- Understand how to handle different data types (numerical, categorical, text)
- Implement automated feature engineering pipelines
- Apply feature engineering to real-world datasets


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import warnings
# Silence library warnings so the tutorial output stays readable.
warnings.filterwarnings('ignore')

# Set style
# The seaborn style sheets were renamed to 'seaborn-v0_8*' in matplotlib 3.6.
# Fall back to the legacy name so the notebook also runs on older releases,
# where 'seaborn-v0_8' does not exist and plt.style.use() would raise.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette("husl")

# Seed NumPy's global generator so the random draws below are repeatable.
np.random.seed(42)

print("Libraries imported successfully!")


## 1. Feature Engineering Fundamentals

### What is Feature Engineering?
Feature engineering is the process of creating new features or transforming existing ones to improve machine learning model performance. It's often considered the most important step in the machine learning pipeline.

### Key Techniques

#### 1. Feature Scaling
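- **Standardization**: z = (x - μ) / σ, where μ is the feature mean and σ its standard deviation
- **Min-Max Scaling**: x' = (x - min) / (max - min), which maps values into the range [0, 1]
- **Robust Scaling**: x' = (x - median) / IQR, where IQR = Q3 - Q1 is the interquartile range (less sensitive to outliers)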

#### 2. Feature Selection
- **Filter Methods**: Statistical tests, correlation analysis
- **Wrapper Methods**: Forward/backward selection, recursive feature elimination
- **Embedded Methods**: L1 regularization, tree-based feature importance

#### 3. Feature Creation
- **Polynomial Features**: x², x³, interactions
- **Binning**: Convert continuous values into discrete (categorical) intervals
- **Domain Knowledge**: Create features based on business logic


In [None]:
# Build a reproducible synthetic dataset to practise feature engineering on.
# NOTE: the order of the np.random calls below determines the generated
# values, so keep it unchanged.
np.random.seed(42)

# Dataset dimensions
n_samples = 1000
n_features = 10

# Feature matrix: independent standard-normal draws
X = np.random.randn(n_samples, n_features)

# Column labels: feature_0 ... feature_9
feature_names = ['feature_' + str(idx) for idx in range(n_features)]

# Only the first three features carry signal; the other seven weights are zero:
# y = 2*feature_0 + 1.5*feature_1 - 0.8*feature_2 + noise
true_coeffs = np.array([2.0, 1.5, -0.8] + [0] * 7)
y = np.add(X @ true_coeffs, np.random.normal(loc=0, scale=0.5, size=n_samples))

# Threshold the continuous response at its median to get a 0/1 label.
# This is computed before any values are corrupted below.
y_binary = (y > np.median(y)).astype(int)

# Blank out cells at random (each cell independently with probability 0.05)
missing_mask = np.random.random(size=(n_samples, n_features)) < 0.05
X[missing_mask] = np.nan

# Shift feature_0 by wide Gaussian noise in randomly chosen rows
# (each row independently with probability 0.02) to simulate outliers
outlier_mask = np.random.random(size=n_samples) < 0.02
X[outlier_mask, 0] += np.random.normal(0, 5, outlier_mask.sum())

# Assemble features and label into a single table
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y_binary

# Text summary of the generated data
print("Dataset Overview:")
print("=" * 50)
print(f"Shape: {df.shape}")
print("Missing values per column:")
print(df.isna().sum())
print("\nTarget distribution:")
print(df['target'].value_counts())

# Visualize the data: a 2x2 dashboard showing target balance, correlations,
# the missing-value pattern and the spread of the first few features.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Target distribution
axes[0, 0].hist(df['target'], bins=2, alpha=0.7, edgecolor='black')
axes[0, 0].set_title('Target Distribution')
axes[0, 0].set_xlabel('Target')
axes[0, 0].set_ylabel('Frequency')

# Feature correlations
correlation_matrix = df.corr()
im = axes[0, 1].imshow(correlation_matrix, cmap='coolwarm', aspect='auto')
axes[0, 1].set_title('Feature Correlation Matrix')
# df.corr() also includes the 'target' column, so take the tick labels from
# the matrix itself; feature_names is one entry short and would leave the
# last row/column unlabelled.
corr_labels = list(correlation_matrix.columns)
axes[0, 1].set_xticks(range(len(corr_labels)))
axes[0, 1].set_yticks(range(len(corr_labels)))
axes[0, 1].set_xticklabels(corr_labels, rotation=45)
axes[0, 1].set_yticklabels(corr_labels)
plt.colorbar(im, ax=axes[0, 1])

# Missing values heatmap (rows = columns of df, columns = samples)
missing_data = df.isnull()
axes[1, 0].imshow(missing_data.T, cmap='viridis', aspect='auto')
axes[1, 0].set_title('Missing Values Pattern')
axes[1, 0].set_xlabel('Sample Index')
axes[1, 0].set_ylabel('Features')
# Name each row so the heatmap can be read without counting rows.
axes[1, 0].set_yticks(range(len(missing_data.columns)))
axes[1, 0].set_yticklabels(missing_data.columns)

# Feature distributions
df.iloc[:, :5].boxplot(ax=axes[1, 1])
axes[1, 1].set_title('Feature Distributions (First 5)')
axes[1, 1].set_ylabel('Value')

plt.tight_layout()
plt.show()


In [None]:
# Feature Engineering Pipeline
print("Feature Engineering Pipeline:")
print("=" * 50)

# ---- Step 1: fill in the missing values ----
print("Step 1: Handling Missing Values")
print("-" * 30)

# Report the per-column gap count before anything is filled in
print("Missing values before imputation:")
print(df.isna().sum())

from sklearn.impute import SimpleImputer

# Split the table into the predictor columns and the label column
X = df.drop(columns='target')
y = df['target']

# Replace every NaN with the median of its column
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

# Wrap the imputed values in a labelled DataFrame again
X_imputed_df = pd.DataFrame(data=X_imputed, columns=feature_names)

# Total number of NaNs left over all columns
print("\nMissing values after imputation:")
print(X_imputed_df.isna().sum().sum())

# Step 2: Feature Scaling
print("\nStep 2: Feature Scaling")
print("-" * 30)

# Compare different scaling methods; None means "leave the data untouched"
scalers = {
    'Original': None,
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler()
}

# Map each method name to the (possibly rescaled) imputed feature matrix
scaled_data = {}
for name, scaler in scalers.items():
    if scaler is None:
        scaled_data[name] = X_imputed
    else:
        scaled_data[name] = scaler.fit_transform(X_imputed)

# Visualize scaling effects: one boxplot panel per method, first 5 features
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for i, (name, data) in enumerate(scaled_data.items()):
    axes[i].boxplot(data[:, :5])
    # Label the boxes through the axis instead of boxplot(labels=...): that
    # keyword was renamed to tick_labels in matplotlib 3.9 and is deprecated,
    # whereas set_xticklabels works on every version.
    axes[i].set_xticklabels(feature_names[:5])
    axes[i].set_title(f'{name} Scaling')
    axes[i].set_ylabel('Value')
    axes[i].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Step 3: Feature Selection
print("\nStep 3: Feature Selection")
print("-" * 30)

# Use StandardScaler for feature selection
X_scaled = StandardScaler().fit_transform(X_imputed)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Feature selection methods: keep the 5 highest-scoring features under each
# univariate scoring function
selection_methods = {
    'F-test': SelectKBest(f_classif, k=5),
    'Mutual Information': SelectKBest(mutual_info_classif, k=5)
}

selected_features = {}
for name, selector in selection_methods.items():
    # Score and choose the features on the training rows only
    X_selected = selector.fit_transform(X_train, y_train)

    # Store the reduced matrix for ALL rows, not just the training subset:
    # the model comparison later splits each feature set together with the
    # full-length y, so a training-only matrix would have a mismatched
    # number of rows and train_test_split would raise a ValueError.
    selected_features[name] = selector.transform(X_scaled)

    # Get selected feature indices
    selected_indices = selector.get_support(indices=True)
    selected_feature_names = [feature_names[i] for i in selected_indices]

    print(f"{name} selected features: {selected_feature_names}")

# Step 4: Feature Creation
print("\nStep 4: Feature Creation")
print("-" * 30)

# Create polynomial features
from sklearn.preprocessing import PolynomialFeatures

# Create polynomial features (degree 2): the original columns plus their
# squares and pairwise products, without the constant bias column
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_scaled)

print(f"Original features: {X_scaled.shape[1]}")
print(f"Polynomial features: {X_poly.shape[1]}")

# Create interaction features: the product of every unordered pair of
# distinct columns, in (0,1), (0,2), ..., (8,9) order.
# Collect the columns first and stack once; calling np.column_stack inside
# the loop would copy the whole growing matrix for every pair.
n_base_features = X_scaled.shape[1]
interaction_columns = [
    X_scaled[:, i] * X_scaled[:, j]
    for i in range(n_base_features)
    for j in range(i + 1, n_base_features)
]
X_interaction = np.column_stack([X_scaled, *interaction_columns])

print(f"With interaction features: {X_interaction.shape[1]}")

# Step 5: Model Performance Comparison
print("\nStep 5: Model Performance Comparison")
print("-" * 30)

# Candidate feature matrices, keyed by a display name.
# (Despite the variable name, these are feature sets, not fitted models.)
models = {
    'Original': X_scaled,
    'F-test Selected': selected_features['F-test'],
    'Mutual Info Selected': selected_features['Mutual Information'],
    'Polynomial': X_poly,
    'With Interactions': X_interaction,
}

# Accuracy on the held-out 20% for each feature set
results = {}
for name, X_data in models.items():
    # Same test_size and random_state for every feature set
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y, test_size=0.2, random_state=42
    )

    # Fit a random forest with a fixed seed
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Score the predictions on the held-out rows
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    results[name] = accuracy
    print("{}: {:.3f}".format(name, accuracy))

# Bar chart of the accuracy achieved with each feature set
plt.figure(figsize=(12, 6))
names = list(results)
accuracies = [results[key] for key in names]

bars = plt.bar(names, accuracies, alpha=0.7)
plt.title('Model Performance with Different Feature Sets')
plt.xlabel('Feature Set')
plt.ylabel('Accuracy')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)

# Print each accuracy just above the top of its bar, centred horizontally
for bar, accuracy in zip(bars, accuracies):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.01
    plt.text(label_x, label_y, f'{accuracy:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()
