# Feature Engineering Basics

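Learn how to prepare raw data for machine learning: scaling numeric features, encoding categorical variables, filling in missing values, and selecting the most informative features.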

## Table of Contents
1. [Feature Scaling](#scaling)
2. [Encoding Categorical Variables](#encoding)
3. [Handling Missing Values](#missing)
4. [Feature Selection](#selection)

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
import matplotlib.pyplot as plt
import seaborn as sns

## Feature Scaling <a id='scaling'></a>

In [None]:
# Toy dataset: five samples with two features on very different scales
# (first column runs 1-5, second column runs 100-500).
data = np.array([[1, 100], [2, 200], [3, 300], [4, 400], [5, 500]])
print("Original data:", data, sep="\n")

# Standardization (z-score): learn per-column statistics from the data,
# then apply them to that same data.
scaler = StandardScaler().fit(data)
standardized = scaler.transform(data)
print("\nStandardized:", standardized, sep="\n")

# Min-max scaling, fitted and applied in the same two steps.
minmax = MinMaxScaler().fit(data)
normalized = minmax.transform(data)
print("\nMin-Max scaled:", normalized, sep="\n")

# Visualize: one scatter panel per version of the data, with the first
# feature on the x-axis and the second on the y-axis.
panels = (
    ('Original', data),
    ('Standardized', standardized),
    ('Normalized', normalized),
)
fig, axes = plt.subplots(1, len(panels), figsize=(15, 4))
for ax, (title, points) in zip(axes, panels):
    ax.scatter(points[:, 0], points[:, 1])
    ax.set_title(title)
plt.tight_layout()
plt.show()

## Encoding Categorical Variables <a id='encoding'></a>

In [None]:
# Small example frame: two categorical columns (Color, Size) and one
# numeric column (Price).
df = pd.DataFrame(dict(
    Color=['Red', 'Blue', 'Green', 'Red', 'Blue'],
    Size=['S', 'M', 'L', 'M', 'S'],
    Price=[10, 20, 30, 15, 25],
))
print("Original data:", df, sep="\n")

# Label encoding: replace each distinct Color with an integer code and
# store the codes in a new column next to the original.
# NOTE(review): integer codes imply an ordering that a nominal feature such
# as Color does not have; scikit-learn documents LabelEncoder as intended
# for target labels -- confirm this is only meant as a demonstration.
le = LabelEncoder()
color_codes = le.fit_transform(df['Color'])
df['Color_Label'] = color_codes
print("\nWith Label Encoding:", df, sep="\n")

# One-hot encoding of the categorical columns only. drop_first=True omits
# the first indicator column of each variable.
categorical_cols = ['Color', 'Size']
df_encoded = pd.get_dummies(df[categorical_cols], drop_first=True)
print("\nOne-Hot Encoded:", df_encoded, sep="\n")

## Handling Missing Values <a id='missing'></a>

In [None]:
# Build a frame in which every column has exactly one missing entry:
# two numeric columns (A, B) and one string column (C).
df = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5],
    'B': [10, np.nan, 30, 40, 50],
    'C': ['x', 'y', 'z', np.nan, 'y'],
})
print("Data with missing values:", df, sep="\n")
print("\nMissing value counts:", df.isnull().sum(), sep="\n")

# Numeric columns: fill each gap with the mean of its column. The result is
# written into a copy so the original frame keeps its NaNs.
numeric_cols = ['A', 'B']
imputer = SimpleImputer(strategy='mean')
numeric_filled = imputer.fit_transform(df[numeric_cols])
df_imputed = df.copy()
df_imputed[numeric_cols] = numeric_filled
print("\nAfter mean imputation:", df_imputed, sep="\n")

# Categorical column: a mean is undefined for strings, so fill the gap with
# the most frequent value instead.
imputer_cat = SimpleImputer(strategy='most_frequent')
categorical_filled = imputer_cat.fit_transform(df[['C']])
df_imputed['C'] = categorical_filled
print("\nAfter categorical imputation:", df_imputed, sep="\n")

## Feature Selection <a id='selection'></a>

In [None]:
# Generate reproducible sample data: 100 rows of 5 random features.
np.random.seed(42)
X = np.random.rand(100, 5)

# Binary target that depends only on the first two features: it is 1 when
# their sum exceeds 1, otherwise 0.
first_two_sum = X[:, 0] + X[:, 1]
y = (first_two_sum > 1).astype(int)

print("Original features shape:", X.shape)

# Score every feature against the target with the chi-squared statistic and
# keep the 3 best-scoring ones.
# NOTE(review): chi2 requires non-negative inputs (satisfied here) and is
# usually applied to counts or frequencies -- confirm it is the intended
# score function for these continuous features.
selector = SelectKBest(score_func=chi2, k=3).fit(X, y)
X_new = selector.transform(X)

print("Selected features shape:", X_new.shape)
print("Feature scores:", selector.scores_)
print("Selected feature indices:", selector.get_support(indices=True))

## Summary

In this notebook we covered:
- Feature scaling (standardization, normalization)
- Encoding categorical variables
- Handling missing values
- Feature selection

Next: **Machine Learning** algorithms!