```python
# Hyperparameter Tuning for Machine Learning Classifiers
# This notebook demonstrates methods for optimizing classifier performance through hyperparameter tuning

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
```

# Introduction to Hyperparameter Tuning

Hyperparameters are model configuration settings used to control the learning process. Unlike model parameters that are learned during training, hyperparameters must be set before training begins. Proper tuning can significantly improve model performance.

Key concepts:
1. Cross-validation: Evaluating model performance on different data splits
2. Grid Search: Systematic search through specified parameter values
3. Random Search: Sampling from parameter distributions
4. Validation curves: Visualizing model performance vs. hyperparameters

```python
# Build a reproducible two-class toy dataset (similar to previous lesson)
np.random.seed(42)
n_samples = 300

# Each class is a 3-feature Gaussian blob with unit scale; every class
# receives half of the samples, and class 0 is drawn before class 1.
X1 = np.random.normal(loc=[8, 2, 3], scale=[1, 1, 1], size=(n_samples // 2, 3))
X2 = np.random.normal(loc=[2, 8, 6], scale=[1, 1, 1], size=(n_samples // 2, 3))

# Stack the blobs row-wise; labels are 0.0 for the first blob, 1.0 for the second
X = np.concatenate((X1, X2), axis=0)
y = np.repeat([0.0, 1.0], n_samples // 2)

# Hold out 20% for testing, then standardise with statistics learned on
# the training portion only
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
```

# Cross-Validation

Cross-validation helps assess model performance more robustly than a single train-test split.

```python
# Score a 5-neighbour KNN with 5-fold cross-validation on the scaled training set
knn = KNeighborsClassifier(n_neighbors=5)
cv_scores = cross_val_score(estimator=knn, X=X_train_scaled, y=y_train, cv=5)

# Report the per-fold scores together with their average and spread
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", np.mean(cv_scores))
print("Standard deviation:", np.std(cv_scores))
```

# Grid Search

Grid Search exhaustively searches through a specified parameter grid to find the best combination.

```python
# Grid Search for KNN: 5 neighbour counts x 2 weightings x 2 metrics
param_grid_knn = dict(
    n_neighbors=[3, 5, 7, 9, 11],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# Evaluate every combination with 5-fold CV accuracy on all available cores;
# fit() returns the search object itself, so it can be chained.
grid_search_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid_knn,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train_scaled, y_train)

print("Best parameters:", grid_search_knn.best_params_)
print("Best cross-validation score:", grid_search_knn.best_score_)
```

# Random Search

Random Search samples from parameter distributions, often finding good parameters more efficiently than Grid Search.

```python
# Random Search for SVM: C and gamma each offer 1000 log-spaced candidates
# between 1e-3 and 1e3; the kernel is either RBF or linear.
param_distributions = dict(
    C=np.logspace(-3, 3, 1000),
    kernel=['rbf', 'linear'],
    gamma=np.logspace(-3, 3, 1000),
)

# Try 100 sampled combinations, each scored by 5-fold CV accuracy;
# random_state fixes which combinations get sampled.
random_search_svm = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=param_distributions,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

print("Best parameters:", random_search_svm.best_params_)
print("Best cross-validation score:", random_search_svm.best_score_)
```

# Validation Curves

Validation curves help visualize how model performance changes with hyperparameter values.

```python
def plot_validation_curve(param_name, param_range, model, X, y, cv=5):
    """Plot the mean cross-validated score against one hyperparameter.

    For every value in ``param_range`` a copy of ``model`` is configured
    with ``param_name`` set to that value and scored with
    ``cross_val_score``. The mean score across folds is drawn as a line and
    a band of +/- one standard deviation is shaded around it.

    Args:
        param_name: Name of the estimator parameter to vary.
        param_range: Sequence of values to try; also used as the x-axis.
        model: Estimator to evaluate. It is cloned for each value, so the
            caller's instance is left unmodified.
        X: Feature matrix forwarded to ``cross_val_score``.
        y: Target vector forwarded to ``cross_val_score``.
        cv: Cross-validation setting forwarded unchanged to
            ``cross_val_score`` (e.g. a fold count).

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Local import: `clone` is not among the notebook's top-level imports.
    from sklearn.base import clone

    mean_scores, std_scores = [], []
    for param_value in param_range:
        # Work on a copy so the estimator passed in keeps its own settings.
        candidate = clone(model).set_params(**{param_name: param_value})
        fold_scores = cross_val_score(candidate, X, y, cv=cv)
        # Summarise per value instead of preallocating a (cv, n_values)
        # array, so the number of folds never has to be known up front.
        mean_scores.append(fold_scores.mean())
        std_scores.append(fold_scores.std())
    mean_scores = np.asarray(mean_scores)
    std_scores = np.asarray(std_scores)

    plt.figure(figsize=(10, 6))
    plt.plot(param_range, mean_scores, label='Cross-validation score')
    plt.fill_between(param_range,
                     mean_scores - std_scores,
                     mean_scores + std_scores,
                     alpha=0.2)
    plt.xlabel(param_name)
    plt.ylabel('Accuracy')
    plt.title(f'Validation Curve for {param_name}')
    plt.legend()
    plt.show()

# Example: validation curve over the odd KNN neighbour counts 1, 3, ..., 29
n_neighbors_range = np.arange(1, 31, 2)
plot_validation_curve(
    param_name='n_neighbors',
    param_range=n_neighbors_range,
    model=KNeighborsClassifier(),
    X=X_train_scaled,
    y=y_train,
)
```

# Random Forest Parameter Tuning

Random Forests have several important hyperparameters to tune.

```python
# Grid Search for Random Forest: 3 x 4 x 3 x 3 = 108 candidate settings
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# The forest is seeded so repeated searches give the same ranking;
# fit() returns the search object itself, so it can be chained.
grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train_scaled, y_train)

print("Best parameters:", grid_search_rf.best_params_)
print("Best cross-validation score:", grid_search_rf.best_score_)
```

# Comparing Tuned Models

```python
# Compare base models vs tuned models.
# Each "Base" entry uses the library defaults; each "Tuned" entry is the
# best estimator found by the corresponding search above.
models = {
    'Base KNN': KNeighborsClassifier(),
    'Tuned KNN': grid_search_knn.best_estimator_,
    'Base SVM': SVC(),
    'Tuned SVM': random_search_svm.best_estimator_,
    # Seed the baseline forest like the tuned one (random_state=42) so the
    # comparison is reproducible and reflects hyperparameters, not RNG noise.
    'Base RF': RandomForestClassifier(random_state=42),
    'Tuned RF': grid_search_rf.best_estimator_
}

# Fit every model on the scaled training data and record accuracy on both
# the training set and the held-out test set.
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    train_score = model.score(X_train_scaled, y_train)
    test_score = model.score(X_test_scaled, y_test)
    results[name] = {'Train Score': train_score, 'Test Score': test_score}

# One row per model, one column per score
results_df = pd.DataFrame(results).T
print("\nModel Performance Comparison:")
print(results_df)
```

# Tips for Hyperparameter Tuning

1. Start with broad parameter ranges and refine
2. Use RandomizedSearchCV for large parameter spaces
3. Consider computational cost vs potential improvement
4. Watch for overfitting (large gap between training and validation scores)
5. Use domain knowledge to guide parameter selection

```python
# Example of parameter refinement
# After finding approximate good values, we can search more finely around them
best_knn_params = grid_search_knn.best_params_
best_k = best_knn_params['n_neighbors']

refined_param_grid = {
    # Probe the immediate neighbours of the best k. KNN needs at least one
    # neighbour, so candidates below 1 are dropped in case the coarse grid
    # ever selects k=1.
    'n_neighbors': [k for k in (best_k - 1, best_k, best_k + 1) if k >= 1],
    # Keep the other winning settings fixed
    'weights': [best_knn_params['weights']],
    'metric': [best_knn_params['metric']]
}

refined_grid_search_knn = GridSearchCV(
    KNeighborsClassifier(),
    refined_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

refined_grid_search_knn.fit(X_train_scaled, y_train)

print("Refined best parameters:", refined_grid_search_knn.best_params_)
print("Refined best cross-validation score:", refined_grid_search_knn.best_score_)
```


```python
# %% [markdown]
# # Hyperparameter Tuning for Machine Learning Classifiers
# 
# **Objective**: Optimize classifier performance through systematic hyperparameter tuning.
# 
# **Key Concepts**:
# - **Hyperparameters**: User-defined settings controlling model behavior (e.g., tree depth, regularization strength).
# - **Overfitting vs Underfitting**: Balance model complexity to avoid memorizing noise (overfitting) or missing patterns (underfitting).
# - **Cross-Validation**: Robust evaluation using data subsets (critical for small planetary science datasets).
# 
# **Tuning Methods**:
# 1. Grid Search
# 2. Randomized Search
# 3. Validation Curves
# 
# ---

# %% [markdown]
# ## Setup
# Add tuning-specific imports

# %%
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, validation_curve

# %% [markdown]
# ## Data Preparation
# Use a smaller subset of the data to simulate the limited sample sizes typical of planetary datasets

# %%
# Keep a stratified 70% of the original data to create a "scarce" planetary dataset
X_sub, _, y_sub, _ = train_test_split(
    X, y, train_size=0.7, random_state=42, stratify=y
)
# Split that subset 80/20 into train and test, again preserving class balance
X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(
    X_sub, y_sub, test_size=0.2, random_state=42, stratify=y_sub
)

# Scale features (only for non-tree-based models).
# A dedicated scaler is fitted here: re-fitting the shared `scaler` would
# silently overwrite the statistics it learned from the full training set.
scaler_sub = StandardScaler()
X_train_sub_scaled = scaler_sub.fit_transform(X_train_sub)
X_test_sub_scaled = scaler_sub.transform(X_test_sub)

# %% [markdown]
# ## Tuning Method 1: Grid Search (Logistic Regression)
# - **Analog**: Testing all combinations of microscope settings
# - **Parameters**: Regularization strength (`C`), penalty type (`l1/l2`)

# %%
# LogisticRegression is used below but is not among the notebook's imports,
# so bring it into scope here to avoid a NameError.
from sklearn.linear_model import LogisticRegression

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']  # liblinear handles both the l1 and l2 penalties
}

# Exhaustive search over 6 C values x 2 penalties with 5-fold CV accuracy
lr_grid = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid,
    cv=5,
    scoring='accuracy'
)
lr_grid.fit(X_train_sub_scaled, y_train_sub)

# best_score_ is the mean CV accuracy; the test accuracy uses the held-out subset
print(f"Best parameters: {lr_grid.best_params_}")
print(f"Validation accuracy: {lr_grid.best_score_:.2f}")
print(f"Test accuracy: {accuracy_score(y_test_sub, lr_grid.predict(X_test_sub_scaled)):.2f}")

# %% [markdown]
# ## Tuning Method 2: Randomized Search (k-NN)
# - **Analog**: Randomly sampling telescope configuration parameters
# - **Parameters**: Neighbors (`n_neighbors`), distance weighting, metric

# %%
# Search space: 12 neighbour counts (3..14), 2 weightings, 2 Minkowski powers
param_dist = dict(
    n_neighbors=np.arange(3, 15),
    weights=['uniform', 'distance'],
    p=[1, 2],  # Manhattan (p=1) vs Euclidean (p=2)
)

# Evaluate 20 sampled combinations with 5-fold CV; random_state fixes the
# sample. fit() returns the search object itself, so it can be chained.
knn_random = RandomizedSearchCV(
    estimator=KNeighborsClassifier(),
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    random_state=42,
).fit(X_train_sub_scaled, y_train_sub)

print(f"Best parameters: {knn_random.best_params_}")
print(f"Validation accuracy: {knn_random.best_score_:.2f}")
print(f"Test accuracy: {accuracy_score(y_test_sub, knn_random.predict(X_test_sub_scaled)):.2f}")

# %% [markdown]
# ## Tuning Method 3: SVM with RBF Kernel
# - **Critical Parameters**: `C` (misclassification penalty), `gamma` (RBF kernel coefficient; larger values give a narrower kernel)
# - **Geoscience Use**: Optimizing mineral boundary detection in hyperspectral data

# %%
# 4 values of C crossed with 5 gamma settings (two named heuristics, three numbers)
param_grid_svm = dict(
    C=[0.1, 1, 10, 100],
    gamma=['scale', 'auto', 0.01, 0.1, 1],
)

# Exhaustive 5-fold CV accuracy search over an RBF-kernel SVM;
# fit() returns the search object itself, so it can be chained.
svm_grid = GridSearchCV(
    estimator=SVC(kernel='rbf'),
    param_grid=param_grid_svm,
    scoring='accuracy',
    cv=5,
).fit(X_train_sub_scaled, y_train_sub)

print(f"Best parameters: {svm_grid.best_params_}")
print(f"Validation accuracy: {svm_grid.best_score_:.2f}")
print(f"Test accuracy: {accuracy_score(y_test_sub, svm_grid.predict(X_test_sub_scaled)):.2f}")

# %% [markdown]
# ## Tuning Method 4: Random Forest
# - **Key Parameters**: `n_estimators` (number of trees), `max_depth` (tree complexity)
# - **Planetary Example**: Optimizing crater counting algorithms

# %%
# 3 forest sizes x 4 depth limits x 3 split thresholds
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Exhaustive 5-fold CV accuracy search, fitted on the unscaled features
# (no scaling for trees)
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=5,
).fit(X_train_sub, y_train_sub)

print(f"Best parameters: {rf_grid.best_params_}")
print(f"Validation accuracy: {rf_grid.best_score_:.2f}")
print(f"Test accuracy: {accuracy_score(y_test_sub, rf_grid.predict(X_test_sub)):.2f}")

# %% [markdown]
# ## Diagnostic Tool: Validation Curves
# Visualize parameter sensitivity for SVM's `C`

# %%
# Six log-spaced values of C from 1e-2 to 1e3
C_range = np.logspace(-2, 3, 6)
train_scores, val_scores = validation_curve(
    SVC(gamma='auto'),
    X_train_sub_scaled,
    y_train_sub,
    param_name='C',
    param_range=C_range,
    cv=5,
    scoring='accuracy'
)

# Each score array has one row per C value and one column per CV fold,
# so averaging over axis 1 gives the mean accuracy for each C.
plt.figure()
plt.semilogx(C_range, np.mean(train_scores, axis=1), label='Training')
plt.semilogx(C_range, np.mean(val_scores, axis=1), label='Validation')
# In SVC the regularization strength is inversely proportional to C,
# so the axis is labelled accordingly.
plt.xlabel('C (Inverse Regularization Strength)')
plt.ylabel('Accuracy')
plt.legend()
plt.title('SVM Complexity vs Performance')
plt.show()

# %% [markdown]
# ## Tuned Model Comparison

# %%
# Fitted search objects, keyed by the label shown in the summary table
tuned_models = dict([
    ('LR (Tuned)', lr_grid),
    ('k-NN (Tuned)', knn_random),
    ('SVM (Tuned)', svm_grid),
    ('RF (Tuned)', rf_grid),
])

results = []
for name, model in tuned_models.items():
    # The random forest was trained on unscaled features; all other models
    # were trained on the scaled ones, so each gets matching test features.
    test_features = X_test_sub if 'RF' in name else X_test_sub_scaled
    results.append((name, accuracy_score(y_test_sub, model.predict(test_features))))

# Last expression of the cell: table of test accuracies, best model first
pd.DataFrame(results, columns=["Model", "Accuracy"]).sort_values("Accuracy", ascending=False)

# %% [markdown]
# ## Key Takeaways
# - **Grid Search**: Exhaustive but computationally expensive - use for <5 parameters
# - **Randomized Search**: Efficient for large parameter spaces - better for initial exploration
# - **Validation Curves**: Diagnose under/overfitting by varying single parameters
# - **Planetary Data Considerations**:
#   - Prioritize simpler models when data is limited
#   - Always use cross-validation with small datasets
#   - Start with default parameters before tuning
# 
# **Next Steps**: Feature importance analysis, automated hyperparameter optimization (Bayesian methods).