# 🧪 Custom Grid Search

In this notebook, we implement **Grid Search with cross-validation** from scratch as a custom class, `MyGridSearchCV`. We then compare its results with those of **scikit-learn**'s `GridSearchCV` on the same data and parameter grid.

### ⚙️ Importing Libraries & Environment Setup

In [333]:
from itertools import product
from typing import Any

import numpy as np
import pandas as pd
from numpy.typing import NDArray
from sklearn.base import BaseEstimator, clone
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import (
    BaseCrossValidator,
    GridSearchCV,
    KFold,
    StratifiedKFold,
    train_test_split,
)
from sklearn.svm import SVC

In [334]:
%matplotlib inline

# Widen pandas' console rendering so the summary table is shown in full:
# 150 characters per line, up to 100 rows, and no cap on displayed columns.
pd.options.display.width = 150
pd.options.display.max_rows = 100
pd.options.display.max_columns = None

### 📥 Loading the Dataset

In [335]:
# Load the iris dataset; return_X_y=True yields the (features, target) pair
# directly instead of a Bunch object
X, y = load_iris(return_X_y=True)

In [336]:
# Hold out 20% of the samples as a test set that the grid search never sees;
# the fixed random_state makes the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### 🧠 Implementing Custom Model Algorithms

In [None]:
class MyGridSearchCV:
    """Custom implementation of Grid Search with cross-validation.

    Exhaustively evaluates every combination of the supplied hyperparameter
    values with cross-validated accuracy, then refits a fresh copy of the
    estimator on the full data using the best combination.

    Attributes:
        estimator (BaseEstimator): Template model implementing fit and predict.
            It is never fitted itself; clones are used for all training.
        param_grid (dict[str, Any]): Hyperparameter names mapped to the iterable
            of values to try for each.
        cv (int | BaseCrossValidator): Number of folds or CV splitting strategy.
        best_estimator_ (BaseEstimator | None): Estimator fitted on full data with
            best params, or None before fit.
        best_params_ (dict[str, Any] | None): Best hyperparameter combination
            found, or None before fit.
        best_score_ (float): Best mean cross-validation accuracy score
            (``-inf`` before fit).
    """

    def __init__(
        self,
        estimator: BaseEstimator,
        param_grid: dict[str, Any],
        cv: int | BaseCrossValidator = 5,
    ) -> None:
        """Initialize MyGridSearchCV with estimator, parameter grid and CV strategy.

        Args:
            estimator (BaseEstimator): Model with fit and predict methods.
            param_grid (dict[str, Any]): Dictionary of hyperparameters to try.
            cv (int | BaseCrossValidator, optional): Number of folds or CV splitter.
                An int is turned into a shuffled ``KFold`` with a fixed seed.
                Defaults to 5.
        """
        self.estimator = estimator
        self.param_grid = param_grid
        self.cv = cv

        self.best_estimator_: BaseEstimator | None = None
        self.best_params_: dict[str, Any] | None = None
        self.best_score_: float = float("-inf")

    def fit(self, X: NDArray[np.float64], y: NDArray[np.int64]) -> None:
        """Perform grid search with cross-validation on the training data.

        Iterates over all hyperparameter combinations, scores each one by its
        mean accuracy over the CV folds, and finally refits a fresh clone of
        the estimator on all of ``X``/``y`` with the winning combination.
        On ties the first combination encountered is kept.

        Args:
            X (NDArray[np.float64]): Feature matrix of shape (n_samples, n_features).
            y (NDArray[np.int64]): Target vector of shape (n_samples,).

        Raises:
            ValueError: If the CV splitter yields no train/validation splits.
        """
        # Forget the outcome of any earlier fit so results from different
        # data or grids can never be mixed together.
        self.best_estimator_ = None
        self.best_params_ = None
        self.best_score_ = float("-inf")

        param_keys = list(self.param_grid.keys())
        param_values = list(self.param_grid.values())

        if isinstance(self.cv, int):
            cv = KFold(n_splits=self.cv, shuffle=True, random_state=42)
        else:
            cv = self.cv

        # Materialize the folds once: every candidate is then scored on the
        # exact same splits, even if the splitter shuffles without a seed.
        splits = list(cv.split(X, y))
        if not splits:
            raise ValueError("cv produced no train/validation splits")

        for values in product(*param_values):
            # Keys and values come from the same dict, so lengths must match.
            params = dict(zip(param_keys, values, strict=True))

            fold_scores = []
            for train_idx, valid_idx in splits:
                # A fresh clone per fold guarantees no state leaks between folds.
                fold_estimator = clone(self.estimator)
                fold_estimator.set_params(**params)
                fold_estimator.fit(X[train_idx], y[train_idx])

                y_pred = fold_estimator.predict(X[valid_idx])
                fold_scores.append(accuracy_score(y[valid_idx], y_pred))

            # Cast so best_score_ is a plain float, as documented.
            mean_score = float(np.mean(fold_scores))
            if mean_score > self.best_score_:
                self.best_score_ = mean_score
                self.best_params_ = params

        # Refit exactly once, on the full data, with the winning parameters.
        if self.best_params_ is not None:
            best_estimator = clone(self.estimator)
            best_estimator.set_params(**self.best_params_)
            best_estimator.fit(X, y)
            self.best_estimator_ = best_estimator

    def predict(self, X: NDArray[np.float64]) -> NDArray[np.int64]:
        """Predict target labels using the best found estimator.

        Args:
            X (NDArray[np.float64]): Feature matrix to predict.

        Raises:
            ValueError: If fit has not been called yet.

        Returns:
            NDArray[np.int64]: Predicted labels.
        """
        if self.best_estimator_ is None:
            raise ValueError("fit() must be called before predict()")
        return self.best_estimator_.predict(X)

    def score(self, X: NDArray[np.float64], y: NDArray[np.int64]) -> float:
        """Calculate the score of the best estimator on given data.

        Delegates to the best estimator's own ``score`` method.

        Args:
            X (NDArray[np.float64]): Feature matrix.
            y (NDArray[np.int64]): True target labels.

        Raises:
            ValueError: If fit has not been called yet.

        Returns:
            float: Score reported by the best estimator.
        """
        if self.best_estimator_ is None:
            raise ValueError("fit() must be called before score()")
        return self.best_estimator_.score(X, y)

### 🏋️‍♂️ Model Training

In [338]:
# Template estimator; both searches clone it, so it is never fitted itself
model = SVC()

# Values to try for each SVC hyperparameter
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=["linear", "rbf"],
    gamma=["scale", "auto"],
)

# One seeded, stratified 5-fold splitter shared by both searches so they
# are evaluated on identical folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Reference implementation: scikit-learn's GridSearchCV
sklearn_gs = GridSearchCV(model, param_grid, scoring="accuracy", cv=cv).fit(
    X_train, y_train
)

# Implementation under test: the custom grid search defined above
my_gs = MyGridSearchCV(model, param_grid, cv)
my_gs.fit(X_train, y_train)

### 📊 Comparing Algorithm Versions

In [None]:
# Fitted searches to compare, keyed by the row label used in the summary
grid_search_algorithms = {"sklearn": sklearn_gs, "custom": my_gs}

# One column per tuned hyperparameter, followed by the two scores
columns = [*param_grid, "best_cv_score", "test_score"]
performance_summary = pd.DataFrame(
    index=list(grid_search_algorithms), columns=columns
)

for label, searcher in grid_search_algorithms.items():
    # Best hyperparameters first, then CV score and held-out test score
    summary_row = {
        **searcher.best_params_,
        "best_cv_score": searcher.best_score_,
        "test_score": searcher.score(X_test, y_test),
    }
    performance_summary.loc[label] = pd.Series(summary_row)

performance_summary

Unnamed: 0,C,kernel,gamma,best_cv_score,test_score
sklearn,10,rbf,scale,0.966667,1.0
custom,10,rbf,scale,0.966667,1.0
