# 🧪 Custom Random Search

In this notebook, we implement **randomized search with cross-validation** from scratch as a custom class, `MyRandomizedSearchCV`. We then compare its results with **scikit-learn**'s `RandomizedSearchCV` on the same data, parameter space, and cross-validation splits.

### ⚙️ Importing Libraries & Environment Setup

In [None]:
import warnings
from typing import Any

import numpy as np
import pandas as pd
from numpy.typing import NDArray
from sklearn.base import BaseEstimator, clone
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import (
    BaseCrossValidator,
    KFold,
    RandomizedSearchCV,
    StratifiedKFold,
    train_test_split,
)
from sklearn.svm import SVC

In [2]:
%matplotlib inline

# Widen pandas' console rendering so the summary table is shown in full
display_options = {
    "display.width": 150,
    "display.max_rows": 100,
    "display.max_columns": None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

### 📥 Loading the Dataset

In [3]:
# Fetch the iris dataset and pull out the feature matrix and label vector
iris = load_iris()
X, y = iris.data, iris.target

In [4]:
# Hold out 20% of the samples for testing; the seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

### 🧠 Implementing Custom Model Algorithms

In [None]:
class MyRandomizedSearchCV:
    """Randomized search with cross-validation for hyperparameter optimization.

    Draws distinct hyperparameter combinations at random from the given
    candidate lists, scores each one by mean accuracy over the
    cross-validation folds, and refits the best combination on the full
    training data.

    Attributes:
        estimator (BaseEstimator): The base model implementing fit and predict.
            It is never fitted itself; fresh clones are fitted instead.
        param_distributions (dict[str, list[Any]]): Hyperparameters and their
            candidate values.
        n_iter (int): Number of random hyperparameter combinations to sample.
        cv (int | BaseCrossValidator): Number of folds or cross-validation strategy.
        rng (np.random.Generator): Random generator used to draw combinations.
        best_estimator_ (BaseEstimator | None): Estimator fitted on full data with
            best params.
        best_params_ (dict[str, Any] | None): Best hyperparameter combination found.
        best_score_ (float): Highest mean cross-validation accuracy score achieved.
    """

    def __init__(
        self,
        estimator: BaseEstimator,
        param_distributions: dict[str, list[Any]],
        n_iter: int,
        cv: int | BaseCrossValidator = 5,
        random_state: int | None = None,
    ) -> None:
        """Initialize the randomized search object.

        Args:
            estimator (BaseEstimator): Base model with fit and predict methods.
            param_distributions (dict[str, list[Any]]): Dictionary specifying
                hyperparameters and the list of values to sample from.
            n_iter (int): Number of random parameter combinations to evaluate.
            cv (int | BaseCrossValidator, optional): Number of folds or CV splitter.
                An integer builds a shuffled KFold with a fixed seed.
                Defaults to 5.
            random_state (int | None, optional): Random seed for reproducibility.
                Defaults to None.
        """
        self.estimator = estimator
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        self.cv = cv
        self.rng = np.random.default_rng(random_state)

        self.best_estimator_: BaseEstimator | None = None
        self.best_params_: dict[str, Any] | None = None
        self.best_score_: float = float("-inf")

    def _sample_candidates(self) -> list[dict[str, Any]]:
        """Draw distinct hyperparameter combinations without replacement.

        Each combination is identified by a flat index into the Cartesian
        product of the candidate lists. Sampling indices (rather than values)
        guarantees termination and returns the original Python objects
        instead of values coerced through a NumPy array.

        Raises:
            ValueError: If n_iter is smaller than 1 or a parameter has no
                candidate values.

        Returns:
            list[dict[str, Any]]: The sampled parameter combinations.
        """
        if self.n_iter < 1:
            raise ValueError(f"n_iter must be at least 1, got {self.n_iter}")

        names = list(self.param_distributions)
        values = [list(self.param_distributions[name]) for name in names]

        grid_size = 1
        for name, candidates in zip(names, values):
            if not candidates:
                raise ValueError(f"No candidate values given for parameter {name!r}")
            grid_size *= len(candidates)

        n_candidates = self.n_iter
        if n_candidates > grid_size:
            # There are not enough distinct combinations to honour n_iter;
            # evaluate the whole grid instead of looping forever.
            warnings.warn(
                f"The parameter space contains only {grid_size} combinations, "
                f"which is fewer than n_iter={self.n_iter}. "
                f"Evaluating all {grid_size} combinations.",
                UserWarning,
                stacklevel=3,
            )
            n_candidates = grid_size

        flat_indices = self.rng.choice(grid_size, size=n_candidates, replace=False)

        sampled = []
        for flat_index in flat_indices:
            # Decode the flat index as a mixed-radix number, one digit per parameter
            remainder = int(flat_index)
            params = {}
            for name, candidates in zip(names, values):
                remainder, position = divmod(remainder, len(candidates))
                params[name] = candidates[position]
            sampled.append(params)
        return sampled

    def fit(self, X: NDArray[np.float64], y: NDArray[np.int64 | np.float64]) -> None:
        """Run randomized search with cross-validation on the training data.

        Samples random hyperparameter combinations, evaluates each via CV,
        and stores the best model and corresponding parameters. Results of
        any previous call are discarded.

        Args:
            X (NDArray[np.float64]): Feature matrix of shape (n_samples, n_features).
            y (NDArray[np.int64 | np.float64]): Target vector of shape (n_samples,).

        Raises:
            ValueError: If n_iter is smaller than 1 or a parameter has no
                candidate values.
        """
        if isinstance(self.cv, int):
            cv = KFold(n_splits=self.cv, shuffle=True, random_state=42)
        else:
            cv = self.cv

        candidates = self._sample_candidates()

        # Start from a clean slate so repeated fit() calls do not compare
        # against the best score of an earlier run.
        self.best_estimator_ = None
        self.best_params_ = None
        self.best_score_ = float("-inf")

        # Generate the folds once so every candidate is scored on identical
        # splits, even when the splitter shuffles without a fixed seed.
        folds = list(cv.split(X, y))

        for params in candidates:
            fold_scores = []
            for train_idx, valid_idx in folds:
                # A fresh clone per fold keeps the fits independent of each other
                fold_estimator = clone(self.estimator)
                fold_estimator.set_params(**params)
                fold_estimator.fit(X[train_idx], y[train_idx])

                y_pred = fold_estimator.predict(X[valid_idx])
                fold_scores.append(accuracy_score(y[valid_idx], y_pred))

            mean_score = float(np.mean(fold_scores))
            if mean_score > self.best_score_:
                self.best_score_ = mean_score
                self.best_params_ = params

        # Refit a single time, on all the data, with the winning combination
        best_estimator = clone(self.estimator)
        best_estimator.set_params(**self.best_params_)
        best_estimator.fit(X, y)
        self.best_estimator_ = best_estimator

    def predict(self, X: NDArray[np.float64]) -> NDArray[np.int64 | np.float64]:
        """Predict target values using the best found estimator.

        Args:
            X (NDArray[np.float64]): Feature matrix for prediction.

        Raises:
            ValueError: If fit() has not been called yet.

        Returns:
            NDArray[np.int64 | np.float64]: Predicted target values.
        """
        if self.best_estimator_ is None:
            raise ValueError("fit() must be called before predict()")
        return self.best_estimator_.predict(X)

    def score(self, X: NDArray[np.float64], y: NDArray[np.int64 | np.float64]) -> float:
        """Score the best estimator on the given data.

        Delegates to the best estimator's own score() method, which is mean
        accuracy for scikit-learn classifiers.

        Args:
            X (NDArray[np.float64]): Feature matrix.
            y (NDArray[np.int64 | np.float64]): True target values.

        Raises:
            ValueError: If fit() has not been called yet.

        Returns:
            float: Score reported by the best estimator.
        """
        if self.best_estimator_ is None:
            raise ValueError("fit() must be called before score()")
        return self.best_estimator_.score(X, y)

### 🏋️‍♂️ Model Training

In [None]:
# Estimator whose hyperparameters both searches will tune
model = SVC()

# Candidate values for each SVC hyperparameter
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=["linear", "rbf"],
    gamma=["scale", "auto"],
)

# Shared splitter: shuffled, stratified 5-fold with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Settings that are identical for the two search implementations
search_kwargs = {
    "estimator": model,
    "param_distributions": param_grid,
    "n_iter": 6,
    "cv": cv,
    "random_state": 42,
}

# Reference implementation from scikit-learn, scored on accuracy
sklearn_rs = RandomizedSearchCV(scoring="accuracy", **search_kwargs)
sklearn_rs.fit(X_train, y_train)

# Custom implementation defined in this notebook
my_rs = MyRandomizedSearchCV(**search_kwargs)
my_rs.fit(X_train, y_train)

### 📊 Comparing Algorithm Versions

In [None]:
# Fitted search objects, keyed by the row label used in the summary table
random_search_algorithms = {"sklearn": sklearn_rs, "custom": my_rs}

# One column per tuned hyperparameter, followed by the two scores
columns = [*param_grid, "best_cv_score", "test_score"]
performance_summary = pd.DataFrame(
    index=list(random_search_algorithms), columns=columns
)

for label, searcher in random_search_algorithms.items():
    # Best hyperparameters plus the CV score and the held-out test score
    summary_row = {
        **searcher.best_params_,
        "best_cv_score": searcher.best_score_,
        "test_score": searcher.score(X_test, y_test),
    }
    performance_summary.loc[label] = pd.Series(summary_row)

performance_summary

Unnamed: 0,C,kernel,gamma,best_cv_score,test_score
sklearn,10.0,rbf,scale,0.966667,1.0
custom,10.0,rbf,scale,0.966667,1.0
