Including a data pre-processing method <br>
Class inputs changed to whole dataset instead of x_train and y_train <br>
<p>
Preprocessing includes checking for nan-values
filling nans with imputer
checking for 

In [1]:
import pandas as pd 
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import mode

pre-process method <br>
parameters - dataset, target name, test_split = 0.2 (default), sampling = False,
pipeline:
missing values<br>
train test split <br>



In [None]:
class HyperParamClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, param_grid, random_state=None, voting='hard'):
        
        self.param_grid = param_grid
        self.random_state = random_state
        self.voting = voting
        self.estimators_ = []  # to store the trained models

    def _bootstrap_sample(self, X, y):
        
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, size=n_samples, replace=True)
        return X.iloc[indices], y.iloc[indices]

    def fit(self, X, y, n_estimators=5):
        
        # Set random seed if provided for reproducibility.
        if self.random_state is not None:
            np.random.seed(self.random_state)
        
        # Generate all possible hyperparameter combinations.
        all_param_combos = list(ParameterGrid(self.param_grid))
        
        # If there are fewer combinations than needed, sample with replacement;
        # otherwise, shuffle and take the first n_estimators.
        if len(all_param_combos) < n_estimators:
            chosen_combos = [all_param_combos[i] for i in 
                             np.random.randint(0, len(all_param_combos), size=n_estimators)]
        else:
            np.random.shuffle(all_param_combos)
            chosen_combos = all_param_combos[:n_estimators]
        
        self.estimators_ = []
        for params in chosen_combos:
            # Get a bootstrap sample from the data.
            X_sample, y_sample = self._bootstrap_sample(X, y)
            # Initialize and train the decision tree on the bootstrap sample.
            model = DecisionTreeClassifier(**params)
            model.fit(X_sample, y_sample)
            self.estimators_.append(model)
        
        return self

    def predict(self, X):
        if self.voting == 'hard':
            # Get predictions from each estimator (shape: [n_estimators, n_samples])
            predictions = np.array([estimator.predict(X) for estimator in self.estimators_])
            # Majority vote: compute the mode along the estimator axis and flatten to 1D.
            majority_votes = mode(predictions, axis=0).mode.flatten()
            return majority_votes
        elif self.voting == 'soft':
            # For soft voting, average predicted probabilities and take argmax.
            avg_probas = self.predict_proba(X)
            return np.argmax(avg_probas, axis=1)
        else:
            raise ValueError("voting parameter must be either 'hard' or 'soft'.")

    def predict_proba(self, X):
        
        probas = np.array([estimator.predict_proba(X) for estimator in self.estimators_])
        avg_probas = np.mean(probas, axis=0)
        return avg_probas


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [8]:
# Load dataset (with categorical columns)
data = pd.read_csv("kmnist.csv")
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,label
0,0,0,0,0,0,0,0,0,0,21,...,47,8,0,0,0,0,0,0,0,9
1,0,0,0,0,0,0,241,252,51,0,...,254,247,37,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
3,0,0,0,0,0,0,0,0,0,0,...,219,232,61,0,0,0,0,0,0,2
4,0,0,0,0,0,0,64,221,255,125,...,255,115,0,0,0,0,0,0,0,1


In [9]:
param_grid = {
    'max_depth': [5, 10, 15, 20], 
    'min_samples_split': [2, 5, 7, 9, 10]
}
x = data.drop("label", axis = 1)
y = data["label"]

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,stratify=y)

clf = HyperParamClassifier(param_grid, random_state=42, voting='hard')
clf.fit(x_train, y_train, n_estimators=5)

y_pred = clf.predict(x_test)
y_proba = clf.predict_proba(x_test)

In [10]:
print(f"{100*accuracy_score(y_test, y_pred):.2f}%")

74.14%
