Implemented basic voting (hard voting) <br>
fixed the issue with predict method


In [1]:
import pandas as pd 
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import mode

In [2]:
class HyperParamClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, param_grid, random_state=None):
        
        self.param_grid = param_grid
        self.random_state = random_state
        self.estimators_ = []  # to store the trained models

    def fit(self, X, y, n_estimators=5):
        
        # Set random seed if provided
        if self.random_state is not None:
            np.random.seed(self.random_state)
        
        # Generate all possible hyperparameter combinations
        all_param_combos = list(ParameterGrid(self.param_grid))
        
        # If there are fewer combinations than needed, sample with replacement;
        # otherwise, shuffle and take the first n_estimators.
        if len(all_param_combos) < n_estimators:
            chosen_combos = [all_param_combos[i] for i in 
                             np.random.randint(0, len(all_param_combos), size=n_estimators)]
        else:
            np.random.shuffle(all_param_combos)
            chosen_combos = all_param_combos[:n_estimators]
        
        self.estimators_ = []
        for params in chosen_combos:
            model = DecisionTreeClassifier(**params)
            model.fit(X, y)
            self.estimators_.append(model)
        
        return self

    def predict(self, X):
        
        # Get predictions from each estimator
        predictions = np.array([estimator.predict(X) for estimator in self.estimators_])
        # Majority vote (mode) along the ensemble axis (axis=0)
        majority_votes = mode(predictions, axis=0).mode
        return majority_votes

    def predict_proba(self, X):
        
        # Get probability predictions from each estimator
        probas = np.array([estimator.predict_proba(X) for estimator in self.estimators_])
        # Average the probabilities over all models
        avg_probas = np.mean(probas, axis=0)
        return avg_probas


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
# Load dataset (with categorical columns)
data = pd.read_csv("kmnist.csv")
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,label
0,0,0,0,0,0,0,0,0,0,21,...,47,8,0,0,0,0,0,0,0,9
1,0,0,0,0,0,0,241,252,51,0,...,254,247,37,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
3,0,0,0,0,0,0,0,0,0,0,...,219,232,61,0,0,0,0,0,0,2
4,0,0,0,0,0,0,64,221,255,125,...,255,115,0,0,0,0,0,0,0,1


In [5]:
param_grid = {
    'max_depth': [5, 10, 15, 20], 
    'min_samples_split': [2, 5, 7, 9, 10]
}
x = data.drop("label", axis = 1)
y = data["label"]

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,stratify=y)

clf = HyperParamClassifier(param_grid, random_state=42)
clf.fit(x_train, y_train, n_estimators=3)

y_pred = clf.predict(x_test)
y_proba = clf.predict_proba(x_test)

In [6]:
print(f"{100*accuracy_score(y_test, y_pred):.2f}%")

76.03%


In [7]:
# import numpy as np
# import matplotlib.pyplot as plt
# from scipy.stats import mode

# # Simulate predictions from 3 estimators for 10 test samples
# n_estimators = 3
# n_samples = 10
# # For demonstration, let's assume a binary classification problem (classes 0 and 1)
# np.random.seed(42)
# predictions = np.random.randint(0, 2, size=(n_estimators, n_samples))
# print("Predictions from each estimator:\n", predictions)

# # Compute majority vote (mode) along axis=0
# majority_votes = mode(predictions, axis=0).mode.flatten()
# print("Ensemble's majority votes:\n", majority_votes)

# # Visualize the predictions for a specific sample (e.g., sample index 3)
# sample_index = 3
# sample_preds = predictions[:, sample_index]

# # Get counts for each predicted class for this sample
# classes, counts = np.unique(sample_preds, return_counts=True)

# # Create a bar chart showing the votes
# plt.figure(figsize=(6,4))
# plt.bar(classes, counts, color='skyblue', edgecolor='black')
# plt.xlabel("Predicted Class")
# plt.ylabel("Vote Count")
# plt.title(f"Votes for Test Sample {sample_index}\nMajority Vote: {majority_votes[sample_index]}")
# plt.xticks(classes)
# plt.show()


In [8]:
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt

# # Create a small dataset
# data = pd.DataFrame({
#     'X1': [1, 2, 3, 4],
#     'X2': [2, 3, 4, 5]
# })
# y = np.array([0, 0, 1, 1])

# print("Original Data:")
# print(data)
# print("Labels:", y)

# # Number of bootstrap samples (models) we want to generate
# n_bootstrap = 3
# n_samples = len(data)

# bootstrap_samples = []

# # Generate bootstrap samples
# for i in range(n_bootstrap):
#     indices = np.random.choice(n_samples, size=n_samples, replace=True)
#     sample = data.iloc[indices].reset_index(drop=True)
#     sample_y = y[indices]
#     bootstrap_samples.append((indices, sample, sample_y))
#     print(f"\nBootstrap sample {i+1}:")
#     print("Indices:", indices)
#     print("Sample:\n", sample)
#     print("Labels:", sample_y)

# # For demonstration, let's simulate predictions from each model for 10 test samples.
# # Suppose each model returns a prediction (0 or 1) for each test sample.
# # Here, we simulate by randomly generating predictions.
# n_test = 10
# np.random.seed(42)
# # Each model makes predictions for n_test samples; shape: (n_bootstrap, n_test)
# predictions = np.random.randint(0, 2, size=(n_bootstrap, n_test))
# print("\nPredictions from each model:")
# print(predictions)

# # Use majority voting (mode) for each test sample
# from scipy.stats import mode
# ensemble_predictions = mode(predictions, axis=0).mode.flatten()

# print("\nEnsemble predictions (by majority vote):")
# print(ensemble_predictions)

# # Visualize the predictions for one test sample, say sample index 3
# sample_index = 3
# sample_preds = predictions[:, sample_index]
# classes, counts = np.unique(sample_preds, return_counts=True)

# plt.figure(figsize=(6,4))
# plt.bar(classes, counts, color='skyblue', edgecolor='black')
# plt.xlabel("Predicted Class")
# plt.ylabel("Vote Count")
# plt.title(f"Votes for Test Sample {sample_index}\nMajority Vote: {ensemble_predictions[sample_index]}")
# plt.xticks(classes)
# plt.show()
