In [1]:
import pandas as pd 
import numpy as np

In [2]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import mode

In [13]:
class HyperParamClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self, param_grid):
        self.param_grid = param_grid
        self.model = None  # Placeholder for the trained model



    def fit(self, data, target_column, n_estimators=5, test_size=0.2, random_state=42):
        # Step 1: Split dataset into features (X) and target (y)
        X = data.drop(columns=[target_column])
        y = data[target_column]

        # Step 2: Split into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Step 3: Identify categorical columns
        self.categorical_cols_ = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

        # Step 4: Fit One-Hot Encoder on training data only
        if self.categorical_cols_:
            self.encoder_ = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
            X_train_encoded = self.encoder_.fit_transform(X_train[self.categorical_cols_])
            
            # Convert to DataFrame
            encoded_feature_names = self.encoder_.get_feature_names_out(self.categorical_cols_)
            X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_feature_names, index=X_train.index)

            # Drop categorical columns and merge encoded features
            X_train = X_train.drop(columns=self.categorical_cols_).reset_index(drop=True)
            X_train_encoded_df = X_train_encoded_df.reset_index(drop=True)
            X_train = pd.concat([X_train, X_train_encoded_df], axis=1)

        # Step 5: Transform X_test using the SAME encoder
        if self.categorical_cols_:
            X_test_encoded = self.encoder_.transform(X_test[self.categorical_cols_])
            X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_feature_names, index=X_test.index)

            # Drop categorical columns and merge encoded features
            X_test = X_test.drop(columns=self.categorical_cols_, errors="ignore").reset_index(drop=True)
            X_test_encoded_df = X_test_encoded_df.reset_index(drop=True)
            X_test = pd.concat([X_test, X_test_encoded_df], axis=1)

        # Step 6: Ensure X_train and X_test have the SAME columns (avoid shape mismatch)
        missing_cols = set(X_train.columns) - set(X_test.columns)
        for col in missing_cols:
            X_test[col] = 0  # Add missing columns in X_test

        # Step 7: Fill any remaining NaN values (important to avoid errors)
        X_train.fillna(0, inplace=True)
        X_test.fillna(0, inplace=True)

        # Reorder columns in X_test to match X_train
        X_test = X_test[X_train.columns]

        # Step 8: Generate hyperparameter combinations and train multiple Decision Trees
        all_param_combos = list(ParameterGrid(self.param_grid))
        np.random.shuffle(all_param_combos)
        chosen_combos = all_param_combos[:n_estimators]

        self.estimators_ = []
        for params in chosen_combos:
            model = DecisionTreeClassifier(**params)
            model.fit(X_train, y_train)
            self.estimators_.append(model)

        # Store test data for future evaluation
        self.X_test_ = X_test
        self.y_test_ = y_test

        return self






    def predict(self, X):
    
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # ✅ Check if categorical columns exist before encoding
        existing_categorical_cols = [col for col in self.categorical_cols_ if col in X.columns]

        if existing_categorical_cols:
            # Apply One-Hot Encoding only if categorical columns still exist
            X_encoded = self.encoder_.transform(X[existing_categorical_cols])
            encoded_feature_names = self.encoder_.get_feature_names_out(existing_categorical_cols)
            X_encoded_df = pd.DataFrame(X_encoded, columns=encoded_feature_names, index=X.index)

            # Drop original categorical columns and merge encoded features
            X = X.drop(columns=existing_categorical_cols, errors="ignore").reset_index(drop=True)
            X_encoded_df = X_encoded_df.reset_index(drop=True)
            X = pd.concat([X, X_encoded_df], axis=1)

        # ✅ Ensure X has the same features as during training
        missing_cols = set(self.estimators_[0].feature_names_in_) - set(X.columns)
        for col in missing_cols:
            X[col] = 0  # Add missing columns with 0 values

        # Reorder columns to match training data
        X = X[self.estimators_[0].feature_names_in_]

        # ✅ Get predictions from all models
        predictions = np.array([estimator.predict(X) for estimator in self.estimators_])

        # ✅ Majority voting with correct shape
        final_predictions = mode(predictions, axis=0).mode
        return np.ravel(final_predictions)





    def predict_proba(self, X):
        # Convert X to a DataFrame if it's not already one
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # Apply One-Hot Encoding to categorical columns (same as in fit)
        if self.categorical_cols_:
            X_encoded = self.encoder_.transform(X[self.categorical_cols_])
            
            # Convert to DataFrame
            encoded_feature_names = self.encoder_.get_feature_names_out(self.categorical_cols_)
            X_encoded_df = pd.DataFrame(X_encoded, columns=encoded_feature_names, index=X.index)

            # Drop original categorical columns and concatenate encoded columns
            X = X.drop(columns=self.categorical_cols_).reset_index(drop=True)
            X_encoded_df = X_encoded_df.reset_index(drop=True)
            X = pd.concat([X, X_encoded_df], axis=1)

        # Get predicted probabilities from all trained models
        probas = np.array([estimator.predict_proba(X) for estimator in self.estimators_])

        # Average the predicted probabilities across all models
        final_probas = np.mean(probas, axis=0)
        
        return final_probas

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [15]:
# Load dataset (with categorical columns)
data = pd.read_csv("titanic.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
import pandas as pd
from sklearn.metrics import accuracy_score

# ✅ Define hyperparameter grid
param_grid = {
    'max_depth': [5, 10], 
    'min_samples_split': [2, 5]
}

# ✅ Initialize classifier
clf = HyperParamClassifier(param_grid)

# ✅ Train the classifier with One-Hot Encoding applied
clf.fit(data, target_column='Survived', n_estimators=3)

# ✅ Compute ensemble accuracy using `clf.predict()`
y_pred_ensemble = clf.predict(clf.X_test_)  # Use stored test data
accuracy_ensemble = accuracy_score(clf.y_test_, y_pred_ensemble)

# ✅ Print results
print("\n🎯 Model Performance 🎯")
print(f"✅ Accuracy on Test Set (Majority Voting): {accuracy_ensemble:.4f}")

# ✅ Print hyperparameters & accuracy of each trained estimator
print("\n🔹 Hyperparameters & Accuracy for Each Estimator:")
for i, model in enumerate(clf.estimators_):
    print(f"\n🔹 Model {i+1} hyperparameters: {model.get_params()}")

    # ✅ Ensure only existing categorical columns are used
    existing_categorical_cols = [col for col in clf.categorical_cols_ if col in clf.X_test_.columns]

    if existing_categorical_cols:
        X_test_transformed = pd.DataFrame(
            clf.encoder_.transform(clf.X_test_[existing_categorical_cols]),
            columns=clf.encoder_.get_feature_names_out(existing_categorical_cols),
            index=clf.X_test_.index
        )

        # Drop original categorical columns and merge encoded features
        X_test_final = clf.X_test_.drop(columns=existing_categorical_cols, errors="ignore").reset_index(drop=True)
        X_test_final = pd.concat([X_test_final, X_test_transformed.reset_index(drop=True)], axis=1)
    else:
        X_test_final = clf.X_test_

    # ✅ Make predictions for **this individual model**
    y_pred = model.predict(X_test_final)

    # ✅ Compute accuracy for the **individual model**
    accuracy = accuracy_score(clf.y_test_, y_pred)

    print(f"📌 Model {i+1} Accuracy: {accuracy:.4f}")



🎯 Model Performance 🎯
✅ Accuracy on Test Set (Majority Voting): 0.8212

🔹 Hyperparameters & Accuracy for Each Estimator:

🔹 Model 1 hyperparameters: {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}
📌 Model 1 Accuracy: 0.8212

🔹 Model 2 hyperparameters: {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}
📌 Model 2 Accuracy: 0.8045

🔹 Model 3 hyperparameters: {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_dec