In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.utils import resample

In [2]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler


df = pd.read_csv("your_data.csv")

minmax_scaler = MinMaxScaler()    # For height
robust_scaler = RobustScaler()    # For hemoglobin

# Normalize each feature differently
df['height_normalized'] = minmax_scaler.fit_transform(
    df[['height(cm)']])
df['hemoglobin_normalized'] = robust_scaler.fit_transform(
    df[['hemoglobin']])

df['serum creatinine_normalized'] = robust_scaler.fit_transform(
    df[['serum creatinine']])
df['waist_normalized'] = robust_scaler.fit_transform(df[['waist(cm)']])

df['height_hemoglobin'] = (
    df['height_normalized'] + df['hemoglobin_normalized']) / 2
df['serum creatinine_waist'] = (
    df['serum creatinine_normalized'] + df['waist_normalized']) / 2


X = df[['height_hemoglobin', 'serum creatinine_normalized', 'HDL']]
y = df['smoking']

x_train, x_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp, test_size=0.5, random_state=42)

In [3]:
class Bagging(BaseEstimator, ClassifierMixin):
    def __init__(self, base_model=DecisionTreeClassifier(), n_estimators=100, max_samples=0.8, max_features=0.5):
        self.base_model = base_model
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features

    def bootstrap(self, X, y):
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, int(
            self.max_samples * n_samples), replace=True)
        return X.iloc[indices], y.iloc[indices]

    def fit(self, X, y):
        self.classes_ = np.unique(y)
        self.models_ = []
        for _ in range(self.n_estimators):
            X_sample, y_sample = self.bootstrap(X, y)
            model = self.base_model
            model.fit(X_sample, y_sample)
            self.models_.append(model)
        return self

    def predict(self, X):
        predictions = np.zeros((self.n_estimators, X.shape[0]))
        for i, model in enumerate(self.models_):
            predictions[i] = model.predict(X)

        # Majority vote
        final_predictions = np.apply_along_axis(
            lambda x: np.bincount(x.astype(int)).argmax(), axis=0, arr=predictions
        )
        return final_predictions

    def get_params(self, deep=True):
        """Return hyperparameters for tuning."""
        return {"base_model": self.base_model, "n_estimators": self.n_estimators, "max_samples": self.max_samples, "max_features": self.max_features}

    def set_params(self, **params):
        """Set hyperparameters for tuning."""
        for key, value in params.items():
            setattr(self, key, value)
        return self

In [21]:
class Boosting (BaseEstimator, ClassifierMixin):
    def __init__(self, model, n):
        self.model = model
        self.n = n
        self.models = []
        self.model_weights = []

    def fit(self, X, y):
        self.classes_ = np.unique(y)
        self.models = []
        self.model_weights = []
        sample_weights = np.ones(len(y)) / len(y)

        for _ in range(self.n):
            model = self.model()
            model.fit(X, y, sample_weight=sample_weights)
            predictions = model.predict(X)

            error = np.sum(sample_weights * (predictions != y)
                           ) / np.sum(sample_weights)

            if error == 0:
                self.models.append(model)
                self.model_weights.append(1)
                break

            model_weight = 0.5 * np.log((1 - error) / error)
            self.models.append(model)
            self.model_weights.append(model_weight)

            sample_weights = sample_weights * \
                np.exp(-model_weight * y * predictions)
            sample_weights /= np.sum(sample_weights)  # Normalize the weights
        return self

    def predict(self, X):
        weighted_predictions = np.zeros(len(X))
        for model, weight in zip(self.models, self.model_weights):
            weighted_predictions += weight * model.predict(X)
        # Return the sign of the weighted sum
        return np.sign(weighted_predictions)

    def get_params(self, deep=True):
        """Return hyperparameters for tuning."""
        return {"model": self.model, "n": self.n}

    def set_params(self, **params):
        """Set hyperparameters for tuning."""
        for key, value in params.items():
            setattr(self, key, value)
        return self

In [27]:
class RandomForest (BaseEstimator, ClassifierMixin):
    def __init__(self, n_estimators=100, max_features='auto', random_state=None):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.random_state = random_state
        self.trees = []

    def _bootstrap_sample(self, X, y):
        return resample(X, y, random_state=self.random_state)

    def _select_features(self, X):
        if self.max_features == 'auto':
            max_features = int(np.sqrt(X.shape[1]))
        elif isinstance(self.max_features, int):
            max_features = self.max_features
        else:
            max_features = X.shape[1]

        features = np.random.choice(X.shape[1], max_features, replace=False)
        return features

    def fit(self, X, y):
        self.classes_ = np.unique(y)
        np.random.seed(self.random_state)
        self.trees = []

        for _ in range(self.n_estimators):
            X_sample, y_sample = self._bootstrap_sample(X, y)

            features = self._select_features(X_sample)

            tree = DecisionTreeClassifier(max_features=len(
                features), random_state=self.random_state)
            tree.fit(X_sample.iloc[:, features].values, y_sample)

            self.trees.append((tree, features))
        return self

    def predict(self, X):
        predictions = np.zeros((X.shape[0], self.n_estimators), dtype=int)

        for i, (tree, features) in enumerate(self.trees):
            # Fix: Use .iloc if X is a DataFrame
            model_predictions = tree.predict(X.iloc[:, features].values) if hasattr(
                X, 'iloc') else tree.predict(X[:, features])

            if model_predictions.dtype != int:
                model_predictions = np.round(model_predictions).astype(int)

            predictions[:, i] = model_predictions

        # Use majority voting
        majority_vote = np.apply_along_axis(
            lambda x: np.bincount(x).argmax(), axis=1, arr=predictions)
        return majority_vote

    def get_params(self, deep=True):
        """Return hyperparameters for tuning."""
        return {"n_estimators": self.n_estimators, "max_features": self.max_features, "random_state": self.random_state}

    def set_params(self, **params):
        """Set hyperparameters for tuning."""
        for key, value in params.items():
            setattr(self, key, value)
        return self

In [6]:
bagging_model = Bagging(
    base_model=DecisionTreeClassifier(),
    n_estimators=50,
    max_samples=0.7,
    max_features=0.6
)

boosting_model = Boosting(DecisionTreeClassifier, n=50)

rf_classifier = RandomForest(
    n_estimators=100, max_features='sqrt', random_state=42)

bagging_model.fit(x_train, y_train)
boosting_model.fit(x_train, y_train)
bagging_predictions = bagging_model.predict(x_test)
boosting_predictions = boosting_model.predict(x_test)
bagging_accuracy = accuracy_score(y_test, bagging_predictions)
boosting_accuracy = accuracy_score(y_test, boosting_predictions)
print(f"Bagging Model Accuracy: {bagging_accuracy:.4f}")
print(f"Boosting Model Accuracy: {boosting_accuracy:.4f}")
rf_classifier.fit(x_train, y_train)
rf_predictions = rf_classifier.predict(x_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")

Bagging Model Accuracy: 0.6490
Boosting Model Accuracy: 0.6533
Random Forest Accuracy: 0.6526


In [9]:
# Bagging hyperparameter tuning
bagging_param_grid = {
    'n': [5, 10, 20],
    'base_model': [DecisionTreeClassifier(max_depth=d) for d in [5, 10, None]],
    'max_samples': [0.5, 0.7, 0.9],
    'max_features': [0.5, 0.7, 0.9]
}

bagging_random_search = RandomizedSearchCV(
    estimator=Bagging(),
    param_distributions=bagging_param_grid,
    n_iter=10,
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)
bagging_random_search.fit(x_train, y_train)

bagging_random_search.fit(x_train, y_train)
best_bagging_model = bagging_random_search.best_estimator_
bagging_best_accuracy = bagging_random_search.best_score_

In [26]:
boosting_param_grid = {
    'n': [10, 20, 30]  # Number of boosting rounds
}

boosting_grid_search = GridSearchCV(
    estimator=Boosting(DecisionTreeClassifier, n=50),
    param_grid=boosting_param_grid,
    scoring='accuracy',
    cv=5,
)
boosting_grid_search.fit(x_train, y_train)
best_boosting_model = boosting_grid_search.best_estimator_
boosting_best_accuracy = boosting_grid_search.best_score_

In [30]:
rf_param_grid = {
    'n_estimators': [5, 10, 20],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [2, 10, 20],
}

rf_random_search = RandomizedSearchCV(
    estimator=RandomForest(random_state=42),
    param_distributions=rf_param_grid,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
rf_random_search.fit(x_train, y_train)
best_rf_model = rf_random_search.best_estimator_
rf_best_accuracy = rf_random_search.best_score_

  _data = np.array(data, dtype=dtype, copy=copy,


In [31]:
print(f"Best Bagging Model Accuracy: {bagging_best_accuracy:.4f}")
print(f"Best Boosting Model Accuracy: {boosting_best_accuracy:.4f}")
print(f"Best Random Forest Accuracy: {rf_best_accuracy:.4f}")

Best Bagging Model Accuracy: 0.7211
Best Boosting Model Accuracy: 0.6569
Best Random Forest Accuracy: 0.6555


In [32]:
# Evaluate the best models on the test set
bagging_final_predictions = best_bagging_model.predict(x_test)
boosting_final_predictions = best_boosting_model.predict(x_test)
rf_final_predictions = best_rf_model.predict(x_test)

# Compute accuracies
bagging_final_accuracy = accuracy_score(y_test, bagging_final_predictions)
boosting_final_accuracy = accuracy_score(y_test, boosting_final_predictions)
rf_final_accuracy = accuracy_score(y_test, rf_final_predictions)

# Display final accuracies
print(f"Final Bagging Model Test Accuracy: {bagging_final_accuracy:.4f}")
print(f"Final Boosting Model Test Accuracy: {boosting_final_accuracy:.4f}")
print(f"Final Random Forest Test Accuracy: {rf_final_accuracy:.4f}")

Final Bagging Model Test Accuracy: 0.7217
Final Boosting Model Test Accuracy: 0.6603
Final Random Forest Test Accuracy: 0.6526


In [33]:
if bagging_final_accuracy >= boosting_final_accuracy and bagging_final_accuracy >= rf_final_accuracy:
    best_model = best_bagging_model
    print("Selected Model: Bagging")
elif boosting_final_accuracy >= bagging_final_accuracy and boosting_final_accuracy >= rf_final_accuracy:
    best_model = best_boosting_model
    print("Selected Model: Boosting")
else:
    best_model = best_rf_model
    print("Selected Model: Random Forest")

# Save or use the best model
print(f"The best model is {best_model} with test accuracy of {
      max(bagging_final_accuracy, boosting_final_accuracy, rf_final_accuracy):.4f}")

Selected Model: Bagging
The best model is Bagging(base_model=DecisionTreeClassifier(max_depth=5), max_features=0.9,
        max_samples=0.7) with test accuracy of 0.7217
