In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import accuracy_score

In [None]:
# Synthetic classification data: 2000 samples with 10 features,
# of which 8 are informative and 2 are redundant. Seeded for repeatability.
X, y = make_classification(
    n_samples=2000,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    random_state=42,
)

In [None]:
# Hold out 30% of the samples as a test set (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Base models that are later combined into the voting ensemble.
gnb = GaussianNB()
lr = LogisticRegression()
# Seed the forest so scores are reproducible across Restart & Run All,
# matching the seed already used for the dataset and the train/test split.
rfc = RandomForestClassifier(random_state=42)

In [None]:
# Fit each base model on the training split, announcing it first.
estimators = [gnb, lr, rfc]
for clf in estimators:
    print("Training the", clf)
    clf.fit(X_train, y_train)

In [None]:
# Test-set score of every fitted base model, in the order of `estimators`.
[clf.score(X_test, y_test) for clf in estimators]

In [None]:
# Create a hard-voting ensemble from the three base models.
# The (name, estimator) pairs get their own name so the plain `estimators`
# list of models built in the training cell is not overwritten; rebinding it
# to tuples would break re-running the per-model scoring cell.
named_estimators = [('gnb', gnb), ('lr', lr), ('rfc', rfc)]
voting_clf = VotingClassifier(estimators=named_estimators, voting='hard')
# Fit on the training split so the fitted sub-estimators can be inspected later.
voting_clf.fit(X_train, y_train)
# Mean score over 5-fold cross-validation on the training split.
cross_voting_score = cross_val_score(voting_clf, X_train, y_train, cv=5).mean()
print(f'Voting Classifier Cross Validation Score: {cross_voting_score}')

In [None]:
# Tuning: search over the voting mode and the per-estimator weights
# (three weights, matching the three estimators in the ensemble).
param_grid = dict(
    voting=['hard', 'soft'],
    weights=[[1, 1, 1], [2, 1, 1], [1, 2, 1], [1, 1, 2]],
)

# 5-fold grid search scored by accuracy, using all available cores.
grid_search = GridSearchCV(
    voting_clf,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=0,
    return_train_score=False,
)
grid_search.fit(X_train, y_train)

# Pull out the winning configuration for reporting.
best_params, best_model, best_score = (
    grid_search.best_params_,
    grid_search.best_estimator_,
    grid_search.best_score_,
)

print(f'Best Parameters: {best_params}')
print(f'Best Model: {best_model}')
print(f'Best Score: {best_score}')

In [None]:
# Remove the poorly performing LogisticRegression model.
# Rebuild the ensemble from the remaining (name, estimator) pairs instead of
# passing None to set_params (deprecated by scikit-learn in favour of 'drop')
# and deleting entries from the fitted `estimators_` list by hand.
# Filtering by name keeps this cell idempotent: re-running it is a no-op
# rather than an error or the accidental removal of another model.
kept_estimators = [
    (name, model) for name, model in voting_clf.estimators if name != 'lr'
]
voting_clf = VotingClassifier(estimators=kept_estimators, voting=voting_clf.voting)
# Refit so that `estimators_` is consistent with the reduced estimator list.
voting_clf.fit(X_train, y_train)
voting_clf.estimators_

In [None]:
# Display the (name, estimator) pairs currently configured on the ensemble.
voting_clf.estimators

In [None]:
# Tuning again, now with two weights per candidate
# (the ensemble has two estimators left at this point).
param_grid = {'voting': ['hard', 'soft'],
              'weights': [[1, 1], [2, 1], [1, 2]]}

# Same search settings as before: 5-fold CV, accuracy scoring, all cores.
grid_search = GridSearchCV(voting_clf, param_grid,
                           scoring='accuracy', cv=5, n_jobs=-1,
                           verbose=0, return_train_score=False).fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_score = grid_search.best_score_

# One print call, one line per item.
print(f'Best Parameters: {best_params}',
      f'Best Model: {best_model}',
      f'Best Score: {best_score}',
      sep='\n')

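- The best score improved after removing LogisticRegression -> **0.87** (greater than the 0.842 achieved by RandomForestClassifier on its own; note that 0.87 is a cross-validation score on the training split, while 0.842 is a test-set score, so the two are not strictly comparable)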
- Average score of all models [0.75, 0.7, 0.842] = **0.765**
- Average score of best models [0.75, 0.842] = **0.7958**
- So the relative improvement in average accuracy is (0.7958 - 0.765) / 0.765 * 100 = **4.04%**. It's a good improvement!