In [12]:
# import libraries
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [13]:
# create a dataset
X, y = make_classification(n_samples=2000,
                           n_features=10,
                           n_informative=8,
                           n_redundant=2, random_state=42)

In [14]:
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
gnb = GaussianNB()

gnb.fit(X_train, y_train)

cross_gnb_score = cross_val_score(gnb, X_train, y_train, cv=5).mean()

print(f'GaussianNB Cross Validation Score: {cross_gnb_score}')

GaussianNB Cross Validation Score: 0.7492857142857143


In [16]:
lr = LogisticRegression()

lr.fit(X_train, y_train)

cross_lr_score = cross_val_score(lr, X_train, y_train, cv=5).mean()

print(f'Logistic Regression Cross Validation Score: {cross_lr_score}')

Logistic Regression Cross Validation Score: 0.7257142857142858


In [17]:
rfc = RandomForestClassifier()

rfc.fit(X_train, y_train)

cross_rfc_score = cross_val_score(rfc, X_train, y_train, cv=5).mean()

print(f'Random Forest Cross Validation Score: {cross_rfc_score}')

Random Forest Cross Validation Score: 0.875


In [18]:
# create a voting classifier
voting_clf = VotingClassifier(estimators=[('gnb', gnb), ('lr', lr), ('rfc', rfc)], voting='hard')

voting_clf.fit(X_train, y_train)

cross_voting_score = cross_val_score(voting_clf, X_train, y_train, cv=5).mean()

print(f'Voting Classifier Cross Validation Score: {cross_voting_score}')

Voting Classifier Cross Validation Score: 0.8164285714285715


In [19]:
# tuning 

param_grid = {
    'voting': ['hard', 'soft'],
    'weights': [[1, 1, 1], [2, 1, 1], [1, 2, 1], [1, 1, 2]]
}

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=voting_clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_score = grid_search.best_score_

print(f'Best Parameters: {best_params}')
print(f'Best Model: {best_model}')
print(f'Best Score: {best_score}')

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Parameters: {'voting': 'hard', 'weights': [1, 1, 2]}
Best Model: VotingClassifier(estimators=[('gnb', GaussianNB()),
                             ('lr', LogisticRegression()),
                             ('rfc', RandomForestClassifier())],
                 weights=[1, 1, 2])
Best Score: 0.837857142857143
