# Comparação de ensembles - Dataset: Spambase

[Dataset (OpenML)](https://www.openml.org/search?type=data&sort=runs&status=active&qualities.NumberOfClasses=%3D_2&id=44)

O dataset é de **classificação binária**, portanto conta com somente **um valor de intercept** e o número de valores de **coef** é igual ao número de features. Essa informação foi retirada da [documentação do scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) da classe LogisticRegression.



## Carregamento do dataset

In [1]:
from sklearn.datasets import fetch_openml

dataset = fetch_openml(data_id=44)

## Metadados do dataset

In [2]:
dataset.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [3]:
from pandas import DataFrame, Series

features: DataFrame = dataset['data']
target: Series = dataset['target']
features.shape, target.shape

((4601, 57), (4601,))

In [4]:
import pandas

X, y = dataset.data, pandas.to_numeric(dataset.target)

## Configurar seed e número de jobs em paralelo

In [5]:
import random

seed = 42
max_parallel_jobs = 10
random.seed(seed)
run_grid_search = True

## Otimizar classificadores

Para evitar uma **explosão de combinações** de parâmetros por combinar os classificadores e ter que esperar horas para otimização de parâmetros, optei por usar o grid search para encontrar a melhor configuração para **cada classificador isolado** e por fim unificar a melhor configuração de cada na construção do ensemble.

In [6]:
import warnings

warnings.filterwarnings('ignore')

### Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier
from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Melhor: {'max_depth': 7, 'max_leaf_nodes': 21, 'min_samples_leaf': 11} 0.903497379974508
grid_params = {
    'max_depth': list(range(1, 20, 2)),
    'min_samples_leaf': list(range(1, 50, 5)),
    'max_leaf_nodes': list(range(1, 50, 5)),
}

if run_grid_search:
    tree_classifier = DecisionTreeClassifier(random_state=seed)
    grid_search = GridSearchCV(tree_classifier, param_grid=grid_params, n_jobs=max_parallel_jobs)
    grid_search.fit(X, y)
    cross_val = mean(cross_val_score(grid_search, X, y))
    print(cross_val, grid_search.best_params_, grid_search.best_score_)

0.8969721002690838 {'max_depth': 7, 'max_leaf_nodes': 21, 'min_samples_leaf': 11} 0.903497379974508


### KNN

In [8]:
from sklearn.neighbors import KNeighborsClassifier

# Melhor: {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 11, 'p': 1, 'weights': 'distance'} 0.8189581267997923
grid_params = {
    'n_neighbors': list(range(1, 30, 2)),
    'p': [None, 1, 2],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'metric': ['minkowski'],
}

if run_grid_search:
    knn = KNeighborsClassifier()
    grid_search = GridSearchCV(knn, param_grid=grid_params, n_jobs=max_parallel_jobs)
    grid_search.fit(X, y)
    cross_val = mean(cross_val_score(grid_search, X, y))
    print(cross_val, grid_search.best_params_, grid_search.best_score_)

0.8115670584902988 {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 11, 'p': 1, 'weights': 'distance'} 0.8189581267997923


### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

# Melhor: {'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.1} 0.9141486097342207
grid_params = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'tol': [1 / 10 ** i for i in range(1, 11, 2)],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [i for i in range(100, 751, 200)],
}

if run_grid_search:
    logistic_regression = LogisticRegression(random_state=seed)
    grid_search = GridSearchCV(logistic_regression, param_grid=grid_params, n_jobs=max_parallel_jobs)
    grid_search.fit(X, y)
    cross_val = mean(cross_val_score(grid_search, X, y))
    print(cross_val, grid_search.best_params_, grid_search.best_score_)

0.9150179389132795 {'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.1} 0.9141486097342207


### Base ensembles

In [10]:
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import GaussianNB

tree_args = {'max_depth': 7, 'max_leaf_nodes': 21, 'min_samples_leaf': 11}
knn_args = {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 11, 'p': 1, 'weights': 'distance'}
logistic_args = {'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.1}

classifiers = [
    ('knn', KNeighborsClassifier(**knn_args)),
    ('naive_bayes', GaussianNB()),
    ('perceptron', Perceptron()),
    ('logistic', LogisticRegression(**logistic_args, random_state=seed)),
    ('decision', DecisionTreeClassifier(**tree_args, random_state=seed)),
]

### Voting classifier

In [11]:
from sklearn.ensemble import VotingClassifier

grid_params = {
    'voting': ['soft', 'hard'],
}

if run_grid_search:
    voting = VotingClassifier(classifiers.copy())
    grid_search = GridSearchCV(voting, param_grid=grid_params, n_jobs=max_parallel_jobs)
    grid_search.fit(X, y)
    cross_val = mean(cross_val_score(grid_search, X, y))
    print(cross_val, grid_search.best_params_, grid_search.best_score_)

0.9178430817164707 {'voting': 'hard'} 0.9178430817164707


### Stacking classifier

In [12]:
from sklearn.ensemble import StackingClassifier

# Melhor: {'cv': 10, 'passthrough': True, 'stack_method': 'predict'} 0.9232757399801728
grid_params = {
    'cv': list(range(1, 11, 1)),
    'stack_method': ['auto', 'predict_proba', 'decision_function', 'predict'],
    'passthrough': [False, True],
}

if run_grid_search:
    stacking = StackingClassifier(classifiers.copy())
    grid_search = GridSearchCV(stacking, param_grid=grid_params, n_jobs=max_parallel_jobs)
    grid_search.fit(X, y)
    cross_val = mean(cross_val_score(grid_search, X, y))
    print(cross_val, grid_search.best_params_, grid_search.best_score_)

0.9193631685785771 {'cv': 10, 'passthrough': True, 'stack_method': 'predict'} 0.9232757399801728


### Executar validação cruzada

In [14]:
from sklearn.ensemble import VotingClassifier, StackingClassifier

voting = VotingClassifier(classifiers.copy())
stacking_default = StackingClassifier(classifiers.copy())
stacking_optimized = StackingClassifier(classifiers.copy(), cv=10, passthrough=True, stack_method='predict')

voting_cross_val = mean(cross_val_score(voting, X, y))
stacking_default_cross_val = mean(cross_val_score(stacking_default, X, y))
stacking_opt_cross_val = mean(cross_val_score(stacking_optimized, X, y))

data = {
    'Voting': [voting_cross_val],
    'Stacking (default)': [stacking_default_cross_val],
    'Stacking (otimizado)': [stacking_opt_cross_val],
}
columns = ['Cross validation']
DataFrame.from_dict(data, orient='index', columns=columns)

Unnamed: 0,Cross validation
Voting,0.917843
Stacking (default),0.720503
Stacking (otimizado),0.921971
