<a href="https://colab.research.google.com/github/MariaLFreitas/Machine_learning/blob/main/L11_intro_to_model_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Criação de um pipeline usando os dados da base Breast Cancer

Importação das bibliotecas necessárias para construir o pipeline.

In [None]:
import numpy as np

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


Acessando a base breast cancer

In [None]:
# Load the Breast Cancer dataset bundled with scikit-learn.
bcancer = datasets.load_breast_cancer()

# Feature matrix and class labels, unpacked in a single statement.
X, y = bcancer.data, bcancer.target

Separando a base em conjunto de treinamento e conjunto de teste.



In [None]:
# Hold out 20% of the samples for testing. The split is stratified on the
# labels and uses a fixed seed so that re-running the cell gives the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=1,
    stratify=y,
)

Criando o pipeline

In [None]:
# Three-stage pipeline: standardize features -> PCA -> k-nearest-neighbours.
# The step names are referenced later as prefixes in the grid-search
# parameter grid ('<step>__<param>'), so they must not be changed.
pipe = Pipeline(
    steps=[
        ('normalizador', StandardScaler()),
        ('redutor_dim', PCA()),
        ('classificador', KNeighborsClassifier(n_neighbors=2)),
    ]
)

Agora será treinado o pipeline

In [None]:
# Fit every stage of the pipeline on the training split only.
# Left as the last expression so the fitted pipeline's repr is displayed.
pipe.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('normalizador',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('redutor_dim',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('classificador',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=2, p=2,
                                      weights='uniform'))],
         verbose=False)

In [None]:
# Accuracy of the baseline pipeline on the held-out test split.
y_test_pred = pipe.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9649122807017544

In [None]:
# Accuracy of the baseline pipeline on the data it was trained on
# (useful to compare against the test accuracy).
y_train_pred = pipe.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.978021978021978

### Grid-search

In [None]:
# Hyper-parameter grid. Keys follow the '<pipeline step>__<parameter>' naming
# convention so GridSearchCV can route each value to the right pipeline stage.
param_grid = {
    # Number of principal components to keep: 1, 2, ..., 10.
    'redutor_dim__n_components': list(range(1, 11)),
    # Number of neighbours: even values 10, 12, ..., 28.
    # NOTE(review): even k can produce tied votes in a two-class problem —
    # consider odd values; confirm before changing, results will differ.
    'classificador__n_neighbors': list(range(10, 30, 2)),
}

# Exhaustive search over the grid, scored by accuracy with 2-fold
# cross-validation, run in a single process.
# NOTE(review): cv=2 gives a fairly noisy estimate — consider more folds.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=1,
    cv=2,
)

Testar os classificadores

In [None]:
# Run the cross-validated search using the training split only;
# the test split stays untouched until the final evaluation.
grid.fit(X=X_train, y=y_train)

GridSearchCV(cv=2, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('normalizador',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('redutor_dim',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('classificador',
                                        KNeighborsClassifier(algorithm='auto',
                                                             leaf_size=30,
                                                    

Melhor score e melhores parâmetros

In [None]:
# Best mean cross-validation score, then the parameter combination that
# achieved it (one per line).
print(grid.best_score_, grid.best_params_, sep="\n")

0.9604297086328155
{'classificador__n_neighbors': 10, 'redutor_dim__n_components': 8}


Criando um novo classificador com base nos parâmetros do melhor estimador.

In [None]:
# Take the estimator selected by the grid search.
clf = grid.best_estimator_

# Accuracy of the selected model on the held-out test split.
y_test_pred = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9649122807017544

In [None]:
# Accuracy of the selected model on the training split, for comparison
# with its test accuracy.
y_train_pred = clf.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.9714285714285714