# **Intro to model selection**

1 - Utilizando os dados "breast cancer" faça

1.1 - Criar um pipeline com todos os componentes necessários para a sua solução de ML (e.g. pre-processing, transformação de dados, normalização, redução de dimensionalidade, classificação)

1.2 - Faça um gridsearch para achar os melhores parâmetros dos componentes da sua solução de ML

1.3 - Estime, em cima da base de dados de teste, a acurácia final, usando a melhor combinação de hyperparâmetros possíveis


In [None]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
import pandas as pd

data = load_breast_cancer()
df= pd.DataFrame(data=data['data'], columns=data['feature_names'])


In [None]:
df.shape

(569, 30)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [None]:
df_targets = pd.Series(data=data['target'], name='benign')
df_targets.unique()

X = df.iloc[:, 1:30].values
y = df_targets.values

## Base de treinamento e base teste


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, shuffle=True,random_state=1, stratify=y)

## Pipeline

In [None]:
pipe = Pipeline([
        ('z-score', StandardScaler()),
        ('reduce_dim', PCA()),
        ('classify', KNeighborsClassifier(n_neighbors=2))])

In [None]:
pipe.fit(X_train, y_train)


Pipeline(memory=None,
         steps=[('z-score',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('reduce_dim',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('classify',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=2, p=2,
                                      weights='uniform'))],
         verbose=False)

In [None]:
from sklearn.metrics import accuracy_score

# Resultado training data
y_train_pred = pipe.predict(X_train)
a_dt_train=accuracy_score(y_train, y_train_pred)

# Resultado test data
y_test_pred = pipe.predict(X_test)
a_dt_test=accuracy_score(y_test, y_test_pred)

print("Training data accuracy is " +  repr(a_dt_train) + " and test data accuracy is " + repr(a_dt_test))


Training data accuracy is 0.978021978021978 and test data accuracy is 0.9649122807017544


## **Grid-search**

In [None]:
param_grid = {
    'reduce_dim__n_components': [1, 2, 3, 4,5,6,7,8,9,10],
    'classify__n_neighbors': [10, 12, 14, 16, 18, 20, 22, 24, 26, 28]
}

grid = GridSearchCV(pipe, cv=2, n_jobs=1, param_grid=param_grid, scoring='accuracy')

In [None]:
grid.fit(X_train, y_train)


GridSearchCV(cv=2, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('z-score',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('reduce_dim',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('classify',
                                        KNeighborsClassifier(algorithm='auto',
                                                             leaf_size=30,
                                                             me

In [None]:
print(grid.cv_results_)


{'mean_fit_time': array([0.00342059, 0.0021162 , 0.00199592, 0.00209963, 0.00217104,
       0.00211203, 0.00223255, 0.00204885, 0.00259125, 0.00214088,
       0.00203645, 0.00198317, 0.00203836, 0.00205982, 0.00209248,
       0.00208092, 0.00210679, 0.00209057, 0.00239527, 0.00238395,
       0.00277412, 0.00222933, 0.00221443, 0.00267458, 0.002352  ,
       0.00238681, 0.00250411, 0.00212526, 0.00206161, 0.00208509,
       0.00201249, 0.00241303, 0.00229514, 0.00228655, 0.00238371,
       0.00216591, 0.00211084, 0.00206792, 0.00211906, 0.00209987,
       0.00206614, 0.00197732, 0.00220954, 0.00211966, 0.00202119,
       0.00206518, 0.00248671, 0.00212979, 0.00211346, 0.00219285,
       0.00248837, 0.00204146, 0.00203598, 0.00192249, 0.00201845,
       0.00208616, 0.00207925, 0.00221443, 0.00222814, 0.0024029 ,
       0.00220525, 0.00223565, 0.00229704, 0.00212181, 0.00218284,
       0.00210989, 0.0021404 , 0.00225306, 0.00228477, 0.00214636,
       0.00210047, 0.00209832, 0.0020684 , 0

In [None]:
grid.cv_results_['mean_test_score']


array([0.91211647, 0.94725249, 0.92965067, 0.94066388, 0.94726215,
       0.94724283, 0.95385076, 0.95822707, 0.96042971, 0.96042971,
       0.91431911, 0.94066388, 0.93186297, 0.9384709 , 0.94725249,
       0.94723317, 0.9516288 , 0.94944547, 0.9516288 , 0.9538411 ,
       0.91431911, 0.94067355, 0.93624894, 0.93846124, 0.94286653,
       0.94064456, 0.94723317, 0.94064456, 0.9406349 , 0.94284721,
       0.91431911, 0.93407528, 0.93405595, 0.93845158, 0.93846124,
       0.93405595, 0.93624894, 0.93624894, 0.93624894, 0.93185331,
       0.91431911, 0.93627792, 0.92966033, 0.93186297, 0.93846124,
       0.93186297, 0.93185331, 0.93405595, 0.9362586 , 0.92966033,
       0.91652176, 0.9340946 , 0.93186297, 0.93186297, 0.93406562,
       0.93186297, 0.93405595, 0.93846124, 0.93406562, 0.93626826,
       0.9165121 , 0.9340946 , 0.93406562, 0.93186297, 0.93406562,
       0.93624894, 0.93185331, 0.93624894, 0.93845158, 0.93405595,
       0.91431911, 0.93190162, 0.92967965, 0.93187263, 0.93626

In [None]:
print(grid.best_score_)
print(grid.best_params_)

0.9604297086328155
{'classify__n_neighbors': 10, 'reduce_dim__n_components': 9}


In [None]:
clf = grid.best_estimator_


In [None]:

y_test_pred = clf.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.9736842105263158

In [None]:
y_train_pred = clf.predict(X_train)
accuracy_score(y_train, y_train_pred)

0.978021978021978