## Ejercicio de hiperparametrización con el dataset breast cancer de sklearn

1. Carga el dataset [breast_cancer de `sklearn`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html)
2. Prueba al menos 5 modelos diferentes de clasificación y aplica un GridSearchCV mediante Pipelines. Aplica también un RandomizedSearchCV.
3. Conclusiones. Guarda el modelo final en un archivo con pickle.

In [253]:
# Import the libraries used throughout the notebook:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer #Para imputar missings
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest # basado en p-values
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV

# Seed NumPy's global random number generator with a fixed value so that
# anything drawing from it is repeatable when the cells run in order.
np.random.seed(42)

In [254]:
# Load the dataset bundle and build a DataFrame whose columns are the feature
# names, with the class label appended as a final "Diagnosis" column.
cancer_data = load_breast_cancer()
df_cancer = pd.DataFrame(
    cancer_data.data,
    columns=cancer_data.feature_names,
).assign(Diagnosis=cancer_data.target)
# NOTE(review): which of 0/1 means malignant is not visible here —
# confirm with cancer_data.target_names.
df_cancer.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [255]:
# Print each column's dtype and non-null count to spot missing values.
df_cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         569 non-null    float64
 ...

#### En primer lugar, definimos los Pipelines y las rejillas de hiperparámetros de cada modelo.

In [256]:
RANDOM_STATE = 42  # seed shared by every estimator that accepts random_state
CV_FOLDS = 10      # cross-validation folds used by every grid search


def _grid_search(estimator, param_grid):
    """Wrap *estimator* in a GridSearchCV with the notebook's shared settings.

    Args:
        estimator: The scikit-learn estimator or Pipeline to tune.
        param_grid: Mapping of parameter names to the candidate values to try.

    Returns:
        An unfitted GridSearchCV that scores by accuracy over CV_FOLDS folds.
    """
    return GridSearchCV(
        estimator,
        param_grid,
        cv=CV_FOLDS,
        scoring="accuracy",
        verbose=1,  # log a progress line for each search while fitting
        n_jobs=-1,  # run the fits in parallel
    )


# Logistic regression: impute, standardise, then classify.
reg_log = Pipeline(steps=[
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler()),
    ("reglog", LogisticRegression(random_state=RANDOM_STATE)),
])
reg_log_param = {
    "imputer__strategy": ["mean", "median"],
    "reglog__penalty": ["l2"],
    "reglog__C": np.logspace(0, 4, 10),
}

# Random forest: no Pipeline here, the model is tuned on the raw features
# without imputation or scaling steps.
rand_forest = RandomForestClassifier(random_state=RANDOM_STATE)
rand_forest_param = {
    "n_estimators": [10, 100, 1000],
    "max_features": [1, 2, 3],
}

# Support vector machine: standardise, keep the k best-scoring features, then
# classify. Named svm_pipe (not svm) so it does not rebind the `svm` module
# imported from sklearn at the top of the notebook.
svm_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("selectkbest", SelectKBest()),
    ("svm", SVC(random_state=RANDOM_STATE)),
])
# NOTE(review): per the SVC docs `degree` only affects the 'poly' kernel and
# `gamma` is unused by 'linear', so this grid presumably contains many
# equivalent candidates. A list of per-kernel dicts would avoid them, but it
# would change the number of fits and the reported best_params_ — confirm
# before changing.
svm_param = {
    "selectkbest__k": [2, 3, 4],  # keep 2, 3 or 4 features
    "svm__kernel": ["linear", "rbf", "sigmoid", "poly"],
    "svm__C": [0.001, 0.1, 0.5, 1, 5, 10, 100],
    "svm__degree": [1, 2, 3, 4],
    "svm__gamma": ["scale", "auto"],
}

# Gradient boosting, tuned on the raw features.
gradient_boosting = GradientBoostingClassifier(random_state=RANDOM_STATE)
gradient_boosting_param = {
    "n_estimators": [10, 100, 1000],
    "max_features": [1, 2, 3],
}

# Single decision tree, tuned on the raw features.
dec_tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
dec_tree_param = {
    "max_depth": [1, 2, 3],
    "max_features": [1, 2, 3],
}

# One grid search per model, all built with the same settings.
gs_reg_log = _grid_search(reg_log, reg_log_param)
gs_rand_forest = _grid_search(rand_forest, rand_forest_param)
gs_svm = _grid_search(svm_pipe, svm_param)
gs_gradientboosting = _grid_search(gradient_boosting, gradient_boosting_param)
gs_dec_tree = _grid_search(dec_tree, dec_tree_param)

# Registry of the searches, keyed by the label used when comparing results.
grids = {
    "gs_reg_log": gs_reg_log,
    "gs_rand_forest": gs_rand_forest,
    "gs_svm": gs_svm,
    "gs_gradient_boosting": gs_gradientboosting,
    "gs_dec_tree": gs_dec_tree,
}


In [257]:
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the "Diagnosis" target column.
X = df_cancer.drop(columns="Diagnosis")
y = df_cancer["Diagnosis"]

# Hold out 20% of the rows as a test set. The split reuses RANDOM_STATE (42),
# the seed already shared by the estimators, instead of a second literal.
# NOTE(review): the split is not stratified on y — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

In [258]:
# Fit every grid search on the training split, in the insertion order of
# `grids`. Only the search objects are needed, so iterate the values.
for grid_search in grids.values():
    grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
Fitting 10 folds for each of 9 candidates, totalling 90 fits
Fitting 10 folds for each of 672 candidates, totalling 6720 fits
Fitting 10 folds for each of 9 candidates, totalling 90 fits
Fitting 10 folds for each of 9 candidates, totalling 90 fits


##### Regresión Logística

In [259]:
# Report, one per line: the best cross-validation score, the winning
# parameters, the best pipeline and its classifier step, which is fetched
# from the Pipeline by step name.
print(
    gs_reg_log.best_score_,
    gs_reg_log.best_params_,
    gs_reg_log.best_estimator_,
    gs_reg_log.best_estimator_["reglog"],
    sep="\n",
)

0.9735748792270531
{'imputer__strategy': 'mean', 'reglog__C': np.float64(1.0), 'reglog__penalty': 'l2'}
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('reglog',
                 LogisticRegression(C=np.float64(1.0), random_state=42))])
LogisticRegression(C=np.float64(1.0), random_state=42)


##### Random Forest

In [260]:
# Report, one per line: the best cross-validation score, the winning
# parameters and the best random forest found by the search.
print(
    gs_rand_forest.best_score_,
    gs_rand_forest.best_params_,
    gs_rand_forest.best_estimator_,
    sep="\n",
)

0.9692270531400966
{'max_features': 2, 'n_estimators': 1000}
RandomForestClassifier(max_features=2, n_estimators=1000, random_state=42)


##### Support Vector Machine (SVM)

In [261]:
# Report, one per line: the best cross-validation score, the winning
# parameters, the best pipeline and its SVC step, which is fetched from the
# Pipeline by step name.
print(
    gs_svm.best_score_,
    gs_svm.best_params_,
    gs_svm.best_estimator_,
    gs_svm.best_estimator_["svm"],
    sep="\n",
)

0.9493719806763286
{'selectkbest__k': 4, 'svm__C': 0.5, 'svm__degree': 1, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}
Pipeline(steps=[('scaler', StandardScaler()), ('selectkbest', SelectKBest(k=4)),
                ('svm', SVC(C=0.5, degree=1, random_state=42))])
SVC(C=0.5, degree=1, random_state=42)


##### Gradient Boosting

In [262]:
# Report, one per line: the best cross-validation score, the winning
# parameters and the best gradient boosting model found by the search.
print(
    gs_gradientboosting.best_score_,
    gs_gradientboosting.best_params_,
    gs_gradientboosting.best_estimator_,
    sep="\n",
)

0.9671014492753622
{'max_features': 1, 'n_estimators': 1000}
GradientBoostingClassifier(max_features=1, n_estimators=1000, random_state=42)


##### Decision Tree

In [263]:
# Report, one per line: the best cross-validation score, the winning
# parameters and the best decision tree found by the search.
print(
    gs_dec_tree.best_score_,
    gs_dec_tree.best_params_,
    gs_dec_tree.best_estimator_,
    sep="\n",
)

0.9252173913043478
{'max_depth': 3, 'max_features': 1}
DecisionTreeClassifier(max_depth=3, max_features=1, random_state=42)


#### Mejor modelo usando GridSearch

In [264]:
# Leaderboard of the grid searches: one row per entry of `grids` with its
# best cross-validation score, ordered from highest to lowest.
best_grids = pd.DataFrame(
    {
        "Grid": list(grids),
        "Best score": [search.best_score_ for search in grids.values()],
    }
).sort_values(by="Best score", ascending=False)
best_grids

Unnamed: 0,Grid,Best score
0,gs_reg_log,0.973575
1,gs_rand_forest,0.969227
3,gs_gradient_boosting,0.967101
2,gs_svm,0.949372
4,gs_dec_tree,0.925217


### Empleando RandomizedSearchCV

###### Usamos KNN como ejemplo, ya que es el algoritmo de clasificación que aún no hemos probado.

In [265]:
from scipy.stats import randint

# k-nearest neighbours pipeline: impute, standardise, then classify.
knn = Pipeline(steps=[
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier()),
])

# Search space: n_neighbors is drawn from randint(1, 10) (upper bound
# excluded, i.e. 1..9) and the weighting scheme from the two listed options.
knn_param = {
    "knn__n_neighbors": randint(1, 10),
    "knn__weights": ["uniform", "distance"],
}

# random_state pins the sampled candidates, so the search no longer depends
# on the state of NumPy's global RNG (and therefore on cell execution order).
# NOTE(review): the space holds only 9 * 2 = 18 distinct combinations while
# n_iter is 50, so candidates are presumably repeated — confirm whether an
# exhaustive grid was intended.
rand_knn = RandomizedSearchCV(
    knn,
    knn_param,
    n_iter=50,
    scoring="accuracy",
    n_jobs=-1,
    cv=10,
    random_state=RANDOM_STATE,
)

# Run the search; `result` is whatever fit() returns.
result = rand_knn.fit(X_train, y_train)

# Summarise the outcome.
print(f"Best Score: {result.best_score_}")
print(f"Best Hyperparameters: {result.best_params_}")
print(f"Best Estimator: {result.best_estimator_}")

Best Score: 0.9713526570048309
Best Hyperparameters: {'knn__n_neighbors': 7, 'knn__weights': 'distance'}
Best Estimator: Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('knn',
                 KNeighborsClassifier(n_neighbors=7, weights='distance'))])


##### Nos queda guardar con `pickle` el mejor modelo: **la Regresión Logística**

In [266]:
# The best model was the logistic regression: take the best pipeline found
# by its grid search.
best_model = gs_reg_log.best_estimator_
# Score that pipeline on the held-out test split.
best_model.score(X_test, y_test)

0.9736842105263158

In [267]:
import pickle

# Path of the file that will hold the serialised model.
filename = 'finished_model'

# Write the selected pipeline to disk in binary mode.
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [268]:
# Read the pickled pipeline back from disk.
# NOTE: pickle.load can execute arbitrary code, so only load files from a
# trusted source (this one is written by this same notebook).
with open(filename, 'rb') as model_file:
    modelo_importado = pickle.load(model_file)

In [269]:
# Test-split score of the reloaded model, scaled to a percentage.
modelo_importado.score(X_test, y_test)*100

97.36842105263158

In [270]:
# Predicted class labels for the test split from the reloaded model.
modelo_importado.predict(X_test)

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0])

In [271]:
# Display the reloaded model object.
modelo_importado

Con todo ello, guardamos el mejor modelo en un archivo mediante `pickle`.

El mejor modelo ha sido la **Regresión Logística** (accuracy media en validación cruzada ≈ 0,974 y accuracy en test ≈ 0,974).