## Ejercicio de hiperparametrización (*breast cancer*)

Pasos a seguir:
1. Carga el *dataset* [breast_cancer de `sklearn`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html)
2. Prueba al menos 5 modelos diferentes de clasificación y aplica un GridSearchCV mediante *Pipelines*. Aplica también un RandomizedSearchCV.
3. Extrae conclusiones.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset that ships with scikit-learn.
data = load_breast_cancer()

In [None]:
# List the fields available on the loaded dataset object.
data.keys()

In [None]:
# Print the description text stored under the dataset's 'DESCR' key.
print(data['DESCR'])

In [None]:
# Show the names associated with the target values.
data['target_names']

In [None]:
# Put the feature matrix into a DataFrame (one named column per feature)
# and append the label as a final 'target' column for exploration.
feature_values, feature_names = data['data'], data['feature_names']
df = pd.DataFrame(data=feature_values, columns=feature_names)
df['target'] = data['target']
df

In [None]:
# Summarize the DataFrame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics for every numeric column.
df.describe()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
# (features and target); the large canvas keeps the cell labels legible.
corr_matrix = df.corr()
plt.figure(figsize=(20, 15))
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Separate the label column from the predictors.
y = df['target']
X = df.drop(columns=['target'])

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for testing. Stratifying on y keeps the same class
# balance in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
    stratify=y,
)

In [None]:
# Sanity-check the size of each split (features first, then labels).
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
# Class proportions in the training split (fractions rather than counts,
# because normalize=True).
y_train.value_counts(normalize=True)

In [None]:
# Class proportions in the test split; compare with the training split to
# check that the stratified split preserved the class balance.
y_test.value_counts(normalize=True)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Preview the candidate values used for SelectKBest's k in the grid search
# below: 5, 10, 15, 20.
np.arange(5,21,5)

In [None]:
# Seed reused by every stochastic estimator so repeated runs of the search
# produce the same scores and the same winner (same value as the seed used
# for the train/test split).
RANDOM_STATE = 10

# Candidate numbers of features kept by SelectKBest: 5, 10, 15, 20.
k_values = np.arange(5, 21, 5)

# Scale -> univariate feature selection -> classifier. The classifier set here
# is only a placeholder: each grid below swaps in its own estimator through
# the 'classifier' key.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("selectkbest", SelectKBest()),
    ("classifier", RandomForestClassifier(random_state=RANDOM_STATE)),
])

# liblinear is used because the grid tries both the l1 and l2 penalties.
logistic_params = {
    'selectkbest__k': k_values,
    'classifier': [LogisticRegression(max_iter=1000, solver='liblinear',
                                      random_state=RANDOM_STATE)],
    'classifier__penalty': ['l1', 'l2'],
}

random_forest_params = {
    'selectkbest__k': k_values,
    'classifier': [RandomForestClassifier(random_state=RANDOM_STATE)],
    'classifier__max_features': [1, 3, 5],
    'classifier__max_depth': [1, 3, 5],
}

svm_param = {
    'selectkbest__k': k_values,
    'classifier': [SVC()],
    'classifier__C': [0.1, 0.5, 1, 10, 100],
}

knn_param = {
    'selectkbest__k': k_values,
    'classifier': [KNeighborsClassifier()],
    'classifier__n_neighbors': [3, 9, 15],
}

gb_param = {
    'selectkbest__k': k_values,
    'classifier': [GradientBoostingClassifier(random_state=RANDOM_STATE)],
    'classifier__max_depth': [1, 3, 5],
}

# One grid per model family; each grid is expanded independently, so
# model-specific hyperparameters never get combined with the wrong classifier.
search_space = [
    logistic_params,
    random_forest_params,
    svm_param,
    knn_param,
    gb_param,
]

# Exhaustive search over every grid, scored by 3-fold cross-validated accuracy.
clf = GridSearchCV(
    estimator=pipe,
    param_grid=search_space,
    cv=3,
    scoring="accuracy",
)


In [None]:
# Run the grid search: cross-validate every pipeline configuration in the
# search space on the training split.
clf.fit(X_train, y_train)

In [None]:
# Pipeline configuration selected by the grid search.
clf.best_estimator_

In [None]:
# Cross-validated score of the selected configuration (accuracy, per the
# `scoring` argument of the search).
clf.best_score_

In [None]:
# Hyperparameter values (including the chosen classifier) of the selected
# configuration.
clf.best_params_

In [None]:
# Predict the held-out test split with the pipeline selected by the search;
# the fitted search object forwards predict() to its best estimator.
y_pred = clf.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Accuracy of the selected pipeline on the held-out test split.
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix on the test split (true labels passed first, predictions
# second).
confusion_matrix(y_test, y_pred)

In [None]:
# Fit the selected pipeline on the full training split.
# NOTE(review): the GridSearchCV constructed earlier does not pass `refit`, so
# presumably the library default applies and best_estimator_ was already refit
# on X_train during clf.fit — confirm. If so, this second fit is redundant, and
# for an unseeded stochastic classifier it replaces the selected model with a
# freshly trained one.
clf.best_estimator_.fit(X_train, y_train)

In [None]:
# Re-evaluate the selected pipeline on the test split: accuracy first, then
# the confusion matrix as the cell's displayed value.
best_pipeline = clf.best_estimator_
y_pred = best_pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)
confusion_matrix(y_test, y_pred)

In [None]:
# Seed reused by the random search and by every stochastic estimator so the
# sampled candidates and their scores are the same on every run (same value
# as the seed used for the train/test split).
RANDOM_STATE = 10

# Candidate numbers of features kept by SelectKBest: 5, 7, ..., 23. This is a
# finer grid than the exhaustive search uses, since only a sample of the
# space is evaluated here.
k_values = np.arange(5, 25, 2)

# Scale -> univariate feature selection -> classifier. The classifier set here
# is only a placeholder: each parameter dict below swaps in its own estimator
# through the 'classifier' key.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("selectkbest", SelectKBest()),
    ("classifier", RandomForestClassifier(random_state=RANDOM_STATE)),
])

# liblinear is used because the space tries both the l1 and l2 penalties.
logistic_params = {
    'selectkbest__k': k_values,
    'classifier': [LogisticRegression(max_iter=1000, solver='liblinear',
                                      random_state=RANDOM_STATE)],
    'classifier__penalty': ['l1', 'l2'],
}

random_forest_params = {
    'selectkbest__k': k_values,
    'classifier': [RandomForestClassifier(random_state=RANDOM_STATE)],
    'classifier__max_features': [1, 3, 5],
    'classifier__max_depth': [1, 3, 5],
}

svm_param = {
    'selectkbest__k': k_values,
    'classifier': [SVC()],
    'classifier__C': [0.1, 0.5, 1, 10, 100],
}

knn_param = {
    'selectkbest__k': k_values,
    'classifier': [KNeighborsClassifier()],
    'classifier__n_neighbors': [3, 9, 15],
}

gb_param = {
    'selectkbest__k': k_values,
    'classifier': [GradientBoostingClassifier(random_state=RANDOM_STATE)],
    'classifier__max_depth': [1, 3, 5],
}

# One parameter dict per model family, so model-specific hyperparameters are
# only ever paired with the classifier they belong to.
search_space = [
    logistic_params,
    random_forest_params,
    svm_param,
    knn_param,
    gb_param,
]

# Random search scored by 3-fold cross-validated accuracy. n_iter=10 is the
# library default written out explicitly: only 10 configurations are sampled
# from the space, so raise it for broader coverage. random_state pins which
# configurations get sampled.
clf_randomcv = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=search_space,
    n_iter=10,
    cv=3,
    scoring="accuracy",
    random_state=RANDOM_STATE,
)


In [None]:
# Run the randomized search: cross-validate the sampled pipeline
# configurations on the training split.
clf_randomcv.fit(X_train, y_train)

In [None]:
# Pipeline configuration selected by the randomized search.
clf_randomcv.best_estimator_

In [None]:
# Hyperparameter values (including the chosen classifier) of the selected
# configuration.
clf_randomcv.best_params_

In [None]:
# Cross-validated score of the selected configuration (accuracy, per the
# `scoring` argument of the search).
clf_randomcv.best_score_