In [None]:
# Conjunto de datos
import numpy as np
import pandas as pd
# Source file: wdbc.data from the UCI breast-cancer-wisconsin directory:
# https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data
# NOTE(review): the short link below presumably redirects to the URL above — confirm.
# header=None: no row is consumed as a header, so the columns get integer labels.
df = pd.read_csv('https://bit.ly/3gob0mX', header=None)

In [None]:
# Quick look at the first two rows of the loaded frame.
df.head(n=2)

In [None]:
# Predictors and target.
# Column 1 is used as the class label; columns 2 onward are used as features.
# to_numpy() is the pandas-recommended way to obtain the underlying NumPy
# array (preferred over the older .values attribute).
X = df.loc[:, 2:].to_numpy()
y = df.loc[:, 1].to_numpy()

In [None]:
# Label encoding
from sklearn.preprocessing import LabelEncoder

# Fit on the raw label column (column 1 of df) rather than on `y`, so the cell
# can be re-run safely: `y` is overwritten here with integer codes, and fitting
# the encoder on those codes would make le.transform(['M','B']) below fail
# because the string labels would no longer be known classes.
le = LabelEncoder()
y = le.fit_transform(df.loc[:, 1].to_numpy())
le.classes_, le.transform(['M','B'])  # learned classes and an example mapping

In [None]:
# Train / test split
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows; stratify on the labels and fix the seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)
X_train.shape, X_test.shape

In [None]:
# First pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# A pipeline needs at least one step to be fitted. The steps are the three
# estimators imported above, chained in order: scale -> project onto two
# principal components -> logistic regression.
pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=2),
                        LogisticRegression(random_state=1))
pipe_lr.fit(X_train, y_train)
y_pred = pipe_lr.predict(X_test)
print('Exactitud en test = %.3f' % pipe_lr.score(X_test, y_test))

In [None]:
# Cross-validation in sklearn
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the pipeline, using the training split only so
# the test split stays untouched for the final evaluation.
scores = cross_val_score(estimator=pipe_lr,
                         X=X_train, y=y_train,
                         cv=10, n_jobs=1)
# Wrap the array in a 1-tuple so %-formatting always treats it as one value.
print('Puntajes de exactitud de validación cruzada : %s'%(scores,))
print('Exactitud de validación cruzada : %.3f +/- %.3f'%(np.mean(scores),np.std(scores)))

In [None]:
# Learning curves
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

# Rebuild the pipeline without PCA: scaling followed by L2-penalised
# logistic regression.
pipe_lr = make_pipeline(StandardScaler(),
                        LogisticRegression(penalty='l2', random_state=1))

# Score the pipeline with 10-fold CV at ten training-set fractions
# (10 %, 20 %, ..., 100 % of the training split).
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lr,
    X=X_train, y=y_train,
    train_sizes=np.linspace(0.1, 1, 10),
    cv=10, n_jobs=1)

In [None]:
# Mean and standard deviation of the scores along axis 1, for each training size.
train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

# One entry per curve: (mean, std, colour, legend label, extra line options).
curves = [
    (train_mean, train_std, 'blue', 'exactitud del entrenamiento', {}),
    (test_mean, test_std, 'green', 'exactitud de validación', {'linestyle': '--'}),
]
for mean, std, color, label, line_opts in curves:
    plt.plot(train_sizes, mean, color=color, marker='o', markersize=5,
             label=label, **line_opts)
    # Shaded band of +/- one standard deviation around the mean.
    plt.fill_between(train_sizes, mean + std, mean - std,
                     alpha=0.15, color=color)

plt.grid()
plt.xlabel('Número de muestras de entrenamiento')
plt.ylabel('Exactitud')
plt.legend(loc='lower right')
plt.ylim([0.8, 1.05])
plt.show()

In [None]:
# Over-/underfitting and validation curves
from sklearn.model_selection import validation_curve

# Candidate values for the regularisation parameter C: 10**-3 ... 10**2.
param_range = [10**exp for exp in range(-3, 3)]

# 10-fold CV scores of pipe_lr for every candidate C; the parameter is
# addressed through the pipeline step name ('logisticregression__C').
train_scores, test_scores = validation_curve(
    estimator=pipe_lr,
    X=X_train, y=y_train,
    param_name='logisticregression__C',
    param_range=param_range,
    cv=10, n_jobs=10,
)

In [None]:
# Mean and standard deviation of the scores along axis 1, for each value of C.
train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

# One entry per curve: (mean, std, colour, legend label, extra line options).
curves = [
    (train_mean, train_std, 'blue', 'exactitud del entrenamiento', {}),
    (test_mean, test_std, 'green', 'exactitud de validación', {'linestyle': '--'}),
]
for mean, std, color, label, line_opts in curves:
    plt.plot(param_range, mean, color=color, marker='o', markersize=5,
             label=label, **line_opts)
    # Shaded band of +/- one standard deviation around the mean.
    plt.fill_between(param_range, mean + std, mean - std,
                     alpha=0.15, color=color)

plt.grid()
plt.xscale('log')  # C is swept over powers of ten
plt.xlabel('Parámetro C')
plt.ylabel('Exactitud')
plt.legend(loc='lower right')
plt.ylim([0.8, 1.01])
plt.show()

In [None]:
# Hyper-parameter tuning with grid search
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Scaling followed by a support vector classifier.
pipe_svc = make_pipeline(StandardScaler(),
                         SVC(random_state=1))

# Candidate values 10**-4 ... 10**4, shared by C and gamma.
param_range = [10**i for i in range(-4,5)]
# Two sub-grids: the linear kernel only tunes C, the RBF kernel tunes C and
# gamma. Parameters are addressed through the pipeline step name ('svc__...').
param_grid_svc = [{'svc__C':param_range,'svc__kernel':['linear']},
                  {'svc__C':param_range,'svc__gamma':param_range,
                   'svc__kernel':['rbf']}]

# Exhaustive search over the grid, scored by 10-fold CV accuracy on the
# training split; n_jobs=-1 uses all available cores.
gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid_svc,
                  scoring='accuracy', cv=10, n_jobs=-1)

gs = gs.fit(X_train, y_train)

In [None]:
# Best cross-validated score and the parameter combination that produced it,
# one per line.
print(gs.best_score_, gs.best_params_, sep='\n')

In [None]:
# Accuracy of the best estimator on the held-out test split.
# GridSearchCV refits the best parameter combination on the whole training
# data it was given (refit=True is the default), so best_estimator_ is already
# fitted on X_train and does not need another fit() call.
clf = gs.best_estimator_
print('Exactitud en test : %.3f' % clf.score(X_test, y_test))

In [None]:
# Comparación de búsqueda aleatoria y de malla para estimar hiperparámetros
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py

In [None]:
# Algorithm selection with nested cross-validation.
# Inner loop: 2-fold grid search that tunes the SVC pipeline.
gs_svc = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid_svc,
    scoring='accuracy',
    cv=2,
)

# Outer loop: 5-fold CV that scores the tuned search as a whole.
scores = cross_val_score(gs_svc, X_train, y_train, scoring='accuracy', cv=5)

In [None]:
# Mean and standard deviation of the outer-fold accuracies for the SVC.
print('Exactitud NCV con SVC : %.3f +/- %.3f' % (scores.mean(), scores.std()))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Same nested scheme for a decision tree.
# Inner loop: 2-fold grid search over the maximum tree depth (None = unlimited).
gs_dtc = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}],
    scoring='accuracy',
    cv=2,
)

# Outer loop: 5-fold CV that scores the tuned search as a whole.
scores = cross_val_score(gs_dtc, X_train, y_train, scoring='accuracy', cv=5)

In [None]:
# Mean and standard deviation of the outer-fold accuracies for the decision tree.
print('Exactitud de NCV con DT : %.3f +/- %.3f' % (scores.mean(), scores.std()))

In [None]:
# Performance metrics
# Confusion matrix
from sklearn.metrics import confusion_matrix

# Fit the SVC pipeline (default hyper-parameters) on the training split and
# predict the held-out test split.
pipe_svc.fit(X_train, y_train)
y_pred = pipe_svc.predict(X_test)
# Rows are the true classes, columns the predicted classes.
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

In [None]:
# Build the matrix from the actual predictions instead of a hard-coded copy,
# so the figure always reflects the model that was just evaluated.
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)

fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write each count in the centre of its cell (row i = true, column j = predicted).
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=str(confmat[i, j]),
                va='center', ha='center')
plt.xlabel('clase predicha')
plt.ylabel('clase verdadera')
plt.show()

In [None]:
# The same figure using sklearn's display helper
from sklearn.metrics import ConfusionMatrixDisplay

# plot() returns the display object itself, so `disp` still refers to it.
disp = ConfusionMatrixDisplay(
    confusion_matrix=confmat,
    display_labels=['No Enfermedad', 'Enfermedad'],
).plot(cmap=plt.cm.Blues)  # the colour map can be swapped for another one
disp.ax_.set_title('Matriz de Confusión')
plt.show()

In [None]:
# Precision, recall and F1
from sklearn.metrics import f1_score, precision_score, recall_score

# Each metric is evaluated on the same test labels / predictions and printed
# with its own right-aligned label.
for template, metric in (('Precisión : %.3f', precision_score),
                         ('   Recall : %.3f', recall_score),
                         ('       F1 : %.3f', f1_score)):
    print(template % metric(y_true=y_test, y_pred=y_pred))

In [None]:
# Grid-search result again: best cross-validated score, then the winning
# parameter combination, one per line.
print(gs.best_score_, gs.best_params_, sep='\n')

In [None]:
# ROC plot (Receiver Operating Characteristic)
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold  # needed for the CV splits below
from numpy import interp

# Scale -> two principal components -> L2 logistic regression with C=100.
pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=2),
                        LogisticRegression(penalty='l2',random_state=1,
                                           C=100.0))

# Keep only two of the feature columns (positions 4 and 14 of X_train).
X_train2 = X_train[:, [4, 14]]

# Materialise the three stratified (train, test) index pairs so they can be
# iterated over and counted with len() later. The indices are row positions,
# so they apply to X_train2 as well as to X_train.
cv = list(StratifiedKFold(n_splits=3).split(X_train, y_train))

In [None]:
fig = plt.figure(figsize=(7, 5))

# Common false-positive-rate grid on which every fold's curve is interpolated
# so the folds can be averaged point by point.
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = 0.0
all_tpr = []

for fold, (train, test) in enumerate(cv, start=1):
    # Fit on the fold's training rows, then score its held-out rows.
    pipe_lr.fit(X_train2[train], y_train[train])
    probas = pipe_lr.predict_proba(X_train2[test])
    # Column 1 of predict_proba is the probability of the positive class (label 1).
    fpr, tpr, threshold = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Accumulate the interpolated curve; the running sum is pinned to 0 at FPR = 0.
    mean_tpr = mean_tpr + interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='ROC pliegue %d (área = %0.2f)' % (fold, roc_auc))

# Diagonal: reference line for random guessing.
plt.plot([0, 1], [0, 1], linestyle='--', color=[0.6, 0.6, 0.6],
         label='estimación aleatoria')

# Turn the accumulated sum into the mean curve and force it to end at TPR = 1.
mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='Media de ROC (área = %.2f)' % mean_auc, lw=2)

# Top-left corner path: reference line for a perfect classifier.
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='black',
         label='rendimiento perfecto')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('proporción de falsos positivos')
plt.ylabel('proporción de positivos verdaderos')
plt.legend(loc='lower right')
plt.show()