# Problema de Clasificación Binaria
##### Breast Cancer Wisconsin Dataset

In [1]:
# Manipulación de datos
import pandas as pd
import numpy as np

# Dataset
from sklearn.datasets import load_breast_cancer

# Preprocesamiento
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Modelos
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Evaluación
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)

# Optimización
from sklearn.model_selection import GridSearchCV

### Dataset

In [2]:
# Load the dataset with pandas containers; `data.frame` holds the feature
# columns together with the `target` column.
# NOTE(review): per the scikit-learn docs the labels are 0 = malignant and
# 1 = benign -- confirm with `data.target_names` before interpreting
# precision/recall, which treat class 1 as the positive class by default.
data = load_breast_cancer(as_frame=True)
df = data.frame

# Preview the first rows.
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


### Exploración de los datos

In [3]:
# Column dtypes and non-null counts (quick check for missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [4]:
# Summary statistics (count, mean, std, quartiles) for every numeric column.
df.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,0.627417
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,0.0
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,1.0
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


#### Balanceo de clases

In [5]:
# Absolute number of samples per target class.
df['target'].value_counts()

target
1    357
0    212
Name: count, dtype: int64

In [6]:
# Share of samples per target class.
df['target'].value_counts(normalize=True)

target
1    0.627417
0    0.372583
Name: proportion, dtype: float64

### Separación de datos

In [7]:
# Separate the label column from the feature matrix.
y = df['target']
X = df.drop(columns=['target'])

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


#### Escalamiento

In [8]:
# Learn the per-feature mean/std on the training split only ...
scaler = StandardScaler().fit(X_train)

# ... then apply that same transformation to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


### Entrenamiento

In [9]:
# Baseline logistic regression trained on the standardized features.
log_reg = LogisticRegression(max_iter=10000).fit(X_train_scaled, y_train)

# Predictions on the standardized test split.
y_pred_lr = log_reg.predict(X_test_scaled)

In [10]:
# Baseline kNN with default settings, trained on the standardized features.
knn = KNeighborsClassifier().fit(X_train_scaled, y_train)

# Predictions on the standardized test split.
y_pred_knn = knn.predict(X_test_scaled)

In [11]:
# Baseline random forest; it is trained on the unscaled features.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predictions on the unscaled test split.
y_pred_rf = rf.predict(X_test)

### Evaluación

In [12]:
def evaluate_model(y_test, y_pred, model_name, pos_label=1):
    """Print accuracy, precision, recall, F1 and the confusion matrix of a model.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Labels predicted by the model.
    model_name : str
        Name shown in the report header.
    pos_label : int or str, default=1
        Class treated as "positive" by precision, recall and F1. The default
        matches scikit-learn's own default, so existing calls print exactly
        what they printed before. Pass the label of the class whose misses
        matter most (e.g. the malignant class) to get its recall instead.

    Returns
    -------
    None
        The report is written to standard output.
    """
    print(f"--- {model_name} ---")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    # These three metrics are computed with respect to `pos_label` only.
    print("Precision:", precision_score(y_test, y_pred, pos_label=pos_label))
    print("Recall:", recall_score(y_test, y_pred, pos_label=pos_label))
    print("F1-score:", f1_score(y_test, y_pred, pos_label=pos_label))
    # Rows are true classes, columns are predicted classes (sorted label order).
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\n")


In [13]:
# Report every baseline model against the same held-out labels.
for name, preds in (
    ("Logistic Regression", y_pred_lr),
    ("kNN", y_pred_knn),
    ("Random Forest", y_pred_rf),
):
    evaluate_model(y_test, preds, name)

--- Logistic Regression ---
Accuracy: 0.9824561403508771
Precision: 0.9861111111111112
Recall: 0.9861111111111112
F1-score: 0.9861111111111112
Confusion Matrix:
 [[41  1]
 [ 1 71]]


--- kNN ---
Accuracy: 0.956140350877193
Precision: 0.958904109589041
Recall: 0.9722222222222222
F1-score: 0.9655172413793104
Confusion Matrix:
 [[39  3]
 [ 2 70]]


--- Random Forest ---
Accuracy: 0.956140350877193
Precision: 0.958904109589041
Recall: 0.9722222222222222
F1-score: 0.9655172413793104
Confusion Matrix:
 [[39  3]
 [ 2 70]]




## Hiperparámetros

#### Regresión Logística

In [17]:
# Search space: regularization parameter C crossed with two solvers.
param_grid_lr = dict(
    C=[0.01, 0.1, 1, 10],
    solver=['lbfgs', 'liblinear'],
)

# 5-fold cross-validated grid search ranked by F1.
# NOTE(review): the folds are cut from X_train_scaled, which was standardized
# using statistics of the whole training split before cross-validation.
grid_lr = GridSearchCV(
    estimator=LogisticRegression(max_iter=10000),
    param_grid=param_grid_lr,
    scoring='f1',
    cv=5,
).fit(X_train_scaled, y_train)

grid_lr.best_params_


{'C': 0.1, 'solver': 'lbfgs'}

In [18]:
# Best estimator found by the grid search, scored on the held-out test split.
best_lr = grid_lr.best_estimator_
y_pred_lr_opt = best_lr.predict(X_test_scaled)
evaluate_model(
    y_test=y_test,
    y_pred=y_pred_lr_opt,
    model_name="Logistic Regression Optimized",
)

--- Logistic Regression Optimized ---
Accuracy: 0.9736842105263158
Precision: 0.9726027397260274
Recall: 0.9861111111111112
F1-score: 0.9793103448275862
Confusion Matrix:
 [[40  2]
 [ 1 71]]




#### kNN

In [None]:
# Search space: neighbourhood size, vote weighting and distance metric.
param_grid_knn = dict(
    n_neighbors=[3, 5, 7, 9, 11],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# 5-fold cross-validated grid search ranked by F1, on the standardized features.
grid_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid_knn,
    scoring='f1',
    cv=5,
).fit(X_train_scaled, y_train)

grid_knn.best_params_

{'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}

In [None]:
# Best estimator found by the grid search, scored on the held-out test split.
best_knn = grid_knn.best_estimator_
y_pred_knn_opt = best_knn.predict(X_test_scaled)
evaluate_model(
    y_test=y_test,
    y_pred=y_pred_knn_opt,
    model_name="kNN Optimized",
)

--- kNN Optimized ---
Accuracy: 0.9824561403508771
Precision: 0.972972972972973
Recall: 1.0
F1-score: 0.9863013698630136
Confusion Matrix:
 [[40  2]
 [ 0 72]]




#### Random Forest

In [21]:
# Search space: forest size, tree depth and the minimum samples needed at a
# leaf / to split a node (2 * 4 * 3 * 3 = 72 combinations).
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10]
}

# 5-fold cross-validated grid search ranked by F1 (72 * 5 = 360 forest fits).
# n_jobs=-1 spreads those fits over all available cores; each forest still uses
# random_state=42, so the scores and the selected parameters are unchanged.
grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_rf,
    cv=5,
    scoring='f1',
    n_jobs=-1
)

# The forest is trained on the unscaled features.
grid_rf.fit(X_train, y_train)

grid_rf.best_params_

{'max_depth': None,
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 100}

In [22]:
# Best estimator found by the grid search, scored on the held-out test split.
best_rf = grid_rf.best_estimator_
y_pred_rf_opt = best_rf.predict(X_test)
evaluate_model(
    y_test=y_test,
    y_pred=y_pred_rf_opt,
    model_name="Random Forest Optimized",
)

--- Random Forest Optimized ---
Accuracy: 0.956140350877193
Precision: 0.958904109589041
Recall: 0.9722222222222222
F1-score: 0.9655172413793104
Confusion Matrix:
 [[39  3]
 [ 2 70]]




## Análisis

In [23]:
# Pair each feature name with its coefficient in the tuned logistic regression.
coef_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': best_lr.coef_[0]})

# Largest (most positive) coefficients first; the original row positions are kept as index.
coef_df = coef_df.sort_values(by='Coefficient', ascending=False)

coef_df.head()

Unnamed: 0,Feature,Coefficient
19,fractal dimension error,0.229461
15,compactness error,0.227123
9,mean fractal dimension,0.21402
18,symmetry error,0.150838
16,concavity error,0.078932


In [24]:
# Pair each feature name with its importance in the tuned random forest.
importances = pd.DataFrame({'Feature': X.columns, 'Importance': best_rf.feature_importances_})

# Most important features first; the original row positions are kept as index.
importances = importances.sort_values(by='Importance', ascending=False)

importances.head()


Unnamed: 0,Feature,Importance
23,worst area,0.147168
27,worst concave points,0.135973
20,worst radius,0.103521
7,mean concave points,0.101494
22,worst perimeter,0.075447


# Reflexión Final del Laboratorio

En este laboratorio trabajamos un problema real de clasificación binaria utilizando el dataset Breast Cancer Wisconsin. El objetivo fue entrenar distintos modelos de Machine Learning para predecir si un tumor es benigno o maligno, y analizar cuál de ellos funciona mejor considerando no solo el rendimiento numérico, sino también el impacto de los errores.

Primero exploramos el dataset para entender su estructura y verificar el balance de clases. Luego se dividieron los datos en entrenamiento y prueba, asegurando que ambas particiones mantuvieran la misma proporción de clases. También se aplicó escalamiento a los modelos que lo necesitaban (Regresión Logística y kNN), ya que estos dependen de la magnitud de las variables.

Después se entrenaron tres modelos:

- Regresión Logística  
- k-Nearest Neighbors (kNN)  
- Random Forest  

El entrenamiento consistió en permitir que cada modelo aprendiera patrones a partir de los datos de entrenamiento usando `.fit()`. Posteriormente se evaluaron en el conjunto de prueba utilizando métricas como Accuracy, Precision, Recall y F1-score.

Sin embargo, no nos quedamos solo con los valores por defecto. Se realizó una optimización de hiperparámetros usando GridSearchCV, probando diferentes configuraciones para encontrar la combinación que mejor desempeño ofreciera según el F1-score.

---

## Modelo con Mejor Desempeño

El modelo que obtuvo mejores resultados fue **kNN optimizado**, con:

- Accuracy: 0.9824  
- Precision: 0.9729  
- Recall: 1.0  
- F1-score: 0.9863  

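Lo más importante es que logró **cero falsos negativos** respecto a la clase positiva que usan las métricas (`target = 1`).

En un problema médico como este, el error más grave es clasificar un tumor maligno como benigno. Ese tipo de error puede retrasar el tratamiento y tener consecuencias muy serias. Por eso, más allá de la accuracy, el recall se vuelve una métrica clave.

El hecho de que el modelo tenga recall = 1.0 significa que detectó correctamente todos los casos positivos del conjunto de prueba. Sin embargo, hay que interpretar este resultado con cuidado: en la versión de scikit-learn de este dataset la codificación es `0 = maligno` y `1 = benigno` (ver `data.target_names`), por lo que la clase positiva es *benigno* y recall = 1.0 indica que ningún tumor benigno fue clasificado como maligno. Los tumores malignos clasificados como benignos aparecen en la posición `[0, 1]` de la matriz de confusión (fila = clase real, columna = clase predicha): kNN optimizado tuvo 2, mientras que la Regresión Logística sin optimizar tuvo solo 1. Para medir el error clínicamente más grave conviene calcular el recall de la clase maligna (`pos_label=0`).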

---

## Sobre los Hiperparámetros

Durante la optimización se ajustaron distintos hiperparámetros:

- En Regresión Logística se ajustaron el nivel de regularización (`C`) y el solver.
- En kNN se probó con diferentes números de vecinos, tipos de ponderación y métricas de distancia.
- En Random Forest se ajustaron el número de árboles, su profundidad máxima y el mínimo de muestras por hoja y por división.

Los hiperparámetros influyen directamente en qué tan complejo o flexible es el modelo. Si el modelo es demasiado flexible puede sobreajustar; si es demasiado simple puede no capturar bien los patrones.

La optimización permitió encontrar un equilibrio adecuado, especialmente en el caso de kNN.

---

## Comparación General

| Modelo | Accuracy | Precision | Recall | F1-score |
|--------|----------|-----------|--------|----------|
| Logistic Regression | 0.9824 | 0.9861 | 0.9861 | 0.9861 |
| kNN | 0.9561 | 0.9589 | 0.9722 | 0.9655 |
| Random Forest | 0.9561 | 0.9589 | 0.9722 | 0.9655 |
| Logistic Regression (Opt) | 0.9737 | 0.9726 | 0.9861 | 0.9793 |
| kNN (Opt) | 0.9824 | 0.9729 | 1.0 | 0.9863 |
| Random Forest (Opt) | 0.9561 | 0.9589 | 0.9722 | 0.9655 |

Aunque varios modelos tuvieron resultados muy buenos, kNN optimizado fue el que mejor equilibró precisión y sensibilidad.

---

## Conclusión

Este laboratorio no fue solo sobre entrenar modelos, sino sobre entender cómo evaluarlos correctamente y cómo elegir el más adecuado según el contexto del problema.

Aprendimos que:

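- Optimizar hiperparámetros puede marcar una diferencia (kNN mejoró), pero no garantiza mejoras: la Regresión Logística y Random Forest no mejoraron en el conjunto de prueba.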
- No siempre el modelo más complejo es el mejor.
- Las métricas deben elegirse según el impacto real del error.
- En aplicaciones médicas, minimizar falsos negativos es prioritario.

En general, este ejercicio permitió aplicar todo el flujo de un problema de Machine Learning supervisado, desde la exploración hasta el análisis crítico de resultados.
