# XGS-PON project - ML demo con otro dataset
# Dataset: Breast Cancer Wisconsin (sklearn)

In [None]:


import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# 1) Load the dataset (bundled with scikit-learn, no external files needed)
data = load_breast_cancer()
X_df = pd.DataFrame(data.data, columns=data.feature_names)
y_sr = pd.Series(data.target, name="target")  # 0 = malignant, 1 = benign

# Cast names and counts to built-in str/int so the summary prints as
# {'malignant': 212, ...} instead of NumPy scalar reprs such as
# np.str_('malignant') / np.int64(212).
class_counts = {
    str(label): int(count)
    for label, count in zip(data.target_names, np.bincount(data.target))
}

print("Tamaño del dataset:", X_df.shape)
print("Clases:", class_counts)
print(X_df.head())

# 2) Hold out 20% of the samples for testing; stratifying on the target
#    keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y_sr,
    stratify=y_sr,
    test_size=0.2,
    random_state=42,
)

# 3) Feature selection driven by Random Forest importances
rf_base = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
rf_base.fit(X_train, y_train)

# The forest above is already fitted (prefit=True), so the selector only
# applies the importance cut-off: the median of the feature importances.
selector = SelectFromModel(rf_base, threshold="median", prefit=True)
X_train_sel = selector.transform(X_train)
X_test_sel = selector.transform(X_test)

# Boolean mask over the original columns -> names of the retained features.
selected_mask = selector.get_support()
selected_features = X_df.columns[selected_mask].tolist()

print(f"Características originales: {X_train.shape[1]}")
print(f"Características seleccionadas: {X_train_sel.shape[1]}")
print("Features seleccionadas:", selected_features)

# (Optional) Importances of the original features, highest first
importances = pd.Series(rf_base.feature_importances_, index=X_df.columns)
importances = importances.sort_values(ascending=False)
print("\nTop 10 importancias de características:")
print(importances.head(10))

# 4) Hyperparameter search with cross-validation
# 3 * 4 * 3 * 3 * 3 = 324 candidate combinations, each fitted on 5 folds.
param_grid = {
    "n_estimators": [200, 400, 600],
    "max_depth": [None, 8, 12, 16],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", 0.5],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Parallelism is applied at the search level only. The forest itself runs
# single-threaded (n_jobs=1) so that the search workers and the per-forest
# workers do not multiply into far more threads than available cores.
# The fitted trees do not depend on n_jobs because random_state is fixed.
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, n_jobs=1),
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

# NOTE(review): X_train_sel was produced by a selector fitted on the whole
# training split, so every CV validation fold already influenced which
# features were kept. The CV accuracy below is therefore somewhat optimistic;
# the held-out test split remains an unbiased check.
grid.fit(X_train_sel, y_train)

print("\nMejores hiperparámetros:")
print(grid.best_params_)
print("Mejor accuracy CV:", round(grid.best_score_, 4))

# 5) Evaluate the best model found by the search on the held-out test split
best_rf = grid.best_estimator_
y_pred = best_rf.predict(X_test_sel)

test_accuracy = round(accuracy_score(y_test, y_pred), 4)
print("\nAccuracy en test:", test_accuracy)

# Per-class precision / recall / F1, labelled with the dataset's class names
report = classification_report(y_test, y_pred, target_names=data.target_names)
print("\nReporte de clasificación:")
print(report)

# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nMatriz de confusión:")
print(conf_matrix)


Tamaño del dataset: (569, 30)
Clases: {np.str_('malignant'): np.int64(212), np.str_('benign'): np.int64(357)}
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   



Características originales: 30
Características seleccionadas: 15
Features seleccionadas: ['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean compactness', 'mean concavity', 'mean concave points', 'area error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst compactness', 'worst concavity', 'worst concave points']

Top 10 importancias de características:
worst perimeter         0.133100
worst area              0.128052
worst concave points    0.108107
mean concave points     0.094414
worst radius            0.090639
mean radius             0.058662
mean perimeter          0.055242
mean area               0.049938
mean concavity          0.046207
worst concavity         0.035357
dtype: float64
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
