In [None]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Load the scikit-learn breast cancer dataset and wrap its feature matrix in a
# DataFrame whose columns are the dataset's own feature names.
cancer = load_breast_cancer()
df_cancer = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)

In [None]:
# Binary response: 1 where the original target equals 0, and 0 otherwise.
# The rest of the notebook labels these classes 1 = maligno, 0 = benigno.
y_bin = pd.Series(cancer.target).eq(0).astype(int)
df_cancer['y'] = y_bin

# Fixed-seed random subsample of 400 rows; later cells work on this sample.
df_cancer_sample = df_cancer.sample(random_state=4713, n=400)

In [None]:
# Count how many sampled rows fall into each class and print the table.
class_counts = df_cancer_sample['y'].value_counts()
print("Distribución de clases (0=benigno, 1=maligno):")
print(class_counts)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the class frequencies in the sampled data.
plt.figure(figsize=(5,4))
ax_counts = sns.countplot(
    data=df_cancer_sample,
    x='y',
    hue='y',          # colour each bar by its own class
    palette='Set2',
    legend=False,     # the x axis already identifies the classes
)
ax_counts.set_title("Distribución de clases (0=benigno, 1=maligno)")
ax_counts.set_xlabel("Clase")
ax_counts.set_ylabel("Frecuencia")
plt.show()

In [None]:
cols_cmp = ['mean radius', 'mean texture', 'mean perimeter', 'mean area']

# Display names for the legend entries, keyed by the text seaborn writes into
# its automatically generated hue legend (the string form of each 'y' value).
class_labels = {"0": "Benigno (0)", "1": "Maligno (1)"}

fig, axes = plt.subplots(2, 2, figsize=(14, 10))  # 2 rows, 2 columns

# One histogram (with KDE overlay) per feature, split by class.
for ax, c in zip(axes.flat, cols_cmp):
    sns.histplot(
        data=df_cancer_sample,
        x=c,
        hue='y',
        bins=30,
        kde=True,
        palette='Set1',
        alpha=0.6,
        ax=ax
    )
    ax.set_title(f"Distribución de {c} por clase")
    ax.set_xlabel(c)
    ax.set_ylabel("Frecuencia")

    # Relabel the legend seaborn already built instead of calling
    # ax.legend(labels=[...]). The labels-only form pairs the given labels
    # with the first artists found on the axes, in draw order, so the entries
    # would no longer match the hue colours (both could point at bars of the
    # same class, or at the two KDE lines in swapped order).
    legend = ax.get_legend()
    if legend is not None:
        legend.set_title("Clase")
        for text in legend.get_texts():
            current = text.get_text()
            text.set_text(class_labels.get(current, current))

plt.tight_layout()
plt.show()



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Predictors: the four selected "mean" measurements. Response: binary column 'y'.
# Both are extracted as NumPy arrays.
cols_sel = ['mean radius', 'mean texture', 'mean perimeter', 'mean area']
X_sel = df_cancer_sample.loc[:, cols_sel].to_numpy()
y = df_cancer_sample['y'].to_numpy()

Train/Test Split

In [None]:
# Hold out 25% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_sel,
    y,
    test_size=0.25,
    stratify=y,
    random_state=4713,
)

Estandarización

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

Ajustar modelos logísticos.

In [None]:
import statsmodels.api as sm

# Logistic regression fitted as a binomial GLM on the standardized training
# features, with an explicit intercept column added to the design matrix.
X_train_sm = sm.add_constant(X_train_std, has_constant='add')
logit_model = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())
res_logit = logit_model.fit()

# The design matrix is a plain NumPy array, so the summary would otherwise
# list the coefficients under generic names (x1, x2, ...). Passing explicit
# names makes the table readable. add_constant prepends the intercept by
# default, so 'const' comes first, followed by the features in cols_sel order.
coef_names = ['const'] + list(cols_sel)
print(res_logit.summary(xname=coef_names))

In [None]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix, classification_report
import numpy as np

# Exponentiate the fitted coefficients to express them on the odds scale.
coefs = res_logit.params
or_values = np.exp(coefs)

# Attach names for display so each value can be matched to its feature.
# The model was fitted on standardized features, so each odds ratio refers to
# a one-standard-deviation increase in that feature. The 'const' entry is
# exp(intercept), i.e. the baseline odds, not an odds ratio.
or_labeled = pd.Series(
    np.asarray(or_values),
    index=['const'] + list(cols_sel),
    name='odds_ratio',
)

print("Odds Ratios (exp(coef)):")
print(or_labeled)

In [None]:
# Score the held-out split: add the intercept column, get the predicted
# probabilities, and classify as 1 when the probability is at least 0.5.
X_test_sm = sm.add_constant(X_test_std, has_constant='add')
y_pred_prob = res_logit.predict(X_test_sm)
is_positive = y_pred_prob >= 0.5
y_pred = is_positive.astype(int)

In [None]:
# Headline test-set metrics. Recall and F1 use the default positive class (1).
acc = accuracy_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\nAccuracy:", acc)
print("Recall (clase 1 = maligno):", rec)
print("F1-score:", f1)

In [None]:
# Confusion matrix with rows = true class and columns = predicted class,
# both in the order [0, 1].
cm = confusion_matrix(y_test, y_pred, labels=[0,1])
class_names = ["Benigno (0)", "Maligno (1)"]

fig_cm, ax_cm = plt.subplots(figsize=(5,4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax_cm,
)
ax_cm.set_xlabel("Predicción")
ax_cm.set_ylabel("Real")
ax_cm.set_title("Matriz de Confusión")
plt.show()

In [None]:
# Per-class precision / recall / F1 table, formatted to three decimals.
report_text = classification_report(y_test, y_pred, digits=3)
print("\nReporte de clasificación:\n", report_text)