<a href="https://colab.research.google.com/github/JuanBustamante107517/lab02-lp/blob/main/Untitled33.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
from ucimlrepo import fetch_ucirepo

# Download the "Breast Cancer Wisconsin (Original)" dataset (UCI repository id 15).
breast_cancer_wisconsin_original = fetch_ucirepo(id=15)

# Pull the feature table and the target table out of the payload (pandas DataFrames).
X, y = (
    breast_cancer_wisconsin_original.data.features,
    breast_cancer_wisconsin_original.data.targets,
)

# Show the dataset-level metadata first, then the per-variable description table.
print(breast_cancer_wisconsin_original.metadata)
print(breast_cancer_wisconsin_original.variables)


{'uci_id': 15, 'name': 'Breast Cancer Wisconsin (Original)', 'repository_url': 'https://archive.ics.uci.edu/dataset/15/breast+cancer+wisconsin+original', 'data_url': 'https://archive.ics.uci.edu/static/public/15/data.csv', 'abstract': 'Original Wisconsin Breast Cancer Database', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 699, 'num_features': 9, 'feature_types': ['Integer'], 'demographics': [], 'target_col': ['Class'], 'index_col': ['Sample_code_number'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1990, 'last_updated': 'Sun Mar 10 2024', 'dataset_doi': '10.24432/C5HP4Z', 'creators': ['WIlliam Wolberg'], 'intro_paper': None, 'additional_info': {'summary': "Samples arrive periodically as Dr. Wolberg reports his clinical cases. The database therefore reflects this chronological grouping of the data. This grouping information appears immediately below, having been removed fro

In [4]:
# ========================
# LETRA A: IV y split
# ========================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Data source; column names are supplied explicitly through `names=` when reading.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
cols = [
    "ID",
    "Clump_Thickness",
    "Uniformity_Cell_Size",
    "Uniformity_Cell_Shape",
    "Marginal_Adhesion",
    "Single_Epithelial_Cell_Size",
    "Bare_Nuclei",
    "Bland_Chromatin",
    "Normal_Nucleoli",
    "Mitoses",
    "Class",
]

# Load and clean in a single chain; each step returns a new frame.
df = (
    pd.read_csv(url, names=cols)
    # "?" is the missing-value marker: turn it into NaN.
    .replace("?", np.nan)
    # Bare_Nuclei must be converted to a numeric column once the "?" markers are gone.
    .assign(Bare_Nuclei=lambda frame: pd.to_numeric(frame["Bare_Nuclei"]))
    # Discard incomplete rows, then the identifier column.
    .dropna()
    .drop("ID", axis=1)
    # Recode the target: 2 -> 0 (benign), 4 -> 1 (malignant).
    .assign(Class=lambda frame: frame["Class"].replace({2: 0, 4: 1}))
)

# Information Value (IV) helper
def calcular_iv(data, feature, target, bins=5):
    """Compute the Information Value (IV) of ``feature`` against a binary ``target``.

    Columns of object, string, categorical or boolean dtype are grouped by
    their distinct values. Every other dtype is first cut into quantile bins
    with ``pd.qcut``; duplicate bin edges are dropped, so fewer than ``bins``
    groups may result.

    For each group::

        WoE = ln(%Events / %NonEvents)
        IV  = (%Events - %NonEvents) * WoE

    and the function returns the sum of the per-group IV values. A group with
    no events or no non-events gets ``WoE = 0`` and therefore adds nothing to
    the total.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both ``feature`` and ``target``. It is not modified.
    feature : str
        Name of the predictor column.
    target : str
        Name of the label column; its per-group sum is used as the event
        count, so events are expected to be coded as 1 and non-events as 0.
    bins : int, default 5
        Number of quantile bins requested for non-discrete features.

    Returns
    -------
    float
        Total Information Value of ``feature``.
    """
    df_ = data[[feature, target]].copy()

    # Only bin columns that are not already discrete. A plain
    # ``dtype != 'object'`` check would also send categorical, string and
    # boolean columns to ``pd.qcut``, which cannot compute quantiles for them.
    col_dtype = df_[feature].dtype
    is_discrete = (
        pd.api.types.is_object_dtype(col_dtype)
        or pd.api.types.is_string_dtype(col_dtype)
        or pd.api.types.is_bool_dtype(col_dtype)
        or isinstance(col_dtype, pd.CategoricalDtype)
    )
    if not is_discrete:
        df_[feature] = pd.qcut(df_[feature], q=bins, duplicates='drop')

    grouped = df_.groupby(feature, observed=False)[target].agg(['count', 'sum'])
    grouped.columns = ['Total', 'Events']
    grouped['NonEvents'] = grouped['Total'] - grouped['Events']
    grouped['%Events'] = grouped['Events'] / grouped['Events'].sum()
    grouped['%NonEvents'] = grouped['NonEvents'] / grouped['NonEvents'].sum()

    # Mask ratios that are zero, infinite or undefined *before* taking the log,
    # so log(0) is never evaluated; those groups get WoE 0.
    ratio = grouped['%Events'] / grouped['%NonEvents']
    usable = ratio.gt(0) & np.isfinite(ratio)
    grouped['WoE'] = np.log(ratio.where(usable)).fillna(0)

    grouped['IV'] = (grouped['%Events'] - grouped['%NonEvents']) * grouped['WoE']
    return grouped['IV'].sum()

# Score every predictor (all columns except the target) with its IV, rounded to 4 decimals.
iv_scores = {
    col: round(calcular_iv(df, col, 'Class'), 4)
    for col in df.columns.drop('Class')
}

# Rank the predictors by IV (highest first) and keep those with IV >= 0.1.
iv_df = pd.DataFrame.from_dict(iv_scores, orient='index', columns=['IV'])
iv_df = iv_df.sort_values(by='IV', ascending=False)
iv_df = iv_df.loc[iv_df['IV'] >= 0.1]
vars_utiles = iv_df.index.tolist()

# Final model inputs: the retained predictors and the binary target.
X, y = df[vars_utiles], df["Class"]

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


In [5]:
# ========================
# LETRA B: Regresión logística
# ========================

import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# Fit with statsmodels first, because it reports a p-value for each coefficient.
X_train_const = sm.add_constant(X_train)  # prepend the intercept column ("const")
logit_model = sm.Logit(y_train, X_train_const).fit()

print(logit_model.summary())

# Optional step: keep only the predictors whose p-value is at most 0.05
# (the intercept is excluded from the selection).
pvals = logit_model.pvalues.drop("const")
vars_significativas = [name for name, p_value in pvals.items() if p_value <= 0.05]

# Restrict both partitions to the significant predictors.
X_train_sig, X_test_sig = X_train[vars_significativas], X_test[vars_significativas]

# Refit with scikit-learn on the reduced predictor set and score the test partition.
clf = LogisticRegression()
clf.fit(X_train_sig, y_train)
y_pred = clf.predict(X_test_sig)
y_prob = clf.predict_proba(X_test_sig)[:, 1]  # probability of class 1

# Metrics
print("\n=== Reporte de Clasificación ===")
print(classification_report(y_test, y_pred))

print("Matriz de Confusión:", confusion_matrix(y_test, y_pred), sep="\n")

print(f"AUC: {roc_auc_score(y_test, y_prob):.4f}")


Optimization terminated successfully.
         Current function value: 0.076494
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                  Class   No. Observations:                  512
Model:                          Logit   Df Residuals:                      503
Method:                           MLE   Df Model:                            8
Date:                Thu, 01 May 2025   Pseudo R-squ.:                  0.8799
Time:                        00:15:03   Log-Likelihood:                -39.165
converged:                       True   LL-Null:                       -326.13
Covariance Type:            nonrobust   LLR p-value:                9.428e-119
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const                          -9.6828      1.358     -7.133      0.000   

In [6]:
# ========================
# LETRA C: Modelo SVM
# ========================

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Standardization: the scaler is fitted on the training partition only and then
# applied unchanged to the test partition.
scaler = StandardScaler()
X_train_sig_scaled = scaler.fit_transform(X_train_sig)
X_test_sig_scaled = scaler.transform(X_test_sig)

# Linear SVM with probability estimates.
# probability=True calibrates the scores with an internal, shuffled cross-validation,
# so without a seed predict_proba (and therefore the AUC) can differ between runs.
# The seed matches the one used for the train/test split.
svm_clf = SVC(kernel='linear', probability=True, random_state=42)
svm_clf.fit(X_train_sig_scaled, y_train)

# Predictions: hard labels and the probability of class 1.
y_pred_svm = svm_clf.predict(X_test_sig_scaled)
y_prob_svm = svm_clf.predict_proba(X_test_sig_scaled)[:, 1]

# Metrics
print("\n=== Reporte de Clasificación SVM ===")
print(classification_report(y_test, y_pred_svm))

print("Matriz de Confusión SVM:")
print(confusion_matrix(y_test, y_pred_svm))

print(f"AUC (SVM): {roc_auc_score(y_test, y_prob_svm):.4f}")


=== Reporte de Clasificación SVM ===
              precision    recall  f1-score   support

           0       0.94      0.97      0.96       103
           1       0.95      0.91      0.93        68

    accuracy                           0.95       171
   macro avg       0.95      0.94      0.94       171
weighted avg       0.95      0.95      0.95       171

Matriz de Confusión SVM:
[[100   3]
 [  6  62]]
AUC (SVM): 0.9902
