Comparación Modelo Superficial vs Modelo Profundo


In [None]:

import pandas as pd
import numpy as np
import matplotlib
matplotlib.use("TkAgg")
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, RocCurveDisplay, PrecisionRecallDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from scipy import stats

# --- Point this at your CSV file ---
CSV_PATH = r"D:\MAXITEL\Escritorio\ginger\MAESTRIA\INTELIGENCIA ARTIFICIAL\proyecto-ml-telecom\Supervisado\ocrdataset.csv"

# 1) Load the raw table
df = pd.read_csv(CSV_PATH)
print("Shape inicial:", df.shape)

# 2) Fail early, listing the available columns, if the target is missing
target = "Signal Quality"
if target not in df.columns:
    raise ValueError(f"No se encontró la columna target '{target}' en el CSV. Columnas disponibles: {df.columns.tolist()}")

# 3) Drop rare classes: keep only rows whose target value occurs at least twice
vc = df[target].value_counts()
keep_classes = vc.loc[vc >= 2].index
is_kept_row = df[target].isin(keep_classes)
df = df.loc[is_kept_row].reset_index(drop=True)

# 4) Features: everything except the target, with categorical (string)
#    columns one-hot encoded; drop_first removes one level per category.
X = pd.get_dummies(df.drop(columns=[target]), drop_first=True)

# Labels: map the raw target values to integer codes 0..n_classes-1
y_raw = df[target].values
le = LabelEncoder()
le.fit(y_raw)
y = le.transform(y_raw)

print("Shape final de X:", X.shape)
print("Clases finales:", le.classes_)

# 5) Stratified split: `stratify=y` keeps every class's proportion the same in
#    the train and test partitions. Without it the split is purely random and
#    a minority class can end up under-represented in (or missing from) one
#    side. Stratification needs at least 2 samples per class, which the
#    rare-class filter applied to `df` earlier in this script guarantees.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 6) Scaling: the scaler is fitted on the training split only and then reused
#    unchanged on the test split, so no test information leaks into it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 7) Training: Random Forest (class_weight="balanced" reweights classes
#    inversely to their frequency in the training labels)
rf = RandomForestClassifier(
    n_estimators=300, max_depth=15, class_weight="balanced",
    random_state=42, n_jobs=-1
)
rf.fit(X_train_scaled, y_train)
rf_pred = rf.predict(X_test_scaled)
# Column 1 is the predicted probability of the class encoded as 1.
# NOTE(review): this is only a usable score for the ROC / PR plots below if
# the target has exactly two classes — confirm against the data.
rf_proba = rf.predict_proba(X_test_scaled)[:, 1]

print("=== Random Forest ===")
# Pass the full list of encoded labels so the report still works when a class
# is absent from the test split (otherwise target_names would mismatch), and
# stringify the class names so non-string targets do not break the formatting.
print(classification_report(
    y_test, rf_pred,
    labels=list(range(len(le.classes_))),
    target_names=[str(c) for c in le.classes_],
))

# 8) Training: MLP with two hidden layers (128 and 64 units)
mlp = MLPClassifier(
    hidden_layer_sizes=(128, 64), activation="relu",
    solver="adam", learning_rate_init=0.001, max_iter=500,
    random_state=42
)
mlp.fit(X_train_scaled, y_train)
mlp_pred = mlp.predict(X_test_scaled)
# Column 1 is the predicted probability of the class encoded as 1.
# NOTE(review): this is only a usable score for the ROC / PR plots below if
# the target has exactly two classes — confirm against the data.
mlp_proba = mlp.predict_proba(X_test_scaled)[:, 1]

print("=== Red Neuronal (MLP) ===")
# Pass the full list of encoded labels so the report still works when a class
# is absent from the test split (otherwise target_names would mismatch), and
# stringify the class names so non-string targets do not break the formatting.
print(classification_report(
    y_test, mlp_pred,
    labels=list(range(len(le.classes_))),
    target_names=[str(c) for c in le.classes_],
))

# 9) Confusion matrices, one panel per model. Rows are true labels and columns
#    are predicted labels; both axes show the integer-encoded class ids.
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
for cm_axis, cm_title, cm_pred in zip(
    ax, ("Random Forest", "MLP"), (rf_pred, mlp_pred)
):
    cm_counts = confusion_matrix(y_test, cm_pred)
    sns.heatmap(cm_counts, annot=True, fmt="d", ax=cm_axis)
    cm_axis.set_title(cm_title)
plt.show()

# 10) ROC and Precision-Recall curves, both models overlaid on each panel.
# NOTE(review): these displays are built from a single score column per model,
# so they presumably require a binary target — confirm against the data.
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
roc_axis, pr_axis = ax
curve_inputs = (("Random Forest", rf_proba), ("MLP", mlp_proba))

for curve_name, curve_scores in curve_inputs:
    RocCurveDisplay.from_predictions(
        y_test, curve_scores, name=curve_name, ax=roc_axis
    )
roc_axis.set_title("ROC")

for curve_name, curve_scores in curve_inputs:
    PrecisionRecallDisplay.from_predictions(
        y_test, curve_scores, name=curve_name, ax=pr_axis
    )
pr_axis.set_title("Precision-Recall")
plt.show()

# 11) Cross-validation and statistical test. Each model is wrapped in a
#     pipeline with its own StandardScaler so the scaler is re-fitted on the
#     training folds only (avoids data leakage inside CV).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf_cv_estimator = RandomForestClassifier(
    n_estimators=300, max_depth=15, class_weight="balanced",
    random_state=42, n_jobs=-1
)
rf_pipe = make_pipeline(StandardScaler(), rf_cv_estimator)

mlp_cv_estimator = MLPClassifier(
    hidden_layer_sizes=(128, 64), max_iter=500, random_state=42
)
mlp_pipe = make_pipeline(StandardScaler(), mlp_cv_estimator)

# Same splitter and metric (macro-averaged F1) for both models
cv_options = {"cv": cv, "scoring": "f1_macro"}
rf_scores = cross_val_score(rf_pipe, X, y, **cv_options)
mlp_scores = cross_val_score(mlp_pipe, X, y, **cv_options)

# Paired t-test on the per-fold scores (MLP minus RF)
paired_result = stats.ttest_rel(mlp_scores, rf_scores)
t_stat, p_val = paired_result.statistic, paired_result.pvalue

print("=== Comparación estadística ===")
print("RF F1 promedio:", rf_scores.mean(), "std:", rf_scores.std())
print("MLP F1 promedio:", mlp_scores.mean(), "std:", mlp_scores.std())
print("t-stat:", t_stat, "p-value:", p_val)