In [1]:
%pip install ucimlrepo matplotlib seaborn pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
from ucimlrepo import fetch_ucirepo

In [3]:
# fetch dataset
online_shoppers_purchasing_intention_dataset = fetch_ucirepo(id=468)

In [4]:
# data (as pandas dataframes)
X = online_shoppers_purchasing_intention_dataset.data.features
y = online_shoppers_purchasing_intention_dataset.data.targets

In [5]:
# metadata
print(online_shoppers_purchasing_intention_dataset.metadata)

{'uci_id': 468, 'name': 'Online Shoppers Purchasing Intention Dataset', 'repository_url': 'https://archive.ics.uci.edu/dataset/468/online+shoppers+purchasing+intention+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/468/data.csv', 'abstract': 'Of the 12,330 sessions in the dataset,\n84.5% (10,422) were negative class samples that did not\nend with shopping, and the rest (1908) were positive class\nsamples ending with shopping.', 'area': 'Business', 'tasks': ['Classification', 'Clustering'], 'characteristics': ['Multivariate'], 'num_instances': 12330, 'num_features': 17, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['Revenue'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2018, 'last_updated': 'Thu Jan 11 2024', 'dataset_doi': '10.24432/C5F88Q', 'creators': ['C. Sakar', 'Yomi Kastro'], 'intro_paper': {'ID': 367, 'type': 'NATIVE', 'title': 'Real-time prediction of online shoppers’ pur

In [6]:
# variable information
print(online_shoppers_purchasing_intention_dataset.variables)

                       name     role         type demographic description  \
0            Administrative  Feature      Integer        None        None   
1   Administrative_Duration  Feature      Integer        None        None   
2             Informational  Feature      Integer        None        None   
3    Informational_Duration  Feature      Integer        None        None   
4            ProductRelated  Feature      Integer        None        None   
5   ProductRelated_Duration  Feature   Continuous        None        None   
6               BounceRates  Feature   Continuous        None        None   
7                 ExitRates  Feature   Continuous        None        None   
8                PageValues  Feature      Integer        None        None   
9                SpecialDay  Feature      Integer        None        None   
10                    Month  Feature  Categorical        None        None   
11         OperatingSystems  Feature      Integer        None        None   

# FASE 1: PREPARACIÓN DE DATOS

## Paso 1.1: División Train/Test (Estratificada)

Usamos una configuración de 80% entrenamiento y 20% test, de esta forma queda de la siguiente manera: 
- 9,864 train / 2,466 test
- Desbalance 85/15: ~370 casos positivos en test (suficiente para evaluación confiable)
- Suficientes datos para entrenar y aplicar SMOTE posteriormente

Además, con la estratificación se mantiene la proporción 85/15 en ambos conjuntos

In [7]:
from sklearn.model_selection import train_test_split
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.20,           # Definimos el 20% para test
    random_state=42,          # Reproducibilidad
    stratify=y                # Mantiene proporción de clases
)

print(f"Tamaño de los conjuntos:")
print(f"Train: {X_train.shape[0]:,} muestras ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"Test:  {X_test.shape[0]:,} muestras ({X_test.shape[0]/len(X)*100:.1f}%)")

print(f"\nDistribución de clases en TRAIN:")
train_counts = y_train['Revenue'].value_counts()
print(f"False (No compra): {train_counts[False]:,} ({train_counts[False]/len(y_train)*100:.2f}%)")
print(f"True (Compra):     {train_counts[True]:,} ({train_counts[True]/len(y_train)*100:.2f}%)")

print(f"\nDistribución de clases en TEST:")
test_counts = y_test['Revenue'].value_counts()
print(f"False (No compra): {test_counts[False]:,} ({test_counts[False]/len(y_test)*100:.2f}%)")
print(f"True (Compra):     {test_counts[True]:,} ({test_counts[True]/len(y_test)*100:.2f}%)")

Tamaño de los conjuntos:
Train: 9,864 muestras (80.0%)
Test:  2,466 muestras (20.0%)

Distribución de clases en TRAIN:
False (No compra): 8,338 (84.53%)
True (Compra):     1,526 (15.47%)

Distribución de clases en TEST:
False (No compra): 2,084 (84.51%)
True (Compra):     382 (15.49%)


## Paso 1.2: Codificación de variables

In [8]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

# 1. WEEKEND: Bool → Int
X_train_encoded['Weekend'] = X_train_encoded['Weekend'].astype(int)
X_test_encoded['Weekend'] = X_test_encoded['Weekend'].astype(int)

# 2. MONTH: OneHot
month_encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
month_encoded_train = month_encoder.fit_transform(X_train_encoded[['Month']])
month_encoded_test = month_encoder.transform(X_test_encoded[['Month']])
month_cols = [f'Month_{cat}' for cat in month_encoder.categories_[0][1:]]
month_train_df = pd.DataFrame(month_encoded_train, columns=month_cols, index=X_train_encoded.index)
month_test_df = pd.DataFrame(month_encoded_test, columns=month_cols, index=X_test_encoded.index)
X_train_encoded = pd.concat([X_train_encoded.drop('Month', axis=1), month_train_df], axis=1)
X_test_encoded = pd.concat([X_test_encoded.drop('Month', axis=1), month_test_df], axis=1)

# 3. VISITORTYPE: OneHot
visitor_encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
visitor_encoded_train = visitor_encoder.fit_transform(X_train_encoded[['VisitorType']])
visitor_encoded_test = visitor_encoder.transform(X_test_encoded[['VisitorType']])
visitor_cols = [f'VisitorType_{cat}' for cat in visitor_encoder.categories_[0][1:]]
visitor_train_df = pd.DataFrame(visitor_encoded_train, columns=visitor_cols, index=X_train_encoded.index)
visitor_test_df = pd.DataFrame(visitor_encoded_test, columns=visitor_cols, index=X_test_encoded.index)
X_train_encoded = pd.concat([X_train_encoded.drop('VisitorType', axis=1), visitor_train_df], axis=1)
X_test_encoded = pd.concat([X_test_encoded.drop('VisitorType', axis=1), visitor_test_df], axis=1)

# 4. OPERATINGSYSTEMS: OneHot
os_encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
os_encoded_train = os_encoder.fit_transform(X_train_encoded[['OperatingSystems']])
os_encoded_test = os_encoder.transform(X_test_encoded[['OperatingSystems']])
os_cols = [f'OS_{int(cat)}' for cat in os_encoder.categories_[0][1:]]
os_train_df = pd.DataFrame(os_encoded_train, columns=os_cols, index=X_train_encoded.index)
os_test_df = pd.DataFrame(os_encoded_test, columns=os_cols, index=X_test_encoded.index)
X_train_encoded = pd.concat([X_train_encoded.drop('OperatingSystems', axis=1), os_train_df], axis=1)
X_test_encoded = pd.concat([X_test_encoded.drop('OperatingSystems', axis=1), os_test_df], axis=1)

# 5. BROWSER: OneHot con Grouping (Top 5 + Other)
top_5_browsers = X_train_encoded['Browser'].value_counts().head(5).index.tolist()
X_train_encoded['Browser_grouped'] = X_train_encoded['Browser'].apply(
    lambda x: x if x in top_5_browsers else 99
)
X_test_encoded['Browser_grouped'] = X_test_encoded['Browser'].apply(
    lambda x: x if x in top_5_browsers else 99
)
browser_encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
browser_encoded_train = browser_encoder.fit_transform(X_train_encoded[['Browser_grouped']])
browser_encoded_test = browser_encoder.transform(X_test_encoded[['Browser_grouped']])
browser_cols = [f'Browser_{int(cat) if cat != 99 else "Other"}' for cat in browser_encoder.categories_[0][1:]]
browser_train_df = pd.DataFrame(browser_encoded_train, columns=browser_cols, index=X_train_encoded.index)
browser_test_df = pd.DataFrame(browser_encoded_test, columns=browser_cols, index=X_test_encoded.index)
X_train_encoded = pd.concat([X_train_encoded.drop(['Browser', 'Browser_grouped'], axis=1), browser_train_df], axis=1)
X_test_encoded = pd.concat([X_test_encoded.drop(['Browser', 'Browser_grouped'], axis=1), browser_test_df], axis=1)

# 6. REGION: OneHot
region_encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
region_encoded_train = region_encoder.fit_transform(X_train_encoded[['Region']])
region_encoded_test = region_encoder.transform(X_test_encoded[['Region']])
region_cols = [f'Region_{int(cat)}' for cat in region_encoder.categories_[0][1:]]
region_train_df = pd.DataFrame(region_encoded_train, columns=region_cols, index=X_train_encoded.index)
region_test_df = pd.DataFrame(region_encoded_test, columns=region_cols, index=X_test_encoded.index)
X_train_encoded = pd.concat([X_train_encoded.drop('Region', axis=1), region_train_df], axis=1)
X_test_encoded = pd.concat([X_test_encoded.drop('Region', axis=1), region_test_df], axis=1)

# 7. TRAFFICTYPE: Target Encoding
traffic_conversion_rate = X_train_encoded.join(y_train).groupby('TrafficType')['Revenue'].mean().to_dict()
global_mean = y_train['Revenue'].mean()
X_train_encoded['TrafficType_Encoded'] = X_train_encoded['TrafficType'].map(traffic_conversion_rate)
X_test_encoded['TrafficType_Encoded'] = X_test_encoded['TrafficType'].map(traffic_conversion_rate).fillna(global_mean)
X_train_encoded = X_train_encoded.drop('TrafficType', axis=1)
X_test_encoded = X_test_encoded.drop('TrafficType', axis=1)

print(f"Codificación completada")
print(f"X_train: {X_train_encoded.shape}")
print(f"X_test:  {X_test_encoded.shape}")

Codificación completada
X_train: (9864, 43)
X_test:  (2466, 43)


### Resumen de Codificación Completada

**Transformaciones aplicadas:**

| Variable Original | Estrategia | Columnas Generadas | Justificación |
|------------------|------------|-------------------|---------------|
| **Month** | OneHot (drop first) | 9 | Sin orden natural, captura estacionalidad |
| **VisitorType** | OneHot (drop first) | 2 | Solo 3 categorías nominales |
| **Weekend** | Bool → Int | 1 | Ya binaria, solo conversión |
| **OperatingSystems** | OneHot (drop first) | 7 | 8 valores manejables |
| **Browser** | OneHot + Grouping | 5 | Top 5 + "Other" (reducido de 13) |
| **Region** | OneHot (drop first) | 8 | 9 valores geográficos |
| **TrafficType** | Target Encoding | 1 | 20 valores → 1 numérica |

**Resultado:**
- Features originales: 17
- Features después de codificación: **43** (vs 73 con OneHot completo)
- Reducción de dimensionalidad: 41% menos features
- Todos los encoders ajustados SOLO con train

## Paso 1.3: Escalado de Variables Numéricas

Usaremos **RobustScaler** porque en el análisis exploratorio se vieron muchos outliers en variables de duración.

In [9]:
from sklearn.preprocessing import RobustScaler

numerical_cols_to_scale = [
    'Administrative', 'Administrative_Duration',
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay',
    'TrafficType_Encoded'
]

X_train_scaled = X_train_encoded.copy()
X_test_scaled = X_test_encoded.copy()

scaler = RobustScaler()
X_train_scaled[numerical_cols_to_scale] = scaler.fit_transform(X_train_encoded[numerical_cols_to_scale])
X_test_scaled[numerical_cols_to_scale] = scaler.transform(X_test_encoded[numerical_cols_to_scale])

print(f"X_train_scaled: {X_train_scaled.shape}")
print(f"X_test_scaled:  {X_test_scaled.shape}")

X_train_scaled: (9864, 43)
X_test_scaled:  (2466, 43)


### Resumen de la fase 1: Preparación de Datos

| Paso | Acción |
|------|--------|
| 1.1 | División Train/Test (80/20 estratificado) |
| 1.2 | Codificación de variables categóricas |
| 1.3 | Escalado de variables numéricas (RobustScaler) |

**Datasets listos para entrenamiento:**
- `X_train_scaled`: 9,864 muestras × 43 features
- `X_test_scaled`: 2,466 muestras × 43 features
- `y_train`: 9,864 etiquetas (84.5% No compra, 15.5% Compra)
- `y_test`: 2,466 etiquetas (84.5% No compra, 15.5% Compra)

# FASE 2: MODELO BASELINE

## Paso 2.1: Entrenamiento de Modelos sin Balanceo

Entrenamos varios modelos con los datos desbalanceados (84.5% No compra / 15.5% Compra) para establecer una línea base, esto tambiem nos permitirá comparar el efecto de SMOTE posteriormente.

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import pandas as pd
import time

# Aplanar y_train y y_test para compatibilidad
y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

# Diccionario para almacenar resultados
baseline_results = {}

# Modelos a entrenar
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=2000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'SVM': SVC(random_state=42, probability=True),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

for name, model in models.items():
    start_time = time.time()
    
    # Entrenar modelo
    model.fit(X_train_scaled, y_train_flat)
    
    # Predicciones
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1] if hasattr(model, 'predict_proba') else None
    
    # Métricas
    accuracy = accuracy_score(y_test_flat, y_pred)
    precision = precision_score(y_test_flat, y_pred)
    recall = recall_score(y_test_flat, y_pred)
    f1 = f1_score(y_test_flat, y_pred)
    roc_auc = roc_auc_score(y_test_flat, y_pred_proba) if y_pred_proba is not None else None
    
    training_time = time.time() - start_time
    
    # Resultados
    baseline_results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc,
        'Training Time (s)': training_time,
        'Model': model
    }
    
    roc_auc_str = f"{roc_auc:.4f}" if roc_auc is not None else "N/A"

## Paso 2.2: Evaluación y Comparación de Resultados

In [11]:
results_df = pd.DataFrame(baseline_results).T
results_df = results_df[['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'Training Time (s)']]
results_df = results_df.round(4)

print(results_df.to_string())

# Identificar mejor modelo por F1-Score (ya que es más apropiado para datos desbalanceados)
best_model_name = results_df['F1-Score'].idxmax()
print(f"\nMejor modelo (por F1-Score): {best_model_name}")
print(f"F1-Score: {results_df.loc[best_model_name, 'F1-Score']:.4f}")
print(f"ROC-AUC: {results_df.loc[best_model_name, 'ROC-AUC']:.4f}")

                     Accuracy Precision    Recall  F1-Score   ROC-AUC Training Time (s)
Logistic Regression  0.882401  0.755556  0.356021  0.483986  0.886398           1.51645
Decision Tree        0.856853  0.537662  0.541885  0.539765  0.728236          0.085332
Random Forest        0.896594  0.739623  0.513089  0.605873  0.918734            1.2663
SVM                   0.88159  0.692308  0.424084  0.525974  0.851294          8.077825
KNN                  0.886456  0.675862  0.513089  0.583333  0.848243           0.38414

Mejor modelo (por F1-Score): Random Forest
F1-Score: 0.6059
ROC-AUC: 0.9187


In [12]:
# Matriz de confusión y reporte detallado del mejor modelo
best_model = baseline_results[best_model_name]['Model']
y_pred_best = best_model.predict(X_test_scaled)

print(f"Análisis de {best_model_name}")

# Matriz de confusión
cm = confusion_matrix(y_test_flat, y_pred_best)
print("\nMatriz de Confusión:")
print(f"                  Predicho: No Compra | Predicho: Compra")
print(f"Real: No Compra          {cm[0][0]:6d}     |     {cm[0][1]:6d}")
print(f"Real: Compra             {cm[1][0]:6d}     |     {cm[1][1]:6d}")

# Reporte de clasificación
print("\nReporte de Clasificación:")
print(classification_report(y_test_flat, y_pred_best, target_names=['No Compra', 'Compra']))

Análisis de Random Forest

Matriz de Confusión:
                  Predicho: No Compra | Predicho: Compra
Real: No Compra            2015     |         69
Real: Compra                186     |        196

Reporte de Clasificación:
              precision    recall  f1-score   support

   No Compra       0.92      0.97      0.94      2084
      Compra       0.74      0.51      0.61       382

    accuracy                           0.90      2466
   macro avg       0.83      0.74      0.77      2466
weighted avg       0.89      0.90      0.89      2466



### Resumen FASE 2: Modelo Baseline

**Resultados obtenidos Sin balanceo de clases:**

| Modelo | Accuracy | Precision | Recall | F1-Score | ROC-AUC |
|--------|----------|-----------|--------|----------|---------|
| **Random Forest** | **0.8966** | **0.7396** | **0.5131** | **0.6059** | **0.9187** |
| KNN | 0.8865 | 0.6759 | 0.5131 | 0.5833 | 0.8482 |
| Logistic Regression | 0.8824 | 0.7556 | 0.3560 | 0.4840 | 0.8862 |
| SVM | 0.8816 | 0.6923 | 0.4241 | 0.5260 | 0.8513 |
| Decision Tree | 0.8569 | 0.5377 | 0.5419 | 0.5398 | 0.7282 |

**Análisis:**

1. Random Forest es el mejor modelo con F1=0.6059 y ROC-AUC=0.9187
2. Problema del desbalanceo es evidente:
   - Alta accuracy (89.7%) pero bajo recall (51.3%) para clase positiva
   - El modelo predice bien "No Compra" (97% recall) pero falla en "Compra" (51% recall)
   - 186 falsos negativos (casi la mitad de las compras no detectadas)

# FASE 3: SMOTE PROGRESIVO

## Paso 3.1: Aplicación de SMOTE Incremental

Se aplica SMOTE incrementalmente generando 5%, 10% y 15% adicional de muestras de la clase minoritaria. SMOTE se aplica SOLO en el conjunto de entrenamiento.

**Estado actual:**
- Clase minoritaria (Compra): 1,526 muestras (15.47%)
- Clase mayoritaria (No Compra): 8,338 muestras (84.53%)

In [13]:
# Instalar imbalanced-learn si no está instalado
%pip install imbalanced-learn -q

Note: you may need to restart the kernel to use updated packages.


In [18]:
from imblearn.over_sampling import SMOTE
import numpy as np

# Estado actual de las clases en train
current_minority = (y_train_flat == 1).sum()  # 1526
current_majority = (y_train_flat == 0).sum()  # 8338
total_train = len(y_train_flat)

# Hacemos el Calculo sampling_strategy para cada nivel de SMOTE
# sampling_strategy = num_samples_minority / num_samples_majority

# SMOTE 5%
smote_5_samples = current_minority + int(total_train * 0.05)
strategy_5 = smote_5_samples / current_majority

# SMOTE 10%
smote_10_samples = current_minority + int(total_train * 0.10)
strategy_10 = smote_10_samples / current_majority

# SMOTE 15%
smote_15_samples = current_minority + int(total_train * 0.15)
strategy_15 = smote_15_samples / current_majority

print(f"\nSMOTE 5%:")
print(f"- Muestras minoritarias: {smote_5_samples} ({smote_5_samples/(current_majority+smote_5_samples)*100:.2f}%)")
print(f"- sampling_strategy: {strategy_5:.4f}")

print(f"\n SMOTE 10%:")
print(f"- Muestras minoritarias: {smote_10_samples} ({smote_10_samples/(current_majority+smote_10_samples)*100:.2f}%)")
print(f"- sampling_strategy: {strategy_10:.4f}")

print(f"\nSMOTE 15%:")
print(f"- Muestras minoritarias: {smote_15_samples} ({smote_15_samples/(current_majority+smote_15_samples)*100:.2f}%)")
print(f"- sampling_strategy: {strategy_15:.4f}")

# Se aplica SMOTE para cada nivel
smote_configs = {
    'SMOTE 5%': (strategy_5, smote_5_samples),
    'SMOTE 10%': (strategy_10, smote_10_samples),
    'SMOTE 15%': (strategy_15, smote_15_samples)
}

smote_datasets = {}

for name, (strategy, expected_samples) in smote_configs.items():
    smote = SMOTE(sampling_strategy=strategy, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train_flat)
    
    smote_datasets[name] = {
        'X': X_resampled,
        'y': y_resampled,
        'minority_count': (y_resampled == 1).sum(),
        'majority_count': (y_resampled == 0).sum(),
        'total': len(y_resampled)
    }
    
    print(f"\n{name}:")
    print(f"Total muestras: {smote_datasets[name]['total']}")
    print(f"Compra (1): {smote_datasets[name]['minority_count']} ({smote_datasets[name]['minority_count']/smote_datasets[name]['total']*100:.2f}%)")
    print(f"No Compra (0): {smote_datasets[name]['majority_count']} ({smote_datasets[name]['majority_count']/smote_datasets[name]['total']*100:.2f}%)")



SMOTE 5%:
- Muestras minoritarias: 2019 (19.49%)
- sampling_strategy: 0.2421

 SMOTE 10%:
- Muestras minoritarias: 2512 (23.15%)
- sampling_strategy: 0.3013

SMOTE 15%:
- Muestras minoritarias: 3005 (26.49%)
- sampling_strategy: 0.3604

SMOTE 5%:
Total muestras: 10357
Compra (1): 2019 (19.49%)
No Compra (0): 8338 (80.51%)

SMOTE 10%:
Total muestras: 10850
Compra (1): 2512 (23.15%)
No Compra (0): 8338 (76.85%)

SMOTE 15%:
Total muestras: 11343
Compra (1): 3005 (26.49%)
No Compra (0): 8338 (73.51%)


## Paso 3.2: Entrenamiento con SMOTE y Evaluación

Entrenaremos los 5 modelos con cada configuración de SMOTE y compararemos con el baseline.

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import time

smote_results = {}

models_smote = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=2000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'SVM': SVC(random_state=42, probability=True),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

# Entrenamos con cada configuración de SMOTE
for smote_name, smote_data in smote_datasets.items():
    print(f"{smote_name}")
    
    X_smote = smote_data['X']
    y_smote = smote_data['y']
    
    smote_results[smote_name] = {}
    
    for model_name, model in models_smote.items():
        start_time = time.time()
        
        # Entrenar
        model.fit(X_smote, y_smote)
        
        # Predecir en test (Recordar que el test es sin SMOTE)
        y_pred = model.predict(X_test_scaled)
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1] if hasattr(model, 'predict_proba') else None
        
        # Métricas
        accuracy = accuracy_score(y_test_flat, y_pred)
        precision = precision_score(y_test_flat, y_pred)
        recall = recall_score(y_test_flat, y_pred)
        f1 = f1_score(y_test_flat, y_pred)
        roc_auc = roc_auc_score(y_test_flat, y_pred_proba) if y_pred_proba is not None else None
        
        training_time = time.time() - start_time
        
        smote_results[smote_name][model_name] = {
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1,
            'ROC-AUC': roc_auc,
            'Training Time (s)': training_time
        }
        
        roc_auc_str = f"{roc_auc:.4f}" if roc_auc is not None else "N/A"
        print(f"  {model_name:20s} | F1: {f1:.4f} | Recall: {recall:.4f} | ROC-AUC: {roc_auc_str}")


SMOTE 5%
  Logistic Regression  | F1: 0.5240 | Recall: 0.4136 | ROC-AUC: 0.8876
  Decision Tree        | F1: 0.5181 | Recall: 0.5236 | ROC-AUC: 0.7162
  Logistic Regression  | F1: 0.5240 | Recall: 0.4136 | ROC-AUC: 0.8876
  Decision Tree        | F1: 0.5181 | Recall: 0.5236 | ROC-AUC: 0.7162
  Random Forest        | F1: 0.6372 | Recall: 0.5654 | ROC-AUC: 0.9166
  Random Forest        | F1: 0.6372 | Recall: 0.5654 | ROC-AUC: 0.9166
  SVM                  | F1: 0.5644 | Recall: 0.4817 | ROC-AUC: 0.8688
  KNN                  | F1: 0.5863 | Recall: 0.5733 | ROC-AUC: 0.8437
SMOTE 10%
  SVM                  | F1: 0.5644 | Recall: 0.4817 | ROC-AUC: 0.8688
  KNN                  | F1: 0.5863 | Recall: 0.5733 | ROC-AUC: 0.8437
SMOTE 10%
  Logistic Regression  | F1: 0.5639 | Recall: 0.4738 | ROC-AUC: 0.8878
  Decision Tree        | F1: 0.5160 | Recall: 0.5288 | ROC-AUC: 0.7167
  Logistic Regression  | F1: 0.5639 | Recall: 0.4738 | ROC-AUC: 0.8878
  Decision Tree        | F1: 0.5160 | Recall: 0.

## Paso 3.3: Comparación de Resultados

In [22]:
# Comparar Random Forest (mejor modelo baseline) en diferentes configuraciones
import pandas as pd

comparison_data = []

# Baseline
comparison_data.append({
    'Configuración': 'Baseline (Sin SMOTE)',
    'Accuracy': baseline_results['Random Forest']['Accuracy'],
    'Precision': baseline_results['Random Forest']['Precision'],
    'Recall': baseline_results['Random Forest']['Recall'],
    'F1-Score': baseline_results['Random Forest']['F1-Score'],
    'ROC-AUC': baseline_results['Random Forest']['ROC-AUC']
})

# SMOTE 5%, 10%, 15%
for smote_name in ['SMOTE 5%', 'SMOTE 10%', 'SMOTE 15%']:
    comparison_data.append({
        'Configuración': smote_name,
        'Accuracy': smote_results[smote_name]['Random Forest']['Accuracy'],
        'Precision': smote_results[smote_name]['Random Forest']['Precision'],
        'Recall': smote_results[smote_name]['Random Forest']['Recall'],
        'F1-Score': smote_results[smote_name]['Random Forest']['F1-Score'],
        'ROC-AUC': smote_results[smote_name]['Random Forest']['ROC-AUC']
    })

comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.round(4)

print(comparison_df.to_string(index=False))

# Identificar mejor configuración
best_f1_idx = comparison_df['F1-Score'].idxmax()
best_config = comparison_df.loc[best_f1_idx, 'Configuración']
best_f1 = comparison_df.loc[best_f1_idx, 'F1-Score']
best_recall = comparison_df.loc[best_f1_idx, 'Recall']

print(f"\nMejor configuración: {best_config}")
print(f"F1-Score: {best_f1:.4f}")
print(f"Recall: {best_recall:.4f}")

# Calcular mejora vs baseline
baseline_f1 = comparison_df.loc[0, 'F1-Score']
baseline_recall = comparison_df.loc[0, 'Recall']
improvement_f1 = ((best_f1 - baseline_f1) / baseline_f1) * 100
improvement_recall = ((best_recall - baseline_recall) / baseline_recall) * 100

print(f"\nMejora vs Baseline:")
print(f"F1-Score: +{improvement_f1:.2f}%")
print(f"Recall: +{improvement_recall:.2f}%")


       Configuración  Accuracy  Precision  Recall  F1-Score  ROC-AUC
Baseline (Sin SMOTE)    0.8966     0.7396  0.5131    0.6059   0.9187
            SMOTE 5%    0.9002     0.7297  0.5654    0.6372   0.9166
           SMOTE 10%    0.9011     0.7331  0.5681    0.6401   0.9174
           SMOTE 15%    0.8954     0.6962  0.5759    0.6304   0.9160

Mejor configuración: SMOTE 10%
F1-Score: 0.6401
Recall: 0.5681

Mejora vs Baseline:
F1-Score: +5.64%
Recall: +10.72%


### Resumen FASE 3: SMOTE

**Resultados de Random Forest con diferentes niveles de SMOTE:**

| Configuración | Accuracy | Precision | Recall | F1-Score | ROC-AUC | Mejora F1 | Mejora Recall |
|---------------|----------|-----------|--------|----------|---------|-----------|---------------|
| **Baseline** | 0.8966 | 0.7396 | 0.5131 | **0.6059** | 0.9187 | - | - |
| **SMOTE 5%** | 0.9002 | 0.7297 | 0.5654 | 0.6372 | 0.9166 | +5.16% | +10.19% |
| **SMOTE 10%** | **0.9011** | **0.7331** | **0.5681** | **0.6401** | **0.9174** | **+5.64%** | **+10.72%** |
| **SMOTE 15%** | 0.8954 | 0.6962 | 0.5759 | 0.6304 | 0.9160 | +4.04% | +12.24% |

**Análisis:**

1. **SMOTE 10% es la mejor configuración:**
   - F1-Score mejoró 5.64% (0.6059 → 0.6401)
   - Recall mejoró 10.72% (51.31% → 56.81%)
   - Mantiene buen balance entre precision y recall

2. **Observaciones:**
   - SMOTE 5% mejora solo un poco los resultados
   - SMOTE 10% logra el mejor balance
   - SMOTE 15% mejora recall pero baja precision (overfitting leve)