In [None]:
%pip install ucimlrepo scikit-learn xgboost pandas numpy matplotlib seaborn umap-learn -q

## Carga de Datos y Preprocesamiento

Replicamos el preprocesamiento del notebook de entrenamiento.

In [None]:
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler
import pandas as pd
import numpy as np

# Download the UCI dataset with repository id 468 (online shoppers) and separate
# the feature table from the target table.
online_shoppers = fetch_ucirepo(id=468)
X, y = online_shoppers.data.features, online_shoppers.data.targets

# Stratified 80/20 hold-out split; the fixed seed keeps the partition reproducible.
split_options = dict(test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Flattened 1-D label arrays, used later wherever estimators and metrics need labels.
y_train_flat = y_train.to_numpy().ravel()
y_test_flat = y_test.to_numpy().ravel()

for split_name, split_frame in (("Train", X_train), ("Test", X_test)):
    print(f"{split_name}: {len(split_frame):,} muestras")

In [None]:
# Encode the categorical variables. Every encoder/statistic is fitted on the
# training split only and then applied unchanged to the test split.
def _one_hot_drop_first(train_df, test_df, column, label_fn, also_drop=()):
    """One-hot encode `column`, fitting on `train_df` and reusing the fit on `test_df`.

    The first entry of `encoder.categories_[0]` is dropped (drop='first'), so that
    category is represented by an all-zero row. With handle_unknown='ignore',
    categories not seen during fit are encoded as all zeros as well.

    Args:
        train_df: training frame containing `column`.
        test_df: test frame containing `column`.
        column: name of the column to encode.
        label_fn: callable mapping a category value to its indicator-column name.
        also_drop: extra source columns to remove together with `column`.

    Returns:
        Tuple `(train_out, test_out, encoder, new_cols)`: both frames with the
        source column(s) removed and the indicator columns appended at the end,
        the fitted encoder, and the list of indicator-column names.
    """
    encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
    train_matrix = encoder.fit_transform(train_df[[column]])
    test_matrix = encoder.transform(test_df[[column]])
    # categories_[0][0] is the category removed by drop='first'; name the rest.
    new_cols = [label_fn(cat) for cat in encoder.categories_[0][1:]]
    removed = [column, *also_drop]
    train_out = pd.concat(
        [train_df.drop(columns=removed),
         pd.DataFrame(train_matrix, columns=new_cols, index=train_df.index)],
        axis=1,
    )
    test_out = pd.concat(
        [test_df.drop(columns=removed),
         pd.DataFrame(test_matrix, columns=new_cols, index=test_df.index)],
        axis=1,
    )
    return train_out, test_out, encoder, new_cols


X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

# Weekend: bool -> int
X_train_encoded['Weekend'] = X_train_encoded['Weekend'].astype(int)
X_test_encoded['Weekend'] = X_test_encoded['Weekend'].astype(int)

# Month: one-hot
X_train_encoded, X_test_encoded, month_encoder, month_cols = _one_hot_drop_first(
    X_train_encoded, X_test_encoded, 'Month', lambda cat: f'Month_{cat}'
)

# VisitorType: one-hot
X_train_encoded, X_test_encoded, visitor_encoder, visitor_cols = _one_hot_drop_first(
    X_train_encoded, X_test_encoded, 'VisitorType', lambda cat: f'VisitorType_{cat}'
)

# OperatingSystems: one-hot
X_train_encoded, X_test_encoded, os_encoder, os_cols = _one_hot_drop_first(
    X_train_encoded, X_test_encoded, 'OperatingSystems', lambda cat: f'OS_{int(cat)}'
)

# Browser: keep the 5 most frequent browsers of the training split and collapse
# every other value into the sentinel code 99 before one-hot encoding.
top_5_browsers = X_train_encoded['Browser'].value_counts().head(5).index.tolist()
X_train_encoded = X_train_encoded.assign(
    Browser_grouped=X_train_encoded['Browser'].where(
        X_train_encoded['Browser'].isin(top_5_browsers), 99
    )
)
X_test_encoded = X_test_encoded.assign(
    Browser_grouped=X_test_encoded['Browser'].where(
        X_test_encoded['Browser'].isin(top_5_browsers), 99
    )
)
X_train_encoded, X_test_encoded, browser_encoder, browser_cols = _one_hot_drop_first(
    X_train_encoded, X_test_encoded, 'Browser_grouped',
    lambda cat: f'Browser_{int(cat) if cat != 99 else "Other"}',
    also_drop=('Browser',),
)

# Region: one-hot
X_train_encoded, X_test_encoded, region_encoder, region_cols = _one_hot_drop_first(
    X_train_encoded, X_test_encoded, 'Region', lambda cat: f'Region_{int(cat)}'
)

# TrafficType: target encoding (mean Revenue per traffic type, computed on train).
# NOTE(review): the training rows are encoded with a mean that includes their own
# label (no out-of-fold estimate or smoothing), so downstream CV scores may be
# optimistic. Kept as-is because this cell replicates the training notebook.
traffic_conversion_rate = X_train_encoded.join(y_train).groupby('TrafficType')['Revenue'].mean().to_dict()
global_mean = y_train['Revenue'].mean()
X_train_encoded['TrafficType_Encoded'] = X_train_encoded['TrafficType'].map(traffic_conversion_rate)
# Traffic types absent from the training split fall back to the global conversion rate.
X_test_encoded['TrafficType_Encoded'] = X_test_encoded['TrafficType'].map(traffic_conversion_rate).fillna(global_mean)
X_train_encoded = X_train_encoded.drop('TrafficType', axis=1)
X_test_encoded = X_test_encoded.drop('TrafficType', axis=1)

# Both splits must expose exactly the same features, in the same order.
assert list(X_train_encoded.columns) == list(X_test_encoded.columns), \
    "train/test feature columns diverged during encoding"

print("\nCodificacion completada")
print(f"Features: {X_train_encoded.shape[1]}")

In [None]:
# Scale the numeric features.
# Page-visit counters with their matching *_Duration columns, followed by the
# remaining numeric features (including the target-encoded traffic type).
page_groups = ('Administrative', 'Informational', 'ProductRelated')
numerical_cols_to_scale = [
    col for group in page_groups for col in (group, f'{group}_Duration')
] + ['BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'TrafficType_Encoded']

# The scaler is fitted on the training split only and then reused for test.
scaler = RobustScaler()
train_scaled_values = scaler.fit_transform(X_train_encoded[numerical_cols_to_scale])
test_scaled_values = scaler.transform(X_test_encoded[numerical_cols_to_scale])

# Work on copies so the encoded frames stay untouched; columns outside the list
# are carried over unscaled.
X_train_scaled = X_train_encoded.copy()
X_test_scaled = X_test_encoded.copy()
X_train_scaled[numerical_cols_to_scale] = train_scaled_values
X_test_scaled[numerical_cols_to_scale] = test_scaled_values

print(f"Datos escalados: {X_train_scaled.shape}")

## Aplicar PCA y UMAP

Preparamos las transformaciones de reduccion dimensional para optimizar modelos sobre ellas.

In [None]:
from sklearn.decomposition import PCA
import umap

# PCA keeping 95% of the variance.
# Step 1: fit an unrestricted PCA only to read off the cumulative explained variance.
pca_full = PCA(random_state=42).fit(X_train_scaled)
cumsum_variance = pca_full.explained_variance_ratio_.cumsum()

# Position of the first cumulative ratio that reaches 0.95, turned into a count.
reaches_target = cumsum_variance >= 0.95
n_components_95 = np.argmax(reaches_target) + 1

# Step 2: refit with that many components; the projection is learned on the
# training split and merely applied to the test split.
pca = PCA(n_components=n_components_95, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

n_features_in, n_features_out = X_train_scaled.shape[1], X_train_pca.shape[1]
explained_total = pca.explained_variance_ratio_.sum()
print(f"PCA: {n_features_in} -> {n_features_out} componentes")
print(f"Varianza explicada: {explained_total:.4f}")

In [None]:
# UMAP - quick evaluation to choose the number of components
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# A candidate is eligible only if it shrinks the feature space by at least this percentage.
MIN_REDUCTION_PCT = 50
n_components_list = [5, 10, 15, 20, 25, 30]
umap_results_temp = []
n_features = X_train_scaled.shape[1]

# Model selection must not look at the test split (that would leak test data into
# the choice of n_components). Carve a stratified validation set out of the
# training data and score every candidate on it instead.
X_sel_train, X_sel_val, y_sel_train, y_sel_val = train_test_split(
    X_train_scaled, y_train_flat, test_size=0.20, random_state=42, stratify=y_train_flat
)

for n_comp in n_components_list:
    reducer = umap.UMAP(n_components=n_comp, random_state=42, n_neighbors=15, min_dist=0.1)
    X_train_umap_temp = reducer.fit_transform(X_sel_train)
    X_val_umap_temp = reducer.transform(X_sel_val)

    rf_temp = RandomForestClassifier(random_state=42, n_estimators=100)
    rf_temp.fit(X_train_umap_temp, y_sel_train)
    f1_rf = f1_score(y_sel_val, rf_temp.predict(X_val_umap_temp))

    xgb_temp = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42, eval_metric='logloss')
    xgb_temp.fit(X_train_umap_temp, y_sel_train)
    f1_xgb = f1_score(y_sel_val, xgb_temp.predict(X_val_umap_temp))

    umap_results_temp.append({
        'n_components': n_comp,
        # Average of both classifiers so the choice is not tied to one model family.
        'avg_f1': (f1_rf + f1_xgb) / 2,
        'reduction_pct': (1 - n_comp / n_features) * 100
    })
    print(f"UMAP {n_comp} componentes - F1 promedio (validacion): {umap_results_temp[-1]['avg_f1']:.4f}")

# Pick the best-scoring configuration among those that reduce dimensionality enough.
umap_temp_df = pd.DataFrame(umap_results_temp)
valid_configs = umap_temp_df[umap_temp_df['reduction_pct'] >= MIN_REDUCTION_PCT]
if valid_configs.empty:
    # Without this guard idxmax() fails on the empty frame with an opaque message.
    raise ValueError(
        f"No UMAP configuration reduces dimensionality by at least {MIN_REDUCTION_PCT}% "
        f"(features: {n_features}, candidates: {n_components_list})"
    )
optimal_config = valid_configs.loc[valid_configs['avg_f1'].idxmax()]
n_components_umap = int(optimal_config['n_components'])

# Final UMAP: refit on the whole training split with the chosen size; the test
# split is only transformed.
reducer_final = umap.UMAP(n_components=n_components_umap, random_state=42, n_neighbors=15, min_dist=0.1)
X_train_umap = reducer_final.fit_transform(X_train_scaled)
X_test_umap = reducer_final.transform(X_test_scaled)

print(f"\nUMAP final: {X_train_scaled.shape[1]} -> {X_train_umap.shape[1]} componentes")

## Configuracion de GridSearchCV

Aplicamos validacion cruzada estratificada (5-fold) para encontrar los mejores hiperparametros en:
- Modelos baseline (datos originales)
- Modelos con PCA
- Modelos con UMAP

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import time

# Stratified, shuffled k-fold shared by every grid search below.
n_folds = 5
cv_strategy = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Hyperparameter grid for Random Forest
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Hyperparameter grid for XGBoost
xgb_param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 6, 9],
    min_child_weight=[1, 3, 5],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)

# Hard-coded reference metrics of the non-optimised models, used for comparison.
# NOTE(review): presumably copied from the training notebook - confirm they are current.
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
baseline_rf = dict(zip(metric_names, [0.8966, 0.7396, 0.5131, 0.6059, 0.9187]))
baseline_xgb = dict(zip(metric_names, [0.9030, 0.7213, 0.6099, 0.6609, 0.9283]))


def _grid_size(param_grid):
    """Return the number of combinations in a grid (product of the option counts)."""
    size = 1
    for options in param_grid.values():
        size *= len(options)
    return size


print("Configuracion de validacion cruzada:")
print("- Estrategia: StratifiedKFold")
print(f"- Numero de folds: {n_folds}")
print("- Metrica de optimizacion: F1-Score")
print(f"\nRandom Forest - Espacio de busqueda: {_grid_size(rf_param_grid)} combinaciones")
print(f"XGBoost - Espacio de busqueda: {_grid_size(xgb_param_grid)} combinaciones")

## 1. Optimizacion con datos Baseline (sin reduccion dimensional)

In [None]:
# Grid search for Random Forest on the scaled, non-reduced features
print("Optimizando Random Forest con datos baseline...")
rf_grid_baseline = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1,
).fit(X_train_scaled, y_train_flat)

print("\nMejores hiperparametros Random Forest (Baseline):")
print(rf_grid_baseline.best_params_)
print(f"Mejor F1-Score en CV: {rf_grid_baseline.best_score_:.4f}")

# Score the selected configuration on the hold-out test split.
y_pred_rf_opt = rf_grid_baseline.predict(X_test_scaled)
y_pred_proba_rf_opt = rf_grid_baseline.predict_proba(X_test_scaled)[:, 1]

# Label-based metrics use the hard predictions; ROC-AUC uses the positive-class probability.
label_metrics = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-Score': f1_score,
}
rf_optimized_baseline = {
    metric: scorer(y_test_flat, y_pred_rf_opt) for metric, scorer in label_metrics.items()
}
rf_optimized_baseline['ROC-AUC'] = roc_auc_score(y_test_flat, y_pred_proba_rf_opt)
rf_optimized_baseline['CV_F1'] = rf_grid_baseline.best_score_

print("\nResultados en test:")
for shown_metric in ('F1-Score', 'ROC-AUC'):
    print(f"{shown_metric}: {rf_optimized_baseline[shown_metric]:.4f}")

In [None]:
# Grid search for XGBoost on the scaled, non-reduced features
print("Optimizando XGBoost con datos baseline...")
xgb_grid_baseline = GridSearchCV(
    estimator=XGBClassifier(random_state=42, eval_metric='logloss'),
    param_grid=xgb_param_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1,
).fit(X_train_scaled, y_train_flat)

print("\nMejores hiperparametros XGBoost (Baseline):")
print(xgb_grid_baseline.best_params_)
print(f"Mejor F1-Score en CV: {xgb_grid_baseline.best_score_:.4f}")

# Score the selected configuration on the hold-out test split.
y_pred_xgb_opt = xgb_grid_baseline.predict(X_test_scaled)
y_pred_proba_xgb_opt = xgb_grid_baseline.predict_proba(X_test_scaled)[:, 1]

# Label-based metrics use the hard predictions; ROC-AUC uses the positive-class probability.
label_metrics = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-Score': f1_score,
}
xgb_optimized_baseline = {
    metric: scorer(y_test_flat, y_pred_xgb_opt) for metric, scorer in label_metrics.items()
}
xgb_optimized_baseline['ROC-AUC'] = roc_auc_score(y_test_flat, y_pred_proba_xgb_opt)
xgb_optimized_baseline['CV_F1'] = xgb_grid_baseline.best_score_

print("\nResultados en test:")
for shown_metric in ('F1-Score', 'ROC-AUC'):
    print(f"{shown_metric}: {xgb_optimized_baseline[shown_metric]:.4f}")

## 2. Optimizacion con PCA

In [None]:
# Grid search for Random Forest on the PCA-projected features
# NOTE(review): X_train_pca comes from a PCA fitted on the whole training split,
# so every CV fold has already influenced the projection; the CV F1 may be optimistic.
print("Optimizando Random Forest con PCA...")
rf_grid_pca = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1,
).fit(X_train_pca, y_train_flat)

print("\nMejores hiperparametros Random Forest (PCA):")
print(rf_grid_pca.best_params_)
print(f"Mejor F1-Score en CV: {rf_grid_pca.best_score_:.4f}")

# Score the selected configuration on the hold-out test split.
y_pred_rf_pca_opt = rf_grid_pca.predict(X_test_pca)
y_pred_proba_rf_pca_opt = rf_grid_pca.predict_proba(X_test_pca)[:, 1]

# Label-based metrics use the hard predictions; ROC-AUC uses the positive-class probability.
label_metrics = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-Score': f1_score,
}
rf_optimized_pca = {
    metric: scorer(y_test_flat, y_pred_rf_pca_opt) for metric, scorer in label_metrics.items()
}
rf_optimized_pca['ROC-AUC'] = roc_auc_score(y_test_flat, y_pred_proba_rf_pca_opt)
rf_optimized_pca['CV_F1'] = rf_grid_pca.best_score_

print("\nResultados en test:")
for shown_metric in ('F1-Score', 'ROC-AUC'):
    print(f"{shown_metric}: {rf_optimized_pca[shown_metric]:.4f}")

In [None]:
# Grid search for XGBoost on the PCA-projected features
# NOTE(review): X_train_pca comes from a PCA fitted on the whole training split,
# so every CV fold has already influenced the projection; the CV F1 may be optimistic.
print("Optimizando XGBoost con PCA...")
xgb_grid_pca = GridSearchCV(
    estimator=XGBClassifier(random_state=42, eval_metric='logloss'),
    param_grid=xgb_param_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1,
).fit(X_train_pca, y_train_flat)

print("\nMejores hiperparametros XGBoost (PCA):")
print(xgb_grid_pca.best_params_)
print(f"Mejor F1-Score en CV: {xgb_grid_pca.best_score_:.4f}")

# Score the selected configuration on the hold-out test split.
y_pred_xgb_pca_opt = xgb_grid_pca.predict(X_test_pca)
y_pred_proba_xgb_pca_opt = xgb_grid_pca.predict_proba(X_test_pca)[:, 1]

# Label-based metrics use the hard predictions; ROC-AUC uses the positive-class probability.
label_metrics = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-Score': f1_score,
}
xgb_optimized_pca = {
    metric: scorer(y_test_flat, y_pred_xgb_pca_opt) for metric, scorer in label_metrics.items()
}
xgb_optimized_pca['ROC-AUC'] = roc_auc_score(y_test_flat, y_pred_proba_xgb_pca_opt)
xgb_optimized_pca['CV_F1'] = xgb_grid_pca.best_score_

print("\nResultados en test:")
for shown_metric in ('F1-Score', 'ROC-AUC'):
    print(f"{shown_metric}: {xgb_optimized_pca[shown_metric]:.4f}")

## 3. Optimizacion con UMAP

In [None]:
# Grid search for Random Forest on the UMAP embedding
# NOTE(review): X_train_umap comes from a UMAP fitted on the whole training split,
# so every CV fold has already influenced the embedding; the CV F1 may be optimistic.
print("Optimizando Random Forest con UMAP...")
rf_grid_umap = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1,
).fit(X_train_umap, y_train_flat)

print("\nMejores hiperparametros Random Forest (UMAP):")
print(rf_grid_umap.best_params_)
print(f"Mejor F1-Score en CV: {rf_grid_umap.best_score_:.4f}")

# Score the selected configuration on the hold-out test split.
y_pred_rf_umap_opt = rf_grid_umap.predict(X_test_umap)
y_pred_proba_rf_umap_opt = rf_grid_umap.predict_proba(X_test_umap)[:, 1]

# Label-based metrics use the hard predictions; ROC-AUC uses the positive-class probability.
label_metrics = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-Score': f1_score,
}
rf_optimized_umap = {
    metric: scorer(y_test_flat, y_pred_rf_umap_opt) for metric, scorer in label_metrics.items()
}
rf_optimized_umap['ROC-AUC'] = roc_auc_score(y_test_flat, y_pred_proba_rf_umap_opt)
rf_optimized_umap['CV_F1'] = rf_grid_umap.best_score_

print("\nResultados en test:")
for shown_metric in ('F1-Score', 'ROC-AUC'):
    print(f"{shown_metric}: {rf_optimized_umap[shown_metric]:.4f}")

In [None]:
# Grid search for XGBoost on the UMAP embedding
# NOTE(review): X_train_umap comes from a UMAP fitted on the whole training split,
# so every CV fold has already influenced the embedding; the CV F1 may be optimistic.
print("Optimizando XGBoost con UMAP...")
xgb_grid_umap = GridSearchCV(
    estimator=XGBClassifier(random_state=42, eval_metric='logloss'),
    param_grid=xgb_param_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1,
).fit(X_train_umap, y_train_flat)

print("\nMejores hiperparametros XGBoost (UMAP):")
print(xgb_grid_umap.best_params_)
print(f"Mejor F1-Score en CV: {xgb_grid_umap.best_score_:.4f}")

# Score the selected configuration on the hold-out test split.
y_pred_xgb_umap_opt = xgb_grid_umap.predict(X_test_umap)
y_pred_proba_xgb_umap_opt = xgb_grid_umap.predict_proba(X_test_umap)[:, 1]

# Label-based metrics use the hard predictions; ROC-AUC uses the positive-class probability.
label_metrics = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-Score': f1_score,
}
xgb_optimized_umap = {
    metric: scorer(y_test_flat, y_pred_xgb_umap_opt) for metric, scorer in label_metrics.items()
}
xgb_optimized_umap['ROC-AUC'] = roc_auc_score(y_test_flat, y_pred_proba_xgb_umap_opt)
xgb_optimized_umap['CV_F1'] = xgb_grid_umap.best_score_

print("\nResultados en test:")
for shown_metric in ('F1-Score', 'ROC-AUC'):
    print(f"{shown_metric}: {xgb_optimized_umap[shown_metric]:.4f}")

## Comparacion de Modelos Optimizados

In [None]:
import matplotlib.pyplot as plt

# Side-by-side table of every model variant: one row per model, one column per metric.
results_by_model = {
    'RF (Baseline - Original)': baseline_rf,
    'RF (Baseline - Optimizado)': rf_optimized_baseline,
    'RF (PCA - Optimizado)': rf_optimized_pca,
    'RF (UMAP - Optimizado)': rf_optimized_umap,
    'XGB (Baseline - Original)': baseline_xgb,
    'XGB (Baseline - Optimizado)': xgb_optimized_baseline,
    'XGB (PCA - Optimizado)': xgb_optimized_pca,
    'XGB (UMAP - Optimizado)': xgb_optimized_umap,
}
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'CV_F1']
# The dict-of-dicts constructor puts models in columns, hence the transpose.
# The "Original" baselines carry no CV_F1 entry, so that cell is NaN for them.
comparison_optimized = pd.DataFrame(results_by_model).T
comparison_optimized = comparison_optimized.loc[:, metric_columns]

print("\nComparacion Completa - Modelos Originales vs Optimizados:")
print(comparison_optimized.round(4).to_string())

# Best model = row with the highest test F1-Score.
best_model_idx = comparison_optimized['F1-Score'].idxmax()
best_row = comparison_optimized.loc[best_model_idx]
print(f"\nMejor modelo general: {best_model_idx}")
print(f"F1-Score (test): {best_row['F1-Score']:.4f}")
print(f"F1-Score (CV): {best_row['CV_F1']:.4f}")
print(f"ROC-AUC: {best_row['ROC-AUC']:.4f}")

In [None]:
# Visual comparison: three per-metric bar charts plus a CV-vs-test F1 panel.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Tick labels in the same order as the rows of comparison_optimized (RF x4, then XGB x4).
models_names = [
    f'{algorithm}\n{variant}'
    for algorithm in ('RF', 'XGB')
    for variant in ('Baseline\nOriginal', 'Baseline\nOptimizado', 'PCA\nOptimizado', 'UMAP\nOptimizado')
]
colors = ['skyblue', 'blue', 'orange', 'green', 'lightcoral', 'red', 'gold', 'purple']

f1_scores = comparison_optimized['F1-Score'].values
roc_scores = comparison_optimized['ROC-AUC'].values
recall_scores = comparison_optimized['Recall'].values

# The first three panels share one layout; only the metric changes.
metric_panels = (
    (axes[0, 0], 'F1-Score', f1_scores),
    (axes[0, 1], 'ROC-AUC', roc_scores),
    (axes[1, 0], 'Recall', recall_scores),
)
for panel, metric, values in metric_panels:
    positions = range(len(values))
    panel.bar(positions, values, color=colors)
    panel.set_xticks(positions)
    panel.set_xticklabels(models_names, rotation=45, ha='right')
    panel.set_ylabel(metric)
    panel.set_title(f'{metric}: Originales vs Optimizados')
    panel.grid(alpha=0.3, axis='y')

# Dashed reference line at the best F1-Score, drawn on the F1 panel only.
axes[0, 0].axhline(y=comparison_optimized['F1-Score'].max(), color='green', linestyle='--', alpha=0.5)

# F1 in cross-validation vs. test; models without a CV score are plotted as 0.
cv_panel = axes[1, 1]
cv_f1 = comparison_optimized['CV_F1'].fillna(0).values
test_f1 = comparison_optimized['F1-Score'].values
x_pos = np.arange(len(models_names))
width = 0.35
cv_panel.bar(x_pos - width/2, cv_f1, width, label='F1-Score CV', alpha=0.8)
cv_panel.bar(x_pos + width/2, test_f1, width, label='F1-Score Test', alpha=0.8)
cv_panel.set_xticks(x_pos)
cv_panel.set_xticklabels(models_names, rotation=45, ha='right')
cv_panel.set_ylabel('F1-Score')
cv_panel.set_title('F1-Score: Validacion Cruzada vs Test')
cv_panel.legend()
cv_panel.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

In [None]:
# Text summary of what hyperparameter optimisation changed for each model family.
separator = "=" * 80
print("\nRESUMEN DE MEJORAS CON VALIDACION CRUZADA")
print(separator)

# (display name, original baseline, tuned baseline, tuned PCA, tuned UMAP)
summary_sections = (
    ("Random Forest", baseline_rf, rf_optimized_baseline, rf_optimized_pca, rf_optimized_umap),
    ("XGBoost", baseline_xgb, xgb_optimized_baseline, xgb_optimized_pca, xgb_optimized_umap),
)
for model_name, original, tuned, tuned_pca, tuned_umap in summary_sections:
    print(f"\n{model_name}:")
    print(f"  Baseline Original:  F1={original['F1-Score']:.4f}")
    print(f"  Baseline Optimizado: F1={tuned['F1-Score']:.4f} (CV: {tuned['CV_F1']:.4f})")
    # "Mejora" is the absolute test-F1 difference times 100, i.e. percentage points.
    f1_gain = (tuned['F1-Score'] - original['F1-Score']) * 100
    print(f"  Mejora: {f1_gain:+.2f}%")
    print(f"\n  PCA Optimizado:     F1={tuned_pca['F1-Score']:.4f} (CV: {tuned_pca['CV_F1']:.4f})")
    print(f"  UMAP Optimizado:    F1={tuned_umap['F1-Score']:.4f} (CV: {tuned_umap['CV_F1']:.4f})")

print("\n" + separator)
print(f"MEJOR CONFIGURACION GLOBAL: {best_model_idx}")
for metric in ('F1-Score', 'ROC-AUC', 'Recall'):
    print(f"{metric}: {comparison_optimized.loc[best_model_idx, metric]:.4f}")
print(separator)

## Conclusiones sobre Optimizacion

**Resultados de la optimizacion con validacion cruzada:**

1. **Mejoras obtenidas:** La optimizacion de hiperparametros mejora el F1-Score en ambos modelos, especialmente en configuraciones con reduccion dimensional donde los hiperparametros por defecto no eran optimos.

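2. **Validacion cruzada:** Cuando el F1-Score de validacion cruzada (CV) es consistente con el resultado en test, indica que el modelo no esta sobreajustado; conviene verificarlo en la tabla comparativa anterior. Nota: en las variantes PCA y UMAP la reduccion dimensional se ajusta sobre todo el conjunto de entrenamiento antes de la CV, por lo que su F1 de CV puede ser ligeramente optimista.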

3. **Comparacion de configuraciones:**
   - Baseline optimizado: Mejor rendimiento absoluto
   - PCA optimizado: Reduccion dimensional con minima perdida de rendimiento
   - UMAP optimizado: Mejor balance entre reduccion y rendimiento

4. **Recomendacion final:** El mejor modelo depende del caso de uso:
   - Maximo rendimiento predictivo (F1-Score): Baseline optimizado
   - Recursos limitados: UMAP optimizado
   - Interpretabilidad: PCA optimizado