# Entrenamiento y Evaluación de Modelos de Regresión
Este notebook contiene el entrenamiento y evaluación de modelos de machine learning para regresión.

## Modelos a evaluar:
1. **Regresión Lineal Multivariable** (LinearRegression)
2. **Support Vector Regression (SVR)**
3. **XGBoost Regressor**

## Objetivos:
- Entrenar modelos con diferentes hiperparámetros
- Evaluar rendimiento con métricas de regresión
- Analizar importancia de características
- Comparar modelos y seleccionar el mejor

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, validation_curve
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.inspection import permutation_importance
import xgboost as xgb
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Global plotting defaults shared by every figure produced below.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

print("Librerías importadas correctamente")

## 1. Carga y Preparación de Datos

In [None]:
# Load the dataset.
# NOTE: replace 'tu_dataset.csv' with the real path to your file.
file_path = 'tu_dataset.csv'

try:
    df = pd.read_csv(file_path)
    print(f"Dataset cargado exitosamente: {df.shape[0]} filas y {df.shape[1]} columnas")
except FileNotFoundError:
    print("Archivo no encontrado. Utilizando dataset de ejemplo.")
    # Build a synthetic demonstration dataset with 10 features.
    # The seed is fixed, so the generated values depend on the exact order
    # of the random draws below.
    np.random.seed(42)
    n_samples = 1000
    df = pd.DataFrame({
        'feature_1': np.random.normal(50, 15, n_samples),
        'feature_2': np.random.exponential(2, n_samples),
        'feature_3': np.random.uniform(0, 100, n_samples),
        'feature_4': np.random.gamma(2, 2, n_samples),
        'feature_5': np.random.beta(2, 5, n_samples) * 100,
        'feature_6': np.random.lognormal(1, 0.5, n_samples),
        'feature_7': np.random.weibull(1.5, n_samples) * 50,
        'feature_8': np.random.poisson(5, n_samples),
        'feature_9': np.random.triangular(0, 50, 100, n_samples),
        'feature_10': np.random.pareto(3, n_samples) * 10,
        'target': np.random.normal(75, 20, n_samples)
    })
    # Overwrite the placeholder target with a noisy linear combination of
    # features 1, 3, 4, 6, 7 and 9; the remaining features carry no signal.
    df['target'] = (0.25 * df['feature_1'] + 0.15 * df['feature_3'] + 0.1 * df['feature_4'] + 
                   0.08 * df['feature_6'] + 0.12 * df['feature_7'] + 0.05 * df['feature_9'] + 
                   np.random.normal(0, 10, n_samples))
    print(f"Dataset de ejemplo creado: {df.shape[0]} filas y {df.shape[1]} columnas")

# Check that the 'target' column exists. This only prints a warning;
# it does not stop execution when the column is missing.
if 'target' not in df.columns:
    print("⚠️ ADVERTENCIA: No se encontró la columna 'target'. Por favor, especifica cuál es tu variable objetivo.")
    print(f"Columnas disponibles: {list(df.columns)}")
else:
    print(f"✅ Variable objetivo 'target' encontrada")

In [None]:
# Data preparation: every column except 'target' is a feature.
feature_cols = [name for name in df.columns if name != 'target']
X, y = df[feature_cols], df['target']

print(f"Características seleccionadas: {feature_cols}")
print(f"Forma de X: {X.shape}")
print(f"Forma de y: {y.shape}")

# Count missing cells on each side before deciding whether to clean.
missing_X = X.isnull().sum().sum()
missing_y = y.isnull().sum()
print(f"\nValores faltantes en X: {missing_X}")
print(f"Valores faltantes en y: {missing_y}")

# Keep only the rows that are complete in both X and y.
if missing_X > 0 or missing_y > 0:
    mask = X.notnull().all(axis=1) & y.notnull()
    X = X[mask]
    y = y[mask]
    print(f"Después de limpiar datos: X={X.shape}, y={y.shape}")

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=None)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Conjunto de entrenamiento: {n_train} muestras")
print(f"Conjunto de prueba: {n_test} muestras")
print(f"Proporción train/test: {n_train/n_test:.1f}")

# Compare the target distribution across both partitions.
print(f"\nEstadísticas de la variable objetivo:")
print(f"Train - Media: {y_train.mean():.2f}, Std: {y_train.std():.2f}")
print(f"Test  - Media: {y_test.mean():.2f}, Std: {y_test.std():.2f}")

## 2. Escalado de Características

In [None]:
# Available scalers, keyed by display name; None means "leave the data unscaled".
scalers = dict(
    StandardScaler=StandardScaler(),
    RobustScaler=RobustScaler(),
    NoScaler=None,
)

# Helper that applies a scaler to a train/test pair
def apply_scaling(scaler, X_train, X_test):
    """Scale a train/test pair with ``scaler``.

    The scaler is fitted on ``X_train`` only and then applied to ``X_test``.
    When ``scaler`` is None both inputs are returned unchanged.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)``.
    """
    if scaler is not None:
        return scaler.fit_transform(X_train), scaler.transform(X_test)
    return X_train, X_test

print("Escaladores preparados:")
# Iterating the dict yields its keys, i.e. the scaler display names.
for scaler_name in scalers:
    print(f"  • {scaler_name}")

## 3. Definición de Métricas de Evaluación

In [None]:
def calculate_metrics(y_true, y_pred, n_features=None):
    """Compute several regression metrics for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, aligned with ``y_true``.
        n_features: Number of predictors used by the model; needed for the
            adjusted R². When omitted, the column count of the notebook-level
            ``X_train`` is used, which matches the previous behaviour.

    Returns:
        dict with keys 'RMSE', 'MAE', 'R²', 'MAPE' (expressed in percent) and
        'Adjusted_R²'. 'Adjusted_R²' is NaN when there are too few samples
        (n - n_features - 1 <= 0) for the correction to be defined.
    """
    if n_features is None:
        # Backward-compatible fallback to the global training matrix.
        n_features = X_train.shape[1]

    n_obs = len(y_true)
    r2 = r2_score(y_true, y_pred)

    # Adjusted R² penalises R² by the residual degrees of freedom.
    dof = n_obs - n_features - 1
    adjusted_r2 = 1 - (1 - r2) * (n_obs - 1) / dof if dof > 0 else np.nan

    return {
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R²': r2,
        'MAPE': mean_absolute_percentage_error(y_true, y_pred) * 100,
        'Adjusted_R²': adjusted_r2,
    }

def print_metrics(metrics, title="Métricas"):
    """Print every metric on its own line under a titled header.

    'MAPE' is rendered with two decimals followed by a percent sign; every
    other metric is rendered with four decimals.
    """
    print(f"\n=== {title} ===")
    for name, value in metrics.items():
        rendered = f"{value:.2f}%" if name == 'MAPE' else f"{value:.4f}"
        print(f"{name}: {rendered}")

print("Funciones de métricas definidas")

## 4. Modelo 1: Regresión Lineal con Regularización

In [None]:
print("🔍 ENTRENANDO MODELOS DE REGRESIÓN LINEAL")

# Linear estimators to compare: plain least squares plus three regularised variants.
linear_models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(max_iter=2000),
    'ElasticNet': ElasticNet(max_iter=2000)
}

# Hyper-parameter grids; a model without an entry here is fitted as-is.
linear_params = {
    'Ridge': {'alpha': [0.1, 1, 10, 100, 1000]},
    'Lasso': {'alpha': [0.001, 0.01, 0.1, 1, 10]},
    'ElasticNet': {
        'alpha': [0.001, 0.01, 0.1, 1],
        'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
    }
}

# Results for each linear model, keyed by model name.
linear_results = {}

# The linear models are trained on standardised features.
scaler = StandardScaler()
X_train_scaled, X_test_scaled = apply_scaling(scaler, X_train, X_test)

for model_name, model in linear_models.items():
    print(f"\n🔧 Entrenando {model_name}...")

    if model_name in linear_params:
        # Grid search with 5-fold cross-validation, selecting on (negated) MSE.
        grid_search = GridSearchCV(
            model, linear_params[model_name],
            cv=5, scoring='neg_mean_squared_error',
            n_jobs=-1
        )
        grid_search.fit(X_train_scaled, y_train)
        best_model = grid_search.best_estimator_
        print(f"  Mejores hiperparámetros: {grid_search.best_params_}")
    else:
        # No grid defined for this model: fit it directly.
        best_model = model
        best_model.fit(X_train_scaled, y_train)

    # Predictions on both partitions.
    y_train_pred = best_model.predict(X_train_scaled)
    y_test_pred = best_model.predict(X_test_scaled)

    # Metrics on both partitions.
    train_metrics = calculate_metrics(y_train, y_train_pred)
    test_metrics = calculate_metrics(y_test, y_test_pred)

    # Cross-validation. The scores are negated MSE values, one per fold.
    cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
    cv_rmse = np.sqrt(-cv_scores.mean())
    # Spread must be measured on per-fold RMSE; the square root of the MSE
    # standard deviation is not in RMSE units and is not a standard deviation.
    cv_rmse_std = np.sqrt(-cv_scores).std()

    linear_results[model_name] = {
        'model': best_model,
        'train_metrics': train_metrics,
        'test_metrics': test_metrics,
        'cv_rmse': cv_rmse,
        'cv_rmse_std': cv_rmse_std,
        'predictions': {'train': y_train_pred, 'test': y_test_pred}
    }

    print_metrics(train_metrics, "Entrenamiento")
    print_metrics(test_metrics, "Prueba")
    print(f"CV RMSE: {cv_rmse:.4f} (±{cv_rmse_std:.4f})")

print("\n✅ Modelos lineales entrenados correctamente")

## 5. Modelo 2: Support Vector Regression (SVR)

In [None]:
print("🔍 ENTRENANDO SUPPORT VECTOR REGRESSION")

# Base hyper-parameter grid shared by all kernels.
svr_params = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'epsilon': [0.01, 0.1, 0.2, 0.5]
}

# Kernels to evaluate; results are stored under 'SVR_<kernel>'.
svr_kernels = ['linear', 'rbf', 'poly']
svr_results = {}

# SVR is trained on RobustScaler output.
robust_scaler = RobustScaler()
X_train_robust, X_test_robust = apply_scaling(robust_scaler, X_train, X_test)

for kernel in svr_kernels:
    print(f"\n🔧 Entrenando SVR con kernel {kernel}...")

    svr_model = SVR(kernel=kernel)

    # Adapt the grid to the kernel, working on a copy so the base grid stays intact.
    current_params = svr_params.copy()
    if kernel == 'linear':
        current_params.pop('gamma')  # gamma is not searched for the linear kernel
    elif kernel == 'poly':
        current_params['degree'] = [2, 3, 4]  # also search the polynomial degree

    # Grid search with 5-fold cross-validation, selecting on (negated) MSE.
    grid_search = GridSearchCV(
        svr_model, current_params,
        cv=5, scoring='neg_mean_squared_error',
        n_jobs=-1, verbose=0
    )

    grid_search.fit(X_train_robust, y_train)
    best_svr = grid_search.best_estimator_

    print(f"  Mejores hiperparámetros: {grid_search.best_params_}")

    # Predictions on both partitions.
    y_train_pred = best_svr.predict(X_train_robust)
    y_test_pred = best_svr.predict(X_test_robust)

    # Metrics on both partitions.
    train_metrics = calculate_metrics(y_train, y_train_pred)
    test_metrics = calculate_metrics(y_test, y_test_pred)

    # Cross-validation. The scores are negated MSE values, one per fold.
    cv_scores = cross_val_score(best_svr, X_train_robust, y_train, cv=5, scoring='neg_mean_squared_error')
    cv_rmse = np.sqrt(-cv_scores.mean())
    # Spread must be measured on per-fold RMSE; the square root of the MSE
    # standard deviation is not in RMSE units and is not a standard deviation.
    cv_rmse_std = np.sqrt(-cv_scores).std()

    svr_results[f'SVR_{kernel}'] = {
        'model': best_svr,
        'train_metrics': train_metrics,
        'test_metrics': test_metrics,
        'cv_rmse': cv_rmse,
        'cv_rmse_std': cv_rmse_std,
        'predictions': {'train': y_train_pred, 'test': y_test_pred}
    }

    print_metrics(train_metrics, "Entrenamiento")
    print_metrics(test_metrics, "Prueba")
    print(f"CV RMSE: {cv_rmse:.4f} (±{cv_rmse_std:.4f})")

print("\n✅ Modelos SVR entrenados correctamente")

## 6. Modelo 3: XGBoost Regressor

In [None]:
print("🔍 ENTRENANDO XGBOOST REGRESSOR")

# Hyper-parameter space sampled by the randomised search below.
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Base XGBoost estimator.
xgb_model = xgb.XGBRegressor(
    random_state=42,
    verbosity=0,
    eval_metric='rmse'
)

# XGBoost is trained on the original, unscaled features.
print("🔧 Entrenando XGBoost (puede tardar varios minutos)...")

# A randomised search keeps the compute cost below that of the full grid.
from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(
    xgb_model, xgb_params,
    n_iter=50,  # number of sampled parameter combinations
    cv=5, scoring='neg_mean_squared_error',
    n_jobs=-1, random_state=42,
    verbose=0
)

random_search.fit(X_train, y_train)
best_xgb = random_search.best_estimator_

print(f"  Mejores hiperparámetros: {random_search.best_params_}")

# Predictions on both partitions.
y_train_pred = best_xgb.predict(X_train)
y_test_pred = best_xgb.predict(X_test)

# Metrics on both partitions.
train_metrics = calculate_metrics(y_train, y_train_pred)
test_metrics = calculate_metrics(y_test, y_test_pred)

# Cross-validation. The scores are negated MSE values, one per fold.
cv_scores = cross_val_score(best_xgb, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores.mean())
# Spread must be measured on per-fold RMSE; the square root of the MSE
# standard deviation is not in RMSE units and is not a standard deviation.
cv_rmse_std = np.sqrt(-cv_scores).std()

xgb_results = {
    'model': best_xgb,
    'train_metrics': train_metrics,
    'test_metrics': test_metrics,
    'cv_rmse': cv_rmse,
    'cv_rmse_std': cv_rmse_std,
    'predictions': {'train': y_train_pred, 'test': y_test_pred}
}

print_metrics(train_metrics, "Entrenamiento")
print_metrics(test_metrics, "Prueba")
print(f"CV RMSE: {cv_rmse:.4f} (±{cv_rmse_std:.4f})")

print("\n✅ Modelo XGBoost entrenado correctamente")

## 7. Comparación de Modelos

In [None]:
# Merge the results of every model family into a single mapping.
all_results = dict(linear_results)
all_results.update(svr_results)
all_results['XGBoost'] = xgb_results

# Build one comparison row per model.
comparison_data = []
for model_name, results in all_results.items():
    train_m, test_m = results['train_metrics'], results['test_metrics']
    comparison_data.append({
        'Modelo': model_name,
        'Train_RMSE': train_m['RMSE'],
        'Test_RMSE': test_m['RMSE'],
        'Train_R²': train_m['R²'],
        'Test_R²': test_m['R²'],
        'Train_MAE': train_m['MAE'],
        'Test_MAE': test_m['MAE'],
        'CV_RMSE': results['cv_rmse'],
        'CV_RMSE_Std': results['cv_rmse_std'],
        # Train RMSE minus test RMSE.
        'Overfitting': train_m['RMSE'] - test_m['RMSE']
    })

# Rank the models by test R², best first.
comparison_df = pd.DataFrame(comparison_data).sort_values('Test_R²', ascending=False)

print("=== COMPARACIÓN DE MODELOS ===")
display(comparison_df.round(4))

# The first row after sorting is the best model.
best_row = comparison_df.iloc[0]
best_model_name = best_row['Modelo']
print(f"\n🏆 MEJOR MODELO: {best_model_name}")
print(f"   Test R²: {best_row['Test_R²']:.4f}")
print(f"   Test RMSE: {best_row['Test_RMSE']:.4f}")
print(f"   CV RMSE: {best_row['CV_RMSE']:.4f} (±{best_row['CV_RMSE_Std']:.4f})")

In [None]:
# Visual comparison of all trained models: one bar per model in each panel.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
x_pos = np.arange(len(comparison_df))
model_labels = comparison_df['Modelo']


def _finish_model_axis(ax, ylabel, title):
    """Apply the axis labels, model tick labels and grid shared by all four panels."""
    ax.set_xlabel('Modelos')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(model_labels, rotation=45)
    ax.grid(True, alpha=0.3)


# R² on the test set
ax1 = axes[0, 0]
ax1.bar(x_pos, comparison_df['Test_R²'], alpha=0.7, color='lightblue')
_finish_model_axis(ax1, 'R² Score', 'R² Score en Test Set')

# RMSE on the test set
ax2 = axes[0, 1]
ax2.bar(x_pos, comparison_df['Test_RMSE'], alpha=0.7, color='lightcoral')
_finish_model_axis(ax2, 'RMSE', 'RMSE en Test Set')

# Overfitting panel. The column holds Train RMSE - Test RMSE, so a model that
# does worse on test than on train (overfitting) has a NEGATIVE value.
# Those bars are the ones highlighted in red.
ax3 = axes[1, 0]
colors = ['red' if gap < 0 else 'green' for gap in comparison_df['Overfitting']]
ax3.bar(x_pos, comparison_df['Overfitting'], alpha=0.7, color=colors)
_finish_model_axis(ax3, 'Overfitting (Train RMSE - Test RMSE)', 'Análisis de Overfitting')
ax3.axhline(y=0, color='black', linestyle='--', alpha=0.5)

# Cross-validation RMSE with error bars
ax4 = axes[1, 1]
ax4.bar(x_pos, comparison_df['CV_RMSE'], alpha=0.7, color='lightgreen',
        yerr=comparison_df['CV_RMSE_Std'], capsize=5)
_finish_model_axis(ax4, 'CV RMSE', 'Cross-Validation RMSE')

plt.tight_layout()
plt.show()

## 8. Análisis de Importancia de Características

In [None]:
print("🔍 ANALIZANDO IMPORTANCIA DE CARACTERÍSTICAS")

# Helper that extracts a per-feature importance vector from a fitted model
def get_feature_importance(model, model_name, X_test, y_test):
    """Return a 1-D importance vector for ``model`` and the method used.

    The source of the importance is chosen in this order:
      1. ``model.feature_importances_`` when the attribute exists.
      2. The absolute value of ``model.coef_`` when the attribute exists.
      3. Permutation importance on ``(X_test, y_test)`` with 10 repeats,
         a fixed seed and negated MSE as the score.

    Args:
        model: Fitted estimator.
        model_name: Accepted for interface compatibility; not used here.
        X_test: Features used only by the permutation fallback.
        y_test: Targets used only by the permutation fallback.

    Returns:
        Tuple ``(importance, method)`` where ``importance`` is a flat array
        and ``method`` is a human-readable label.
    """
    if hasattr(model, 'feature_importances_'):
        importance = np.ravel(model.feature_importances_)
        method = 'Built-in Feature Importance'

    elif hasattr(model, 'coef_'):
        # coef_ may be 2-D (e.g. shape (1, n_features)); flatten it so callers
        # always receive one value per feature.
        importance = np.abs(np.ravel(model.coef_))
        method = 'Absolute Coefficients'

    else:
        perm_importance = permutation_importance(
            model, X_test, y_test,
            n_repeats=10, random_state=42,
            scoring='neg_mean_squared_error'
        )
        importance = perm_importance.importances_mean
        method = 'Permutation Importance'

    return importance, method

# Analyse feature importance for the three best-ranked models.
top_3_models = comparison_df.head(3)
importance_results = {}

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for i, (_, row) in enumerate(top_3_models.iterrows()):
    model_name = row['Modelo']
    model_obj = all_results[model_name]['model']

    # Give each model the test data in the same representation it was trained on.
    if model_name.startswith('SVR'):
        X_test_prepared = robust_scaler.transform(X_test)
    elif model_name in ['LinearRegression', 'Ridge', 'Lasso', 'ElasticNet']:
        X_test_prepared = scaler.transform(X_test)
    else:  # XGBoost
        X_test_prepared = X_test

    importance, method = get_feature_importance(model_obj, model_name, X_test_prepared, y_test)

    # Some estimators expose 2-D coefficients; the DataFrame below needs one
    # value per feature.
    importance = np.ravel(importance)

    # Normalise so the values sum to 1. If everything is zero (e.g. all
    # coefficients shrunk to zero) keep the zeros instead of producing NaN.
    total = importance.sum()
    importance_normalized = importance / total if total != 0 else importance

    # Sorted ascending so the most important feature ends up at the top of the barh plot.
    importance_df = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': importance_normalized
    }).sort_values('Importance', ascending=True)

    importance_results[model_name] = importance_df

    # Horizontal bar chart for this model.
    ax = axes[i]
    colors = plt.cm.viridis(np.linspace(0, 1, len(importance_df)))
    bars = ax.barh(importance_df['Feature'], importance_df['Importance'], color=colors)
    ax.set_title(f'{model_name}\n({method})')
    ax.set_xlabel('Importancia Normalizada')
    ax.grid(True, alpha=0.3)

    # Annotate each bar with its value (separate name to avoid clobbering `importance`).
    for bar, bar_value in zip(bars, importance_df['Importance']):
        ax.text(bar.get_width() + 0.005, bar.get_y() + bar.get_height()/2,
                f'{bar_value:.3f}', ha='left', va='center', fontsize=9)

plt.tight_layout()
plt.show()

# Print the feature ranking of each analysed model, most important first.
print("\n=== RANKING DE IMPORTANCIA DE CARACTERÍSTICAS ===")
for model_name, imp_df in importance_results.items():
    print(f"\n{model_name}:")
    for i, (_, row) in enumerate(imp_df.sort_values('Importance', ascending=False).iterrows(), 1):
        print(f"  {i}. {row['Feature']}: {row['Importance']:.4f}")

## 9. Análisis de Residuos del Mejor Modelo

In [None]:
# Pull the winning model and its stored predictions for a detailed look.
best_model_results = all_results[best_model_name]
best_model = best_model_results['model']
stored_predictions = best_model_results['predictions']
y_train_pred = stored_predictions['train']
y_test_pred = stored_predictions['test']

print(f"🔍 ANÁLISIS DE RESIDUOS - {best_model_name}")

# Residual = observed value minus prediction.
train_residuals = y_train - y_train_pred
test_residuals = y_test - y_test_pred

fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Panels 1-2: predictions against actual values, with the identity line as reference.
scatter_panels = (
    (axes[0, 0], y_train, y_train_pred, 'Train: Predicciones vs Reales', {}),
    (axes[0, 1], y_test, y_test_pred, 'Test: Predicciones vs Reales', {'color': 'orange'}),
)
for panel, actual, predicted, panel_title, extra_style in scatter_panels:
    panel.scatter(actual, predicted, alpha=0.6, s=20, **extra_style)
    lower = min(actual.min(), predicted.min())
    upper = max(actual.max(), predicted.max())
    panel.plot([lower, upper], [lower, upper], 'r--', linewidth=2)
    panel.set_xlabel('Valores Reales')
    panel.set_ylabel('Predicciones')
    panel.set_title(panel_title)
    panel.grid(True, alpha=0.3)

# Panel 3: residuals against predictions on the test set.
residual_panel = axes[0, 2]
residual_panel.scatter(y_test_pred, test_residuals, alpha=0.6, s=20, color='green')
residual_panel.axhline(y=0, color='r', linestyle='--')
residual_panel.set_xlabel('Predicciones')
residual_panel.set_ylabel('Residuos')
residual_panel.set_title('Residuos vs Predicciones (Test)')
residual_panel.grid(True, alpha=0.3)

# Panels 4-5: residual histograms with the mean marked.
hist_panels = (
    (axes[1, 0], train_residuals, 'skyblue', 'Distribución de Residuos (Train)'),
    (axes[1, 1], test_residuals, 'orange', 'Distribución de Residuos (Test)'),
)
for panel, residuals, fill, panel_title in hist_panels:
    panel.hist(residuals, bins=30, alpha=0.7, density=True, color=fill)
    panel.axvline(residuals.mean(), color='red', linestyle='--', label=f'Media: {residuals.mean():.3f}')
    panel.set_xlabel('Residuos')
    panel.set_ylabel('Densidad')
    panel.set_title(panel_title)
    panel.legend()
    panel.grid(True, alpha=0.3)

# Panel 6: normal Q-Q plot of the test residuals.
qq_panel = axes[1, 2]
stats.probplot(test_residuals, dist="norm", plot=qq_panel)
qq_panel.set_title('Q-Q Plot de Residuos (Test)')
qq_panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics of the residuals for each partition.
print(f"\n=== ESTADÍSTICAS DE RESIDUOS ===")
for header, residuals in (("Train:", train_residuals), ("\nTest:", test_residuals)):
    print(header)
    print(f"  Media: {residuals.mean():.6f}")
    print(f"  Std: {residuals.std():.4f}")
    print(f"  Skewness: {stats.skew(residuals):.4f}")
    print(f"  Kurtosis: {stats.kurtosis(residuals):.4f}")

# Shapiro-Wilk normality test on at most the first 1000 test residuals.
shapiro_sample = test_residuals if len(test_residuals) <= 1000 else test_residuals[:1000]
_, p_value = stats.shapiro(shapiro_sample)
print(f"\nTest de Normalidad (Shapiro-Wilk): p-value = {p_value:.6f}")
print(f"Residuos son normales: {'Sí' if p_value > 0.05 else 'No'}")

## 10. Curvas de Aprendizaje y Validación

In [None]:
from sklearn.model_selection import learning_curve

print(f"🔍 GENERANDO CURVAS DE APRENDIZAJE PARA {best_model_name}")

# Feed the estimator the same feature representation it was trained on.
if best_model_name.startswith('SVR'):
    X_for_curves = robust_scaler.fit_transform(X_train)
elif best_model_name in ('LinearRegression', 'Ridge', 'Lasso', 'ElasticNet'):
    X_for_curves = scaler.fit_transform(X_train)
else:  # XGBoost
    X_for_curves = X_train

# 5-fold learning curve at ten training-set sizes from 10% to 100%.
train_sizes, train_scores, val_scores = learning_curve(
    best_model, X_for_curves, y_train,
    cv=5, n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='neg_mean_squared_error'
)

# Scores are negated MSE; convert to RMSE.
train_rmse, val_rmse = np.sqrt(-train_scores), np.sqrt(-val_scores)

# Aggregate across folds (axis 1) for each training size.
train_rmse_mean, train_rmse_std = train_rmse.mean(axis=1), train_rmse.std(axis=1)
val_rmse_mean, val_rmse_std = val_rmse.mean(axis=1), val_rmse.std(axis=1)

# Plot both curves with a ±1 std band.
plt.figure(figsize=(12, 8))
curves = (
    (train_rmse_mean, train_rmse_std, 'blue', 'Training RMSE'),
    (val_rmse_mean, val_rmse_std, 'red', 'Validation RMSE'),
)
for curve_mean, curve_std, curve_color, curve_label in curves:
    plt.plot(train_sizes, curve_mean, 'o-', color=curve_color, label=curve_label)
    plt.fill_between(train_sizes, curve_mean - curve_std,
                     curve_mean + curve_std, alpha=0.1, color=curve_color)

plt.xlabel('Tamaño del conjunto de entrenamiento')
plt.ylabel('RMSE')
plt.title(f'Curvas de Aprendizaje - {best_model_name}')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Diagnose using the gap at the largest training size.
final_gap = val_rmse_mean[-1] - train_rmse_mean[-1]
print(f"\n=== ANÁLISIS DE CURVAS DE APRENDIZAJE ===")
print(f"RMSE final en entrenamiento: {train_rmse_mean[-1]:.4f} (±{train_rmse_std[-1]:.4f})")
print(f"RMSE final en validación: {val_rmse_mean[-1]:.4f} (±{val_rmse_std[-1]:.4f})")
print(f"Gap final (Val - Train): {final_gap:.4f}")

# A gap above 10% of the training RMSE is flagged; a negative gap is also flagged.
if final_gap > 0.1 * train_rmse_mean[-1]:
    verdict = "⚠️ Posible overfitting detectado"
elif final_gap < 0:
    verdict = "⚠️ Posible underfitting o datos de validación más fáciles"
else:
    verdict = "✅ Buen balance entre sesgo y varianza"
print(verdict)

## 11. Optimización de Pesos para Maximizar Salida

In [None]:
from scipy.optimize import minimize, differential_evolution
from sklearn.preprocessing import MinMaxScaler

print("🎯 OPTIMIZACIÓN DE PESOS PARA MAXIMIZAR SALIDA")

# Objective used to maximise the model's prediction
def objective_function(weights, model, scaler=None, feature_names=None):
    """Return the negated model prediction for one candidate feature vector.

    The optimiser minimises, so negating the prediction turns the search into
    a maximisation of the model output.

    Args:
        weights: 1-D array with one value per feature.
        model: Object exposing ``predict`` on a 2-D array.
        scaler: Optional object exposing ``transform``; applied before
            predicting when given.
        feature_names: Accepted for interface compatibility; not used here.
    """
    candidate = weights.reshape(1, -1)
    if scaler is not None:
        candidate = scaler.transform(candidate)
    return -model.predict(candidate)[0]

# Search bounds: the observed [min, max] range of each feature.
feature_bounds = []
print("\nLímites de características basados en los datos:")
for feature in feature_cols:
    low, high = X[feature].min(), X[feature].max()
    feature_bounds.append((low, high))
    print(f"  {feature}: [{low:.2f}, {high:.2f}]")

# Run the optimisation against the best model.
print(f"\n🔧 Optimizando con {best_model_name}...")

# Pick the scaler that matches the model family (none for XGBoost).
if best_model_name.startswith('SVR'):
    optimization_scaler = RobustScaler().fit(X_train)
elif best_model_name in ('LinearRegression', 'Ridge', 'Lasso', 'ElasticNet'):
    optimization_scaler = StandardScaler().fit(X_train)
else:
    optimization_scaler = None

# Differential evolution copes with non-convex objectives.
result = differential_evolution(
    objective_function,
    feature_bounds,
    args=(best_model, optimization_scaler, feature_cols),
    seed=42,
    maxiter=1000,
    popsize=15
)

optimal_weights = result.x
optimal_prediction = -result.fun  # undo the negation applied by the objective

print(f"\n✅ OPTIMIZACIÓN COMPLETADA")
print(f"Valor máximo predicho: {optimal_prediction:.4f}")
print(f"Número de evaluaciones: {result.nfev}")
print(f"Éxito: {result.success}")

# Table with the optimum and where it falls inside each feature's distribution.
print(f"\n=== PESOS ÓPTIMOS PARA MAXIMIZAR SALIDA ===")
optimal_df = pd.DataFrame({
    'Característica': feature_cols,
    'Valor_Óptimo': optimal_weights,
    'Min_Datos': [low for low, _ in feature_bounds],
    'Max_Datos': [high for _, high in feature_bounds],
    'Percentil_en_Datos': [
        stats.percentileofscore(X[col], val)
        for col, val in zip(feature_cols, optimal_weights)
    ]
})

display(optimal_df.round(4))

# Compare against the rows of the dataset with the five largest targets.
print(f"\n=== COMPARACIÓN CON DATOS EXISTENTES ===")
top_5_indices = y.nlargest(5).index


def _predict_for_index(idx):
    """Predict the best model's output for the dataset row labelled ``idx``."""
    row_values = X.loc[idx].values.reshape(1, -1)
    if optimization_scaler is not None:
        row_values = optimization_scaler.transform(row_values)
    return best_model.predict(row_values)[0]


top_5_predictions = [_predict_for_index(idx) for idx in top_5_indices]

comparison_df = pd.DataFrame({
    'Tipo': ['Top 1 del dataset', 'Top 2 del dataset', 'Top 3 del dataset', 'Top 4 del dataset', 'Top 5 del dataset', 'PESOS ÓPTIMOS'],
    'Predicción': top_5_predictions + [optimal_prediction],
    'Target_Real': list(y.loc[top_5_indices]) + ['N/A']
})

display(comparison_df.round(4))

# Gain of the optimum over the best prediction among those five rows.
improvement = optimal_prediction - max(top_5_predictions)
print(f"\nMejora sobre el mejor caso del dataset: {improvement:.4f} ({improvement/max(top_5_predictions)*100:.2f}%)")

## 12. Visualización de Pesos Óptimos

In [None]:
# Visualise the optimal feature values found by the optimiser.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Per-feature data ranges, reused by panels 1 and 4.
mins = [X[col].min() for col in feature_cols]
maxs = [X[col].max() for col in feature_cols]
mins_arr = np.asarray(mins, dtype=float)
maxs_arr = np.asarray(maxs, dtype=float)
optimal_arr = np.asarray(optimal_weights, dtype=float)

# 1. Optimal values against the observed range of each feature
ax1 = axes[0, 0]
x_pos = np.arange(len(feature_cols))
ax1.bar(x_pos, optimal_arr, alpha=0.7, color='gold', label='Valores Óptimos')

# Whiskers reach from the optimum down to the data minimum and up to the
# data maximum. Clipped at zero because matplotlib rejects negative yerr and
# floating-point error can leave a tiny negative distance at a bound.
ax1.errorbar(x_pos, optimal_arr,
             yerr=[np.clip(optimal_arr - mins_arr, 0, None),
                   np.clip(maxs_arr - optimal_arr, 0, None)],
             fmt='none', color='red', alpha=0.5, capsize=5)

ax1.set_xlabel('Características')
ax1.set_ylabel('Valores')
ax1.set_title('Valores Óptimos vs Rangos de Datos')
ax1.set_xticks(x_pos)
ax1.set_xticklabels(feature_cols, rotation=45)
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2. Percentile of each optimal value within its feature's distribution
ax2 = axes[0, 1]
percentiles = optimal_df['Percentil_en_Datos']
colors = ['red' if p > 90 else 'orange' if p > 75 else 'yellow' if p > 50 else 'lightblue' for p in percentiles]
bars = ax2.bar(x_pos, percentiles, color=colors, alpha=0.7)
ax2.axhline(y=50, color='gray', linestyle='--', alpha=0.7, label='Mediana')
ax2.axhline(y=90, color='red', linestyle='--', alpha=0.7, label='Percentil 90')
ax2.set_xlabel('Características')
ax2.set_ylabel('Percentil')
ax2.set_title('Percentiles de Valores Óptimos')
ax2.set_xticks(x_pos)
ax2.set_xticklabels(feature_cols, rotation=45)
ax2.legend()
ax2.grid(True, alpha=0.3)

# Label each bar with its percentile.
for bar, p in zip(bars, percentiles):
    ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
             f'{p:.0f}', ha='center', va='bottom', fontsize=9)

# 3. Predictions for the top dataset rows against the optimum.
# Read from top_5_predictions directly: the name `comparison_df` is reused for
# two different tables in this notebook, so it is not a reliable source here.
ax3 = axes[1, 0]
comparison_values = list(top_5_predictions) + [optimal_prediction]
labels = ['Top 1', 'Top 2', 'Top 3', 'Top 4', 'Top 5', 'Óptimo']
colors = ['lightblue'] * 5 + ['gold']

bars = ax3.bar(labels, comparison_values, color=colors, alpha=0.7)
ax3.set_xlabel('Muestras')
ax3.set_ylabel('Predicción')
ax3.set_title('Comparación: Mejores Muestras vs Óptimo')
ax3.grid(True, alpha=0.3)

# Label each bar with its value.
for bar, val in zip(bars, comparison_values):
    ax3.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(comparison_values)*0.01,
             f'{val:.2f}', ha='center', va='bottom', fontsize=9)

# 4. Radar chart: optimum against the dataset mean
angles = np.linspace(0, 2*np.pi, len(feature_cols), endpoint=False).tolist()
angles += angles[:1]  # close the polygon

# Normalise every feature by ITS OWN observed range so that 0 is the data
# minimum and 1 the data maximum of that feature. Scaling across features
# would mix unrelated units and make the spokes incomparable.
span = maxs_arr - mins_arr
span[span == 0] = 1.0  # constant feature: avoid dividing by zero
optimal_normalized = np.clip((optimal_arr - mins_arr) / span, 0, 1).tolist()
mean_normalized = np.clip((X[feature_cols].mean().values - mins_arr) / span, 0, 1).tolist()

optimal_normalized += optimal_normalized[:1]
mean_normalized += mean_normalized[:1]

# Swap the cartesian placeholder for a polar axes in the same grid slot, so
# the two are not drawn on top of each other.
axes[1, 1].remove()
ax4 = fig.add_subplot(2, 2, 4, projection='polar')
ax4.plot(angles, optimal_normalized, 'o-', linewidth=2, label='Valores Óptimos', color='gold')
ax4.fill(angles, optimal_normalized, alpha=0.25, color='gold')
ax4.plot(angles, mean_normalized, 'o-', linewidth=2, label='Promedio Dataset', color='blue')
ax4.fill(angles, mean_normalized, alpha=0.25, color='blue')

ax4.set_xticks(angles[:-1])
ax4.set_xticklabels(feature_cols)
ax4.set_ylim(0, 1)
ax4.set_title('Comparación Radar: Óptimo vs Promedio', y=1.08)
ax4.legend(loc='upper right', bbox_to_anchor=(1.2, 1.0))

plt.tight_layout()
plt.show()

## 13. Resumen Final y Recomendaciones

In [None]:
# Final textual report assembled from the results computed in earlier cells.
print("="*80)
print("🎯 RESUMEN FINAL DEL PROYECTO DE MACHINE LEARNING")
print("="*80)

print(f"\n📊 DATASET:")
print(f"   • Tamaño: {X.shape[0]} muestras, {X.shape[1]} características")
print(f"   • Características: {', '.join(feature_cols)}")
print(f"   • Split: {X_train.shape[0]} train / {X_test.shape[0]} test")

print(f"\n🤖 MODELOS EVALUADOS:")
models_tested = len(all_results)
print(f"   • Total de modelos: {models_tested}")
print(f"   • Regresión Lineal: {len(linear_results)} variantes")
print(f"   • Support Vector Regression: {len(svr_results)} kernels")
print(f"   • XGBoost: 1 modelo con hiperparámetros optimizados")

print(f"\n🏆 MEJOR MODELO: {best_model_name}")
best_test_metrics = best_model_results['test_metrics']
print(f"   • R² Score: {best_test_metrics['R²']:.4f}")
print(f"   • RMSE: {best_test_metrics['RMSE']:.4f}")
print(f"   • MAE: {best_test_metrics['MAE']:.4f}")
print(f"   • MAPE: {best_test_metrics['MAPE']:.2f}%")
print(f"   • CV RMSE: {best_model_results['cv_rmse']:.4f} (±{best_model_results['cv_rmse_std']:.4f})")

print(f"\n🎯 CARACTERÍSTICAS MÁS IMPORTANTES:")
# Importance is only available for the models analysed in the importance cell.
if best_model_name in importance_results:
    top_features = importance_results[best_model_name].sort_values('Importance', ascending=False).head(3)
    for i, (_, row) in enumerate(top_features.iterrows(), 1):
        print(f"   {i}. {row['Feature']}: {row['Importance']:.4f}")

print(f"\n⚡ PESOS ÓPTIMOS PARA MAXIMIZACIÓN:")
print(f"   • Valor máximo predicho: {optimal_prediction:.4f}")
print(f"   • Mejora sobre mejor muestra: {improvement:.4f} ({improvement/max(top_5_predictions)*100:.2f}%)")
print(f"   • Características que deben maximizarse:")
# Features whose optimal value lies above the 75th percentile of the data.
high_percentile_features = optimal_df[optimal_df['Percentil_en_Datos'] > 75]
for _, row in high_percentile_features.iterrows():
    print(f"     - {row['Característica']}: {row['Valor_Óptimo']:.2f} (percentil {row['Percentil_en_Datos']:.0f})")

print(f"\n📈 ANÁLISIS DE CALIDAD DEL MODELO:")
# Generalisation gap = test RMSE - train RMSE. It is POSITIVE when the model
# does worse on unseen data than on the training data, i.e. when it overfits.
overfitting_score = best_test_metrics['RMSE'] - best_model_results['train_metrics']['RMSE']
if abs(overfitting_score) < 0.1 * best_test_metrics['RMSE']:
    print(f"   • ✅ Buen balance entre sesgo y varianza")
elif overfitting_score > 0:
    print(f"   • ⚠️ Ligero overfitting detectado (diferencia: {overfitting_score:.4f})")
else:
    print(f"   • ⚠️ Posible underfitting")

# Qualitative label for the fit based on test R².
if best_test_metrics['R²'] > 0.8:
    quality = "Excelente"
elif best_test_metrics['R²'] > 0.6:
    quality = "Bueno"
elif best_test_metrics['R²'] > 0.4:
    quality = "Regular"
else:
    quality = "Necesita mejora"

print(f"   • Calidad del ajuste: {quality} (R² = {best_test_metrics['R²']:.4f})")

print(f"\n🔬 RECOMENDACIONES:")
print(f"   • Para maximizar la salida, use los pesos óptimos encontrados")
print(f"   • Foque en las características de mayor importancia identificadas")

if best_test_metrics['R²'] < 0.8:
    print(f"   • Considere feature engineering adicional para mejorar el R²")
    print(f"   • Explore modelos más complejos (Neural Networks, Ensemble methods)")

# Extra advice whenever the train/test gap exceeds 10% of the test RMSE.
if abs(overfitting_score) > 0.1 * best_test_metrics['RMSE']:
    print(f"   • Considere técnicas de regularización adicionales")
    print(f"   • Aumente el tamaño del dataset de entrenamiento si es posible")

print(f"   • Use validación cruzada para decisiones de producción")
print(f"   • Monitoree el rendimiento del modelo en datos nuevos")

print(f"\n💾 ARCHIVOS GENERADOS:")
print(f"   • 01_exploratory_data_analysis.ipynb: Análisis exploratorio completo")
print(f"   • 02_model_training_evaluation.ipynb: Entrenamiento y evaluación de modelos")

print("\n" + "="*80)
print("✨ PROYECTO COMPLETADO EXITOSAMENTE ✨")
print("="*80)