In [7]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# 1. Load the dataset
df = pd.read_csv("../01_generacion_datos/clientes_sinteticos.csv")

# Predictor columns used by both regression models, and the column to predict.
columnas_predictoras = [
    "presupuesto_mensual",
    "visitas_web_mensuales",
    "interacciones_redes_mensuales",
    "num_empleados",
    "satisfacción_cliente",
    "duración_interacción_meses",
]
columna_objetivo = "monto_ventas_futuras"

X = df[columnas_predictoras]
y = df[columna_objetivo]

# 2. Train/test split: 20% of the rows are held out; random_state is pinned to 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 3. Linear regression model
# Pipeline: standardise the features, then fit a LinearRegression on the training split.
pasos_lineal = [
    ("scaler", StandardScaler()),
    ("reg", LinearRegression()),
]
modelo_lineal = Pipeline(pasos_lineal)
modelo_lineal.fit(X_train, y_train)
y_pred_lineal = modelo_lineal.predict(X_test)

# Metrics on the held-out test split; RMSE is derived from the MSE computed once here.
mse_lineal = mean_squared_error(y_test, y_pred_lineal)
rmse_lineal = np.sqrt(mse_lineal)
r2_lineal = r2_score(y_test, y_pred_lineal)

print("Resultados - Regresión Lineal:")
print(f"MSE:  {mse_lineal:.2f}")
print(f"RMSE: {rmse_lineal:.2f}")
print(f"R²:   {r2_lineal:.4f}")

# 5-fold cross-validated R². Note this runs on the full X, y, not only the training split.
scores_lineal = cross_val_score(modelo_lineal, X, y, cv=5, scoring="r2")
print(f"Validación cruzada R²: {scores_lineal.mean():.4f} ± {scores_lineal.std():.4f}\n")

# 4. Polynomial regression model (degree 2)
# Pipeline: standardise the features, expand them to degree-2 polynomial terms
# (no bias column), then fit a LinearRegression on the expanded features.
pasos_poly = [
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("reg", LinearRegression()),
]
modelo_poly = Pipeline(pasos_poly)
modelo_poly.fit(X_train, y_train)
y_pred_poly = modelo_poly.predict(X_test)

# Metrics on the held-out test split; RMSE is derived from the MSE computed once here.
mse_poly = mean_squared_error(y_test, y_pred_poly)
rmse_poly = np.sqrt(mse_poly)
r2_poly = r2_score(y_test, y_pred_poly)

print("Resultados - Regresión Polinómica (Grado 2):")
print(f"MSE:  {mse_poly:.2f}")
print(f"RMSE: {rmse_poly:.2f}")
print(f"R²:   {r2_poly:.4f}")

# 5-fold cross-validated R². Note this runs on the full X, y, not only the training split.
scores_poly = cross_val_score(modelo_poly, X, y, cv=5, scoring="r2")
print(f"Validación cruzada R²: {scores_poly.mean():.4f} ± {scores_poly.std():.4f}")

# 5. Persist both fitted pipelines to disk
# Maps each output path to the pipeline stored there. The polynomial pipeline is
# written to "modelo_regresion.pkl"; the file names are kept exactly as they were.
rutas_modelos = {
    "../04_modelado_regresion/modelo_lineal.pkl": modelo_lineal,
    "../04_modelado_regresion/modelo_regresion.pkl": modelo_poly,
}
for ruta_modelo, modelo_a_guardar in rutas_modelos.items():
    # Create the target directory first so the dump does not fail with
    # FileNotFoundError when the folder has not been created yet.
    os.makedirs(os.path.dirname(ruta_modelo), exist_ok=True)
    joblib.dump(modelo_a_guardar, ruta_modelo)

# 6. Side-by-side comparison of both models
print("\n=======================")
print("COMPARACIÓN DE MODELOS")
print("=======================")

# One entry per model: (label, test-split predictions, cross-validation R² scores).
resultados_modelos = [
    ("Lineal", y_pred_lineal, scores_lineal),
    ("Polinómico (Grado 2)", y_pred_poly, scores_poly),
]

filas_comparacion = []
for nombre_modelo, predicciones, scores_cv in resultados_modelos:
    # MSE is evaluated once per model and reused for the RMSE column.
    mse_modelo = mean_squared_error(y_test, predicciones)
    filas_comparacion.append({
        "Modelo": nombre_modelo,
        "MSE": mse_modelo,
        "RMSE": np.sqrt(mse_modelo),
        "R2": r2_score(y_test, predicciones),
        "CV R2 (prom)": scores_cv.mean(),
        "CV R2 (std)": scores_cv.std(),
    })

# Same columns, order and values as before: one row per model.
comparacion = pd.DataFrame(filas_comparacion)

# to_string() renders the full table without wrapping at the display width,
# so every metric of a model stays on a single row of the printed output.
print(comparacion.round(4).to_string())



Resultados - Regresión Lineal:
MSE:  419863277.96
RMSE: 20490.57
R²:   0.3258
Validación cruzada R²: 0.3171 ± 0.0585

Resultados - Regresión Polinómica (Grado 2):
MSE:  372731356.94
RMSE: 19306.25
R²:   0.4015
Validación cruzada R²: 0.3685 ± 0.0573

COMPARACIÓN DE MODELOS
                 Modelo           MSE        RMSE      R2  CV R2 (prom)  \
0                Lineal  4.198633e+08  20490.5656  0.3258        0.3171   
1  Polinómico (Grado 2)  3.727314e+08  19306.2518  0.4015        0.3685   

   CV R2 (std)  
0       0.0585  
1       0.0573  
