In [1]:
# =========================================================
# 🔌 Cargar entorno y librerías
# =========================================================
%load_ext kedro.ipython

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# =========================================================
# 1️⃣ Load the dataset from the Kedro catalog
# =========================================================
# `catalog` is not defined in this notebook; it is expected to be injected by
# the kedro.ipython extension loaded at the top of the cell.
# The loaded object is copied so later edits never touch the catalog's instance.
df = catalog.load("Features_training_v2").copy()
print("Shape del dataset:", df.shape)

# =========================================================
# 2️⃣ Select the variables required by the hypothesis
# =========================================================
target = "TransactionAmount (INR)"
features = [
    "Monetary",
    "TimeSinceLastTxn",
    "Recency",
    "TxnCountInLast24Hours",
    "AmountZScoreByLocation",
]
# NOTE(review): judging only by their names, "AmountZScoreByLocation" and
# "Monetary" may be computed from the transaction amount itself. If so they
# leak the target into the predictors -- confirm how they are built upstream.

# Keep just the modelling columns and drop every row with a missing value in them.
df_model = df.loc[:, features + [target]].dropna()
X, y = df_model[features], df_model[target]

Shape del dataset: (984240, 6)


In [2]:
# =========================================================
# 3️⃣ Train/test split
# =========================================================
# Hold out 20% of the rows for the final evaluation; the fixed seed makes the
# partition reproducible across kernel restarts.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42, test_size=0.2)

# =========================================================
# 4️⃣ Define the Random Forest model + GridSearchCV
# =========================================================
# Hyper-parameter grid: 2 * 3 * 2 * 2 = 24 candidate configurations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Base estimator; seeded so the forests are reproducible.
rf = RandomForestRegressor(random_state=42)

# Shuffled 5-fold CV with a fixed seed -> 24 candidates * 5 folds = 120 fits.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search scored by R²; n_jobs=-1 runs the fits on all available cores.
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="r2",
    cv=cv,
    n_jobs=-1,
)

# =========================================================
# 5️⃣ Training with cross-validation
# =========================================================
# Runs the whole grid search on the training split only; `fit` returns the
# fitted search object itself, so rebinding the name is a no-op.
grid_rf = grid_rf.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated R².
print("Mejores parámetros:", grid_rf.best_params_)
print("R² medio (CV):", grid_rf.best_score_)

# =========================================================
# 6️⃣ Final evaluation
# =========================================================
# Best model found by the search, scored on the held-out test split.
best_rf = grid_rf.best_estimator_
y_pred = best_rf.predict(X_test)

# Error metrics; RMSE is derived from MSE so it is in the target's own units.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n===== Métricas del Modelo Random Forest =====")
print(f"R²: {r2:.3f}")
print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")

# =========================================================
# 7️⃣ Evaluation plots
# =========================================================

# 🎯 a) Actual vs predicted values
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.6, ax=ax)

# Diagonal y = x across the observed target range: a perfect model would put
# every point on this line.
ideal_range = [y_test.min(), y_test.max()]
ax.plot(ideal_range, ideal_range,
        color='red', linestyle='--', label='Línea ideal')

ax.set_title("Valores Reales vs Predichos – Random Forest")
ax.set_xlabel("Valores reales (y_test)")
ax.set_ylabel("Predicciones (y_pred)")
ax.legend()
ax.grid(True)
plt.show()

# 🎯 b) Residual distribution
# Residual = actual - predicted, so positive values are under-predictions.
residuals = y_test - y_pred

fig, ax = plt.subplots(figsize=(6, 5))
sns.histplot(residuals, bins=30, kde=True, ax=ax)
ax.set_title("Distribución de Residuos – Random Forest")
ax.set_xlabel("Error (y_real - y_pred)")
ax.set_ylabel("Frecuencia")
ax.grid(True)
plt.show()

# 🎯 c) Feature importance
# Importances reported by the fitted forest, paired with the feature names in
# the same order as the training columns and sorted from most to least important.
importances = best_rf.feature_importances_
imp_df = pd.DataFrame({"Variable": features, "Importancia": importances})
imp_df = imp_df.sort_values(by="Importancia", ascending=False)

plt.figure(figsize=(8,4))
# Passing `palette` without `hue` is deprecated in seaborn 0.13 (slated for
# removal in 0.14). Mapping hue to the same categorical column keeps the
# one-colour-per-bar look; dodge=False keeps the bars full-width and centred.
ax = sns.barplot(
    data=imp_df,
    x="Importancia",
    y="Variable",
    hue="Variable",
    palette="Greens_r",
    dodge=False,
)
# Some seaborn versions add a legend for the hue mapping; it only repeats the
# y-axis labels, so remove it when present.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()
plt.title("Importancia de Variables – Random Forest")
plt.xlabel("Importancia relativa")
plt.ylabel("")
plt.grid(True)
plt.show()

print("\nImportancia de variables:")
print(imp_df)
