In [1]:
# Training Pipeline Notebook

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Load the preprocessed feature matrices and targets. Each target CSV is
# flattened to a 1-D array with .values.ravel().
X_train = pd.read_csv('X_train_scaled.csv')
X_test = pd.read_csv('X_test_scaled.csv')
y_train = pd.read_csv('y_train.csv').values.ravel()
y_test = pd.read_csv('y_test.csv').values.ravel()


def _check_split(name, features, target):
    """Raise ValueError unless `features` and `target` have the same length.

    Parameters
    ----------
    name : str
        Label of the split, used only in the error message.
    features : pandas.DataFrame
        Feature matrix of the split.
    target : numpy.ndarray
        Flattened target values of the split.
    """
    if len(features) != len(target):
        raise ValueError(
            f"Datos inconsistentes en '{name}': {len(features)} filas de "
            f"características frente a {len(target)} valores objetivo."
        )


# Fail fast: the test split is not touched again until the grid search has
# finished, so a problem with it would otherwise surface only after all the
# fits are done.
_check_split('train', X_train, y_train)
_check_split('test', X_test, y_test)

# The model is fitted on X_train and later predicts on X_test, so both frames
# must expose the same columns in the same order.
if list(X_train.columns) != list(X_test.columns):
    raise ValueError(
        "X_train y X_test no tienen las mismas columnas en el mismo orden."
    )

# Base estimator; the fixed seed keeps the forest reproducible between runs
rf_model = RandomForestRegressor(random_state=42)

# Candidate hyperparameter values: 3 * 4 * 3 * 3 = 108 combinations
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive grid search with 5-fold cross-validation, scored by negated MSE;
# n_jobs=-1 runs the fits in parallel on all available processors
grid_search = GridSearchCV(
    rf_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Take the estimator that the grid search refit with the best parameters
best_model = grid_search.best_estimator_

# Predict on the test set
y_pred = best_model.predict(X_test)

# Performance metrics: MSE, its square root (RMSE) and R²
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mejores parámetros: {grid_search.best_params_}")
# RMSE is the root of the mean squared error, so the label names it that way
print(f"Raíz del Error Cuadrático Medio: {rmse}")
print(f"Puntuación R-cuadrado: {r2}")

# Persist the selected estimator to disk so it can be reloaded with joblib
model_path = 'best_rf_model.joblib'
joblib.dump(best_model, model_path)

print("Pipeline de entrenamiento completado. Mejor modelo guardado.")

# Feature-importance analysis: pair every training column with the fitted
# forest's importance score and sort with the most important first
feature_importance = pd.DataFrame({
    'característica': X_train.columns,
    'importancia': best_model.feature_importances_
}).sort_values('importancia', ascending=False)

print("\nLas 10 características más importantes:")
print(feature_importance.head(10))

# Sanity check for target leakage. A single feature holding almost all of the
# importance usually means that column is (nearly) a direct function of the
# target, which makes the test metrics look far better than the model really is.
# NOTE(review): this only warns; whether the column must be dropped upstream
# depends on how the target was built — confirm against the preprocessing step.
DOMINANCE_THRESHOLD = 0.95
if not feature_importance.empty:
    top_name = feature_importance.iloc[0]['característica']
    top_share = feature_importance.iloc[0]['importancia']
    if top_share > DOMINANCE_THRESHOLD:
        print(
            f"\nADVERTENCIA: '{top_name}' concentra {top_share:.2%} de la "
            "importancia total; revise una posible fuga de información del objetivo."
        )

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Root Mean Squared Error: 1.5306822902728832
R-squared Score: 0.9999639867190481
Training pipeline completed. Best model saved.

Top 10 most important features:
          feature  importance
7          Tax 5%    0.999872
5      Unit price    0.000047
13           Time    0.000014
9             Day    0.000011
12      DayOfWeek    0.000010
4    Product line    0.000009
8         Payment    0.000008
10          Month    0.000006
2   Customer type    0.000005
1            City    0.000004
