In [15]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pandas as pd
from sklearn.metrics import r2_score

In [16]:

# Load the dataset and swap every cell equal to a single space (' ') for 0.
# NOTE(review): a column that contained ' ' was presumably read as text, so it
# would still be object-typed after this swap and be grouped as categorical
# below — confirm that 0 is the intended fill value.
data = pd.read_csv(r"Inputs/dataset_alpha_betha.csv").replace(' ', 0)

# Basic preprocessing setup: group the feature columns by dtype so that each
# group can be given its own transformer.
# 'autoID', 'Demand' and 'Class' are kept out of both groups.
_excluded_cols = ('autoID', 'Demand', 'Class')

numerical_cols = [
    name
    for name in data.select_dtypes(include=['int64', 'float64']).columns
    if name not in _excluded_cols
]
categorical_cols = [
    name
    for name in data.select_dtypes(include=['object']).columns
    if name not in _excluded_cols
]


In [17]:

# Preprocessing + model pipelines, one per regressor.
# All four pipelines share the same preprocessing layout, so it is built in a
# single factory instead of being copy-pasted for every model.


def _build_regression_pipeline(regressor):
    """Return a new, unfitted pipeline that preprocesses the features and then applies `regressor`.

    Preprocessing (a ColumnTransformer):
      * 'num' — columns in `numerical_cols`: mean imputation, then standard scaling.
      * 'cat' — columns in `categorical_cols`: most-frequent imputation, then
        one-hot encoding that ignores categories not seen during fit.

    Every call creates fresh transformer instances, so the returned pipelines
    do not share any fitted state.

    Parameters
    ----------
    regressor : estimator
        The final estimator, placed in the 'regressor' step.

    Returns
    -------
    Pipeline
        Steps: 'preprocessor' followed by 'regressor'.
    """
    numeric_steps = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    categorical_steps = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])
    preprocessor = ColumnTransformer([
        ('num', numeric_steps, numerical_cols),
        ('cat', categorical_steps, categorical_cols),
    ])
    return Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', regressor),
    ])


# 1. RandomForestRegressor
pipeline_rf = _build_regression_pipeline(RandomForestRegressor(random_state=42))

# 2. LinearRegression
pipeline_lr = _build_regression_pipeline(LinearRegression())

# 3. GradientBoostingRegressor
pipeline_gb = _build_regression_pipeline(GradientBoostingRegressor(random_state=42))

# 4. XGBRegressor
pipeline_xgb = _build_regression_pipeline(XGBRegressor(random_state=42))

# Separate the target from the features, then hold out 20% of the rows for testing.
y = data['Demand']  # target variable
X = data.drop(['autoID', 'Demand', 'Class'], axis=1)  # features: every column except these three

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Train and evaluate each model.
# The fit / predict / report sequence is the same for every pipeline, so it
# lives in one helper instead of four hand-copied variants.


def _fit_and_report(title, pipeline):
    """Fit `pipeline` on the training split and print its test-split metrics.

    Prints `title`, then MAE, MSE and R² computed from `y_test` and the
    pipeline's predictions for `X_test`, followed by a blank line.

    Parameters
    ----------
    title : str
        Heading printed above the metrics.
    pipeline : Pipeline
        Unfitted pipeline; it is fitted in place on `X_train` / `y_train`.

    Returns
    -------
    tuple
        `(predictions, r2)` — the predictions for `X_test` and their R² score.
    """
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    score = r2_score(y_test, predictions)
    print(title)
    print(f"MAE: {mean_absolute_error(y_test, predictions)}")
    print(f"MSE: {mean_squared_error(y_test, predictions)}")
    print(f"R²: {score}\n")
    return predictions, score


# `r2` is rebound on every call, so it ends up holding the score of the last
# model evaluated, as in the original cell.

# 1. RandomForestRegressor
y_pred_rf, r2 = _fit_and_report("Random Forest Regressor", pipeline_rf)

# 2. LinearRegression
y_pred_lr, r2 = _fit_and_report("Linear Regression", pipeline_lr)

# 3. GradientBoostingRegressor
y_pred_gb, r2 = _fit_and_report("Gradient Boosting Regressor", pipeline_gb)

# 4. XGBRegressor
y_pred_xgb, r2 = _fit_and_report("XGB Regressor", pipeline_xgb)

Random Forest Regressor
MAE: 771.133796170923
MSE: 1193464.7737204763
R²: 0.7705939259313704

Linear Regression
MAE: 891.5398243435059
MSE: 1269184.7630785352
R²: 0.7560391390037262

Gradient Boosting Regressor
MAE: 726.1376992628691
MSE: 1033287.1443245846
R²: 0.801383038373107

XGB Regressor
MAE: 762.1840422506447
MSE: 1203593.2284850655
R²: 0.7686470150947571

