In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

import funciones as fn

In [2]:
# Name of the column the model has to predict
target_column = "shares"

# Locations of the train / validation / test splits
train_path, val_path, test_path = (
    "dataset/train.csv",
    "dataset/val.csv",
    "dataset/test.csv",
)

# Read the training and validation splits (the test split is not loaded here)
df_train, df_val = (pd.read_csv(ruta) for ruta in (train_path, val_path))

In [3]:
# Feature lists are specified by hand through helpers in the `funciones` module
numeric_cols, ohe_cols = fn.columnas_numericas(), fn.columnas_one_hot()

# Switch to True when the one-hot columns contain NaNs
use_imputer_for_ohe = False

In [4]:
# Fail fast if either split lacks one of the feature columns or the target
columnas_requeridas = numeric_cols + ohe_cols + [target_column]
for etiqueta, frame in (("train", df_train), ("val", df_val)):
    missing = [c for c in columnas_requeridas if c not in frame.columns]
    if missing:
        raise ValueError(f"Faltan columnas en `{etiqueta}`: {missing}")

In [5]:

# Split each frame into the target vector (y) and the feature matrix (X)
y_train, y_val = df_train[target_column], df_val[target_column]
X_train, X_val = (
    frame.drop(columns=[target_column]) for frame in (df_train, df_val)
)
# No log1p transform is applied to the target: y_train / y_val hold the raw values.

In [6]:
# Build the per-column-group transformers fed to the ColumnTransformer
transformers = []
if numeric_cols:
    # Standardize the numeric features
    num_pipeline = Pipeline([('scaler', StandardScaler())])
    transformers.append(('num', num_pipeline, numeric_cols))

if ohe_cols:
    if use_imputer_for_ohe:
        # Fill NaNs in the given one-hot columns with each column's most frequent value
        # (SimpleImputer is imported in the notebook's import cell)
        ohe_pipeline = Pipeline([('imputer', SimpleImputer(strategy='most_frequent'))])
        transformers.append(('ohe_given', ohe_pipeline, ohe_cols))
    else:
        # Presumably these columns are already one-hot encoded, so they pass through unchanged
        transformers.append(('ohe_given', 'passthrough', ohe_cols))

if not transformers:
    raise ValueError('No se definieron columnas numéricas ni OHE.')

# Any column not listed in a transformer is dropped
preprocessor = ColumnTransformer(transformers=transformers, remainder='drop')

# Full pipeline: preprocessing followed by a Decision Tree Regressor
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('reg', DecisionTreeRegressor(random_state=42))
])

# Fit on the raw target values (no log transform is applied to y_train)
pipeline.fit(X_train, y_train)

# Evaluation helper: scores the fitted pipeline directly against the given target
def evaluar(nombre, X, y_true):
    """Print regression metrics of the global ``pipeline`` on one data split.

    Predictions are compared with ``y_true`` as-is; no inverse transform
    (such as ``expm1``) is applied to them.

    Args:
        nombre: Label printed in the header line.
        X: Features passed to ``pipeline.predict``.
        y_true: Ground-truth target values.

    Returns:
        None. MSE, RMSE, MAE and R2 are printed with four decimals.
    """
    estimado = pipeline.predict(X)
    error_cuadratico = mean_squared_error(y_true, estimado)
    resultados = {
        "MSE": error_cuadratico,
        "RMSE": np.sqrt(error_cuadratico),
        "MAE": mean_absolute_error(y_true, estimado),
        "R2": r2_score(y_true, estimado),
    }
    print(f"--- {nombre} ---")
    for etiqueta, valor in resultados.items():
        print(f"{etiqueta}: {valor:.4f}")

# Report the metrics on the validation split
evaluar(nombre='validation', X=X_val, y_true=y_val)

--- validation ---
MSE: 278231791.3336
RMSE: 16680.2815
MAE: 4371.7020
R2: -1.0802
