In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet, Ridge, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import funciones as fn

cargamos los csv de train, val y test

In [6]:
# Rutas y nombre de la columna objetivo (editar según corresponda)
train_path = "dataset/train.csv"
val_path = "dataset/val.csv"
target_column = "shares"

# Cargar datos
df_train = pd.read_csv(train_path)
df_val = pd.read_csv(val_path)

Taemos las columnas a usar

In [7]:
# --- Especifica manualmente las columnas ---
columnas_one_hot = fn.columnas_one_hot()
columnas_numericas = fn.columnas_numericas()
numeric_cols = columnas_numericas     # <-- reemplazar por tus columnas numéricas
ohe_cols = columnas_one_hot # <-- reemplazar por tus columnas 0/1 ya one-hot
use_imputer_for_ohe = False                  # True si las OHE pueden tener NaNs y quieres imputarlas

Validar que las columnas existan en los datasets

In [8]:
# Validar existencia de columnas
missing = [c for c in numeric_cols + ohe_cols + [target_column] if c not in df_train.columns]
if missing:
    raise ValueError(f"Faltan columnas en `train.csv`: {missing}")

missing = [c for c in numeric_cols + ohe_cols + [target_column] if c not in df_val.columns]
if missing:
    raise ValueError(f"Faltan columnas en `val.csv`: {missing}")

Separamos las características y la variable objetivo

In [9]:
X_train = df_train.drop(columns=[target_column])
y_train = df_train[target_column]
X_val = df_val.drop(columns=[target_column])
y_val = df_val[target_column]

Transofrmacion logaritmica de variable objetivo

In [10]:
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

Entrenamiento del modelo y validacion

In [11]:
# Construir transformadores (sin categorías)
transformers = []
if numeric_cols:
    num_pipeline = Pipeline([('scaler', StandardScaler())])
    transformers.append(('num', num_pipeline, numeric_cols))

if ohe_cols:
    transformers.append(('ohe_given', 'passthrough', ohe_cols))

preprocessor = ColumnTransformer(transformers=transformers, remainder='drop')

# pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('clf', LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000))
# ])

# pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('reg', Ridge(alpha=1.0))
# ])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('reg', ElasticNet(alpha=1.0, l1_ratio=0.5, max_iter=10000, random_state=42))
])

# Entrenar con train.csv
pipeline.fit(X_train, y_train_log)

def evaluar(nombre, X, y_true):
    y_pred_log = pipeline.predict(X)
    y_pred = np.expm1(y_pred_log)

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"--- {nombre} ---")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R2: {r2:.4f}")

# Evaluar en `dataset/val.csv` y `dataset/test.csv`
evaluar('validation', X_val, y_val)

--- validation ---
MSE: 136185433.6605
RMSE: 11669.8515
MAE: 2407.6442
R2: -0.0182
