In [1]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor

In [11]:
# pipeline_prueba.ipynb  (está en notebooks/)
import sys
from pathlib import Path

# 1) Añadí la carpeta src al PYTHONPATH de la sesión ─────────
ROOT = Path.cwd().parent        # .. = carpeta raíz del proyecto
sys.path.append(str(ROOT / "src"))

# 2) Ahora podés importar la clase
from features.my_transformers import LagRoller
from features.my_transformers import CyclicalEncoder


In [3]:
# columnas finales después del paso cíclico
final_cols = [
    'temp', 'humidity', 'wind_speed', 'clouds_all',       # num
    'EsFeriado', 'day',                                   # binaria / ordinal
    'Dia Semana_sin', 'Dia Semana_cos',                   # cíclicas
    'month_sin', 'month_cos',
    'Estacion_sin', 'Estacion_cos'
]

num_cols    = ['temp', 'humidity', 'wind_speed', 'clouds_all', 'day']
bin_cols    = ['EsFeriado']
cyc_cols    = [c for c in final_cols if c.endswith(('_sin', '_cos'))]

preproc = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), num_cols),   # o StandardScaler()
        ('bin', 'passthrough', bin_cols),
        ('cyc', 'passthrough', cyc_cols)
    ],
    remainder='drop'
)


In [4]:
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor

cyc_map = {'Dia Semana': 7, 'month': 12, 'Estacion': 4}

xgb = MultiOutputRegressor(
        XGBRegressor(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=5,
            objective='reg:squarederror',
            tree_method='hist',
            random_state=42
        ),
        n_jobs=-1
)

pipe = Pipeline(steps=[
    ('cyc',  CyclicalEncoder(cyc_cols=cyc_map, drop=True)),
    ('prep', preproc),
    ('model', xgb)
])


Pruebas

In [5]:
from dotenv import load_dotenv
import os

# Cargar automáticamente las variables del archivo .env
load_dotenv()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

DATA_RAW = os.getenv('DATA_RAW')
DATA_PREPROCESSED = os.getenv('DATA_PROCESSED')
MODELS = os.getenv('MODELS')
# df_train = pd.read_csv(os.path.join('..',DATA_PREPROCESSED, "dataframe_train.csv"), sep=";")
# df_test = pd.read_csv(os.path.join('..',DATA_PREPROCESSED, "dataframe_test.csv"), sep=";")
df = pd.read_csv(os.path.join('..',DATA_RAW, "dataframe_merged.csv"), sep=";")

#Se comprobó previamente que ciertas columans no serán útiles:
df.drop(
    ['dew_point',
'feels_like',
'temp_min',
'temp_max',
'weather_description',
'weather_main',
'Total Cantidad',
'Unnamed: 81',
'dt_iso'], axis= 1, inplace=True
)
y = df[[f'Sabor {i}' for i in range(1, 80)]]
X = df.drop([f'Sabor {i}' for i in range(1, 80)], axis = 1)
#Agregamos algunas columnas de fecha que podrían tener información sobre tendencias

X['Ajuste Fecha'] = pd.to_datetime(X['Ajuste Fecha'])
X['year'] = X['Ajuste Fecha'].dt.year
X['month'] = X['Ajuste Fecha'].dt.month
X['day'] = X['Ajuste Fecha'].dt.day
# Número de semana ISO (1-53) — lunes es el primer día de la semana
X['week'] = X['Ajuste Fecha'].dt.isocalendar().week      # tipo UInt32
X['week'] = X['week'].astype(int)                        # opcional, pasarlo a int

#Eliminar Fecha
X.drop('Ajuste Fecha', axis = 1, inplace=True)

In [6]:
X_train = X.iloc[:X.shape[0]-14,:]
X_test = X.iloc[X.shape[0]-14:,:]

y_train = y.iloc[:y.shape[0]-14,:]
y_test = y.iloc[y.shape[0]-14:,:]

In [9]:
X_train.fillna(0, inplace = True)
X_test.fillna(0, inplace=True)
y_train.fillna(0, inplace=True)
y_test.fillna(0, inplace= True)
from sklearn.preprocessing import MinMaxScaler

num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

escalador = MinMaxScaler(feature_range=(0,1))
X_train_scaled = X_train.copy()
X_train_scaled[num_cols] = escalador.fit_transform(X_train[num_cols])

#Ahora con test
X_test_scaled = X_test.copy()
X_test_scaled[num_cols] = escalador.transform(X_test[num_cols])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.fillna(0, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test.fillna(0, inplace= True)


In [17]:
cyc_map = {'week': 7, 'month': 12, 'Estacion': 4}
cyclical_encoder = CyclicalEncoder(cyc_cols = cyc_map, drop = True)

In [21]:
est_map = {'Verano': 0, 'Otoño': 1, 'Invierno': 2, 'Primavera': 3}
X_train_scaled['Estacion'] = X_train_scaled['Estacion'].map(est_map).astype(int)

In [25]:
cyclical_encoder.transform(X_train_scaled)

Unnamed: 0,Dia Semana,temp,pressure,humidity,wind_speed,wind_deg,wind_gust,rain_1h,clouds_all,EsFeriado,year,day,week_sin,week_cos,month_sin,month_cos,Estacion_sin,Estacion_cos
0,Lunes,0.744146,0.116111,0.485714,0.212880,0.216617,0.153820,0.051463,0.400,True,2022,1,0.433884,-0.900969,5.000000e-01,0.866025,0.0,1.000000e+00
1,Jueves,0.744365,0.167824,0.714286,0.150268,0.139466,0.000000,0.051463,0.005,False,2022,2,0.433884,-0.900969,5.000000e-01,0.866025,0.0,1.000000e+00
2,Jueves,0.826019,0.229444,0.528571,0.104651,0.406528,0.030626,0.332997,0.000,False,2022,3,0.781831,0.623490,5.000000e-01,0.866025,0.0,1.000000e+00
3,Domingo,0.778932,0.212607,0.557143,0.363596,0.550445,0.092223,0.191726,0.000,False,2022,4,0.781831,0.623490,5.000000e-01,0.866025,0.0,1.000000e+00
4,Martes,0.689244,0.423611,0.335714,0.357335,0.332344,0.107708,0.000000,0.000,False,2022,5,0.781831,0.623490,5.000000e-01,0.866025,0.0,1.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1077,Domingo,0.699230,0.271991,0.314286,0.520572,0.495549,0.000000,0.073663,0.000,False,2024,13,0.781831,0.623490,-2.449294e-16,1.000000,-1.0,-1.836970e-16
1078,Lunes,0.687203,0.337963,0.314286,0.288909,0.495549,0.000000,0.000000,0.000,False,2024,14,0.781831,0.623490,-2.449294e-16,1.000000,-1.0,-1.836970e-16
1079,Martes,0.672012,0.420139,0.471429,0.288909,0.465875,0.000000,0.000000,0.000,False,2024,15,0.781831,0.623490,-2.449294e-16,1.000000,-1.0,-1.836970e-16
1080,Miércoles,0.592967,0.478009,0.328571,0.600179,0.495549,0.000000,0.000000,0.000,False,2024,16,0.974928,-0.222521,-2.449294e-16,1.000000,-1.0,-1.836970e-16


In [None]:
_

In [7]:
pipe.fit(X_train, y_train)          # aprende escalas + modelo
y_pred = pipe.predict(X_test)        # todo en una línea

TypeError: can't multiply sequence by non-int of type 'float'

In [None]:
pipe.fit(X_train, y_train)          # aprende escalas + modelo
y_pred = pipe.predict(X_val)        # todo en una línea

# Guardado
import joblib
joblib.dump(pipe, 'pipeline_sabor1.pkl')

# Recarga en producción
pipe_loaded = joblib.load('pipeline_sabor1.pkl')
y_future = pipe_loaded.predict(X_future)     # mismo DataFrame “crudo”


In [5]:
#ColumnTransformer para escalar solo algunos grupos
numeric_cols = [...]   # lista de columnas numéricas tras LagRoller
ct = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), numeric_cols),
            # ('cat', OneHotEncoder(), cat_cols)  # si hubiera
        ],
        remainder='passthrough'
)

xgb = MultiOutputRegressor(
        XGBRegressor(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=6,
            objective='reg:squarederror',
            tree_method='hist',
            random_state=42
        ),
        n_jobs=-1
)

#Pipeline
pipe = Pipeline(steps=[
    ('lagger', LagRoller(lags=90, horizon=14, roll_windows=(7,))),
    ('preproc', ct),
    ('model', xgb)
])

Mi modelo debe recibir las columnas
['Dia Semana', 'temp', 'pressure', 'humidity', 'wind_speed', 'wind_deg',
       'wind_gust', 'rain_1h', 'clouds_all', 'Estacion', 'EsFeriado', 'year',
       'month', 'day', 'week']

para luego hacer una codificacion ciclica y obtener:

'temp', 'humidity', 'wind_speed', 'clouds_all', 'EsFeriado', 'day','dia_sin', 'dia_cos', 'month_sin', 'month_cos', 'est_sin', 'est_cos'

Cómo puedo plantear esto en el pipeline

In [None]:
from dotenv import load_dotenv
import os

# Cargar automáticamente las variables del archivo .env
load_dotenv()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

DATA_RAW = os.getenv('DATA_RAW')
DATA_PREPROCESSED = os.getenv('DATA_PROCESSED')
MODELS = os.getenv('MODELS')
# df_train = pd.read_csv(os.path.join('..',DATA_PREPROCESSED, "dataframe_train.csv"), sep=";")
# df_test = pd.read_csv(os.path.join('..',DATA_PREPROCESSED, "dataframe_test.csv"), sep=";")
df = pd.read_csv(os.path.join('..',DATA_RAW, "dataframe_merged.csv"), sep=";")

In [None]:
pipe.fit(X_train, y_train)

from sklearn.metrics import mean_absolute_error
pred = pipe.predict(X_val)
print("MAE:", mean_absolute_error(y_val, pred))

import joblib
joblib.dump(pipe, 'pipeline_sabor1.pkl')   # guarda TODO
