# Preprocesamiento de los datos

In [1]:
from dotenv import load_dotenv
import os

# Cargar automáticamente las variables del archivo .env
load_dotenv()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

DATA_RAW = os.getenv('DATA_RAW')
DATA_PREPROCESSED = os.getenv('DATA_PROCESSED')

In [3]:
df = pd.read_csv(os.path.join('..',DATA_RAW, "dataframe_merged.csv"), sep=";")

In [4]:
df

Unnamed: 0,Ajuste Fecha,Dia Semana,Sabor 1,Sabor 2,Sabor 3,Sabor 4,Sabor 5,Sabor 6,Sabor 7,Sabor 8,...,humidity,wind_speed,wind_deg,wind_gust,rain_1h,clouds_all,weather_main,weather_description,weather_icon,Total Cantidad Kilos
0,2019-01-08,Sábado,,11.73,6.56,15.13,4.97,,,,...,,,,,,,,,,501.76
1,2019-01-09,Martes,33.57,42.53,33.60,5.05,,,30.75,26.07,...,,,,,,,,,,2287.92
2,2019-01-10,Jueves,11.31,6.11,5.64,5.12,5.08,,6.49,,...,,,,,,,,,,417.12
3,2019-01-11,Domingo,13.95,23.86,22.40,15.21,,,10.97,41.34,...,,,,,,,,,,1471.46
4,2019-01-12,Martes,20.64,35.50,37.23,12.02,5.13,30.46,34.81,26.19,...,,,,,,,,,,2037.46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2028,2025-08-01,Viernes,26.60,10.97,18.73,,,6.53,14.57,4.83,...,,,,,,,,,,964.32
2029,2025-09-01,Sábado,16.21,6.12,25.69,,,12.87,14.57,10.16,...,,,,,,,,,,2123.08
2030,2025-10-01,Domingo,9.75,5.62,25.28,,,19.73,4.85,4.83,...,,,,,,,,,,1857.12
2031,2025-11-01,Lunes,16.37,10.40,32.10,,,24.95,19.37,4.86,...,,,,,,,,,,1744.75


In [14]:
df['Ajuste Fecha'] = pd.to_datetime(df['Ajuste Fecha'])
df['Dia Semana'] = df['Ajuste Fecha'].dt.day_name(locale = 'es_ES')

## Eliminar registros sin features

In [25]:
df = df[~df['dt_iso'].isna()]

In [None]:
# Eliminar dt_iso
df.drop(columns=['dt_iso'], inplace=True)

# Eliminar Total Cantidad Kilos\
df.drop(columns=['Total Cantidad Kilos'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Total Cantidad Kilos'], inplace=True)


In [42]:
df.drop('Unnamed: 81', axis=1, inplace=True)

In [46]:
df.drop('Total Cantidad', axis = 1, inplace=True)

## División del conjunto de datos

In [47]:
# Ordenar por fecha
df = df.sort_values(by='Ajuste Fecha')

In [48]:
sabores = [f'Sabor {i}' for i in range(1,80)]

X = df.drop(columns=sabores)
y = df[sabores]

In [51]:
# Reservamos los últimos 14 días para test
# En Train
X_train = X[:-14]
y_train = y[:-14]

# En test
X_test = X[-14:]
y_test = y[-14:]

## Preprocesado

In [None]:
# Varriables numericas
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Variables categóricas
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

In [58]:
# Eliminar columna fehca
X_train.drop(columns=['Ajuste Fecha'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(columns=['Ajuste Fecha'], inplace=True)


### Pipeline

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder

def build_preprocessing_pipeline(X_train):
    #------------ Grupos ----------------
    num_zero = ['rain_1h']
    num_mean = ['temp', 'dew_point', 'feels_like', 'temp_min', 'temp_max', 'pressure',
                'visibility', 'humidity', 'wind_speed', 'wind_deg', 'wind_gust',
                'clouds_all']
    cat_mode = categorical_features


    # ----------- Sub-pipelines ---------
    zero_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', RobustScaler())

    ])

    mean_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', RobustScaler())
    ])

    mode_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    # ---------- Column transformer ----------
    from sklearn.compose import ColumnTransformer

    preprocessor = ColumnTransformer(
        transformers=[
            ('num_zero', zero_pipe, num_zero),
            ('num_mean', mean_pipe, num_mean),
            ('cat_mode', mode_pipe, cat_mode)
        ],
        remainder='passthrough',  # Mantener columnas no transformadas sin prefijos
        verbose_feature_names_out=False  # No agregar prefijos a los nombres de las columnas
    )

    return preprocessor

    # ---------- Pipeline --------------
    pipeline = Pipeline([
        ('preprocessor', preprocessor)
    ])

    pipeline.fit(X_train, y_train)


In [60]:
from src.features.build_features import build_preprocessing_pipeline

preprocessor = build_preprocessing_pipeline(X_train, categorical_features)
pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

pipeline.fit(X_train, y_train)


ModuleNotFoundError: No module named 'src'

In [None]:
# Guardar pipeline
import joblib
joblib.dump(pipeline, os.path.join('..',DATA_PREPROCESSED, 'pipeline.pkl'))