In [1]:
from dotenv import load_dotenv
import os

# Automatically load the variables defined in the .env file
load_dotenv()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Directory settings are read from the environment (filled from .env by load_dotenv()).
DATA_RAW = os.getenv('DATA_RAW')
DATA_PREPROCESSED = os.getenv('DATA_PROCESSED')
MODELS = os.getenv('MODELS')

# Fail early with a readable message: os.path.join() raises an opaque TypeError
# when DATA_RAW is None, i.e. when the variable is missing from the environment.
if DATA_RAW is None:
    raise EnvironmentError(
        "DATA_RAW is not set; define it in the environment or in the .env file"
    )

# The merged CSV is semicolon-separated; its path is resolved relative to the
# parent of the current working directory.
df = pd.read_csv(os.path.join('..', DATA_RAW, "dataframe_merged.csv"), sep=";")

df.head(5)


Unnamed: 0,Ajuste Fecha,Dia Semana,Sabor 1,Sabor 2,Sabor 3,Sabor 4,Sabor 5,Sabor 6,Sabor 7,Sabor 8,...,humidity,wind_speed,wind_deg,wind_gust,rain_1h,clouds_all,weather_main,weather_description,Estacion,EsFeriado
0,2022-01-01,Lunes,5.36,12.45,5.47,5.03,,12.41,11.6,5.6,...,64.0,3.27,86.0,4.47,0.51,40.0,Clouds,sky is clear,Verano,True
1,2022-01-02,Jueves,22.92,,26.5,,,32.5,5.08,5.12,...,80.0,2.57,60.0,,0.51,0.5,Clear,sky is clear,Verano,False
2,2022-01-03,Jueves,34.39,,35.99,,,32.64,20.1,20.37,...,67.0,2.06,150.0,0.89,3.3,0.0,Clear,sky is clear,Verano,False
3,2022-01-04,Domingo,10.66,,12.38,,,21.97,,10.21,...,69.0,4.955,198.5,2.68,1.9,0.0,Clear,sky is clear,Verano,False
4,2022-01-05,Martes,11.42,,23.53,,,18.89,5.05,5.07,...,53.5,4.885,125.0,3.13,,0.0,Clear,sky is clear,Verano,False


In [2]:
# Feature frame: every column of `df` whose name does NOT start with "Sabor".
# Selecting by prefix mirrors how `df_target` is built and avoids hard-coding the
# number of "Sabor N" columns, which would raise KeyError if one were missing
# and silently keep any extra one. `.copy()` keeps `df` itself untouched.
df_features = df.loc[:, ~df.columns.str.startswith("Sabor")].copy()

In [3]:
# Target frame: an independent copy of only the columns whose name begins with "Sabor".
sabor_mask = df.columns.str.startswith("Sabor")
df_target = df.loc[:, sabor_mask].copy()
df_target.head()

Unnamed: 0,Sabor 1,Sabor 2,Sabor 3,Sabor 4,Sabor 5,Sabor 6,Sabor 7,Sabor 8,Sabor 9,Sabor 10,...,Sabor 70,Sabor 71,Sabor 72,Sabor 73,Sabor 74,Sabor 75,Sabor 76,Sabor 77,Sabor 78,Sabor 79
0,5.36,12.45,5.47,5.03,,12.41,11.6,5.6,6.22,11.21,...,,,,,,,,,,
1,22.92,,26.5,,,32.5,5.08,5.12,,25.26,...,,1.07,,37.09,43.6,,20.0,50.0,28.03,11.63
2,34.39,,35.99,,,32.64,20.1,20.37,,49.79,...,,3.17,3.29,30.47,37.01,,20.0,50.0,25.95,10.61
3,10.66,,12.38,,,21.97,,10.21,,22.96,...,,,1.02,11.89,11.54,,10.0,10.0,11.3,5.59
4,11.42,,23.53,,,18.89,5.05,5.07,,26.02,...,,1.04,3.06,18.13,25.47,,60.0,70.0,12.66,11.94


In [4]:
# Map the Spanish source headers to the English names used by the following cells.
column_renames = {
    'Ajuste Fecha': 'date',
    'EsFeriado': 'holiday',
    'Estacion': 'season',
}
df_features = df_features.rename(columns=column_renames)

In [5]:
# Keep only the columns used as model inputs, in this fixed order.
feature_columns = ['date', 'temp', 'humidity', 'wind_speed', 'clouds_all', 'holiday', 'season']
df_features = df_features[feature_columns]

In [6]:
# Parse the date strings into datetime64 values (column position is unchanged).
df_features = df_features.assign(date=pd.to_datetime(df_features['date']))

In [7]:
# Preview the first five rows of the feature frame.
df_features.head(5)

Unnamed: 0,date,temp,humidity,wind_speed,clouds_all,holiday,season
0,2022-01-01,25.9504,64.0,3.27,40.0,True,Verano
1,2022-01-02,25.956667,80.0,2.57,0.5,False,Verano
2,2022-01-03,28.2904,67.0,2.06,0.0,False,Verano
3,2022-01-04,26.944615,69.0,4.955,0.0,False,Verano
4,2022-01-05,24.38125,53.5,4.885,0.0,False,Verano


------------

In [8]:
import joblib

Entrenar Encoder de Season

In [9]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order: the encoder assigns integer codes by position in this
# list (Verano -> 0, Otoño -> 1, Invierno -> 2, Primavera -> 3).
season_order = [['Verano', 'Otoño', 'Invierno', 'Primavera']]
ordinalEncoder = OrdinalEncoder(
    categories=season_order,
    dtype="int8",
)
# Fit on a one-column frame; the fitted estimator is the cell output.
ordinalEncoder.fit(df_features.loc[:, ['season']])

In [10]:
# Persist the fitted season encoder alongside the project's feature code.
joblib.dump(
    value=ordinalEncoder,
    filename="../src/features/ordinalEncoderSeason.joblib",
)

['../src/features/ordinalEncoderSeason.joblib']

In [11]:
# Remove the season column from the feature frame, then preview the result.
df_features = df_features.drop(columns=['season'])
df_features.head()

Unnamed: 0,date,temp,humidity,wind_speed,clouds_all,holiday
0,2022-01-01,25.9504,64.0,3.27,40.0,True
1,2022-01-02,25.956667,80.0,2.57,0.5,False
2,2022-01-03,28.2904,67.0,2.06,0.0,False
3,2022-01-04,26.944615,69.0,4.955,0.0,False
4,2022-01-05,24.38125,53.5,4.885,0.0,False


Ahora el dataset de features está tal cual estará el input del usuario.

Entrenar escalado

In [12]:
# Copy of the feature frame without the date column; `df_features` is left unchanged.
df_features_sin_fecha = df_features.drop(columns=['date'])

In [13]:
# Preview the first two rows of the date-less frame.
df_features_sin_fecha.iloc[:2]

Unnamed: 0,temp,humidity,wind_speed,clouds_all,holiday
0,25.9504,64.0,3.27,40.0,True
1,25.956667,80.0,2.57,0.5,False


In [14]:
from sklearn.preprocessing import MinMaxScaler

# Columns with a numeric dtype are the ones that get scaled.
# NOTE(review): the boolean `holiday` column is presumably not matched by
# include='number' and therefore stays unscaled — confirm.
num_cols = df_features_sin_fecha.select_dtypes(include='number').columns

minMaxScaler = MinMaxScaler()

# Learn the per-column min/max; the fitted scaler is the cell output.
minMaxScaler.fit(df_features_sin_fecha.loc[:, num_cols])

In [15]:
# Store the fitted scaler together with the names of the columns it was fitted
# on, so a consumer can apply it to the same columns.
scaler_bundle = {
    "scaler": minMaxScaler,
    "columns": num_cols.to_list(),
}
joblib.dump(scaler_bundle, "../src/features/minmaxScaler.joblib")

['../src/features/minmaxScaler.joblib']

------------

Pipeline

In [16]:
import sys
from pathlib import Path

# Add the project's src folder to this session's import path
ROOT = Path.cwd().parent        # parent of the working directory = project root folder
src_dir = ROOT / "src"
sys.path.append(str(src_dir))

# Importar mis clases
from features.my_transformers import LagRoller
from features.my_transformers import CyclicalEncoder, InformationOfDateExtractor, SeasonGetter, SeasonOrdinalizer

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [18]:
# Reload the artifacts written earlier in this notebook.
# NOTE: joblib.load unpickles the file; only load artifacts from a trusted source.
ordinalEncoder = joblib.load("../src/features/ordinalEncoderSeason.joblib")
minMaxScaler = joblib.load("../src/features/minmaxScaler.joblib")  # dict: {"scaler", "columns"}
scaler_numerico = minMaxScaler["scaler"]

# Use the column list saved with the scaler instead of re-deriving it by dtype
# from `df_features`: this guarantees the scaler is applied to exactly the
# columns (and order) it was fitted on, regardless of the dtypes of other columns.
num_cols = minMaxScaler["columns"]

# Surface a schema mismatch here rather than deep inside the pipeline.
missing_cols = [col for col in num_cols if col not in df_features.columns]
if missing_cols:
    raise KeyError(
        f"df_features lacks columns the scaler was fitted on: {missing_cols}"
    )

In [19]:
# Only the numeric columns go through the scaler; every other column is passed
# through unchanged and output names are kept without a transformer prefix.
# The saved season OrdinalEncoder is intentionally not wired in here (the
# pipeline cell uses a SeasonOrdinalizer step for the "season" column instead).
numeric_step = ("numeric_columns", scaler_numerico, num_cols)

preprocessing = ColumnTransformer(
    transformers=[numeric_step],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

In [21]:
# Feature pipeline. The first four steps are project transformers from
# features.my_transformers (their implementation is not shown in this notebook);
# the last step is the scaling ColumnTransformer built above.
# NOTE(review): "day" uses a period of 31, so shorter months presumably do not
# wrap around exactly — confirm this is intended.
cyclical_periods = {"month": 12, "season": 4, "day": 31}

pipeline_steps = [
    ("season_getter", SeasonGetter(date_column_name="date")),
    ("date_info", InformationOfDateExtractor(date_column_name="date")),
    ("season_ordinalizer", SeasonOrdinalizer(col="season")),
    ("cyclic_encoding", CyclicalEncoder(cyc_cols=cyclical_periods, drop=True)),
    ("encode_and_scale", preprocessing),
]

pipeline = Pipeline(steps=pipeline_steps)

In [22]:
# Preview the frame that is fed to the pipeline.
df_features.head(n=5)

Unnamed: 0,date,temp,humidity,wind_speed,clouds_all,holiday
0,2022-01-01,25.9504,64.0,3.27,40.0,True
1,2022-01-02,25.956667,80.0,2.57,0.5,False
2,2022-01-03,28.2904,67.0,2.06,0.0,False
3,2022-01-04,26.944615,69.0,4.955,0.0,False
4,2022-01-05,24.38125,53.5,4.885,0.0,False


In [23]:
# Fit every pipeline step on the feature frame and show the transformed matrix.
# NOTE(review): scikit-learn's ColumnTransformer fits clones of the transformers
# it is given, so the scaler is presumably re-fitted here rather than reusing
# the fit state loaded from disk — confirm.
features_transformed = pipeline.fit_transform(df_features)
features_transformed

array([[0.7441461352304866, 0.4857142857142857, 0.21288014311270126, ...,
        1.0, 0.20129852008866006, 0.9795299412524945],
       [0.7443653963903549, 0.7142857142857142, 0.15026833631484793, ...,
        1.0, 0.39435585511331855, 0.9189578116202306],
       [0.8260191853514884, 0.5285714285714285, 0.10465116279069768, ...,
        1.0, 0.5712682150947923, 0.8207634412072763],
       ...,
       [0.6817214333615185, 0.32142857142857145, 0.32021466905187834,
        ..., 1.0, -0.3943558551133187, 0.9189578116202306],
       [0.7326879902032248, 0.42142857142857143, 0.36046511627906974,
        ..., 1.0, -0.20129852008866114, 0.9795299412524943],
       [0.8010467387818176, 0.3357142857142857, 0.28130590339892664, ...,
        1.0, -2.4492935982947064e-16, 1.0]], dtype=object)

In [24]:
# Column names of the transformed matrix, in output order.
feature_names = pipeline.get_feature_names_out()
feature_names

array(['temp', 'humidity', 'wind_speed', 'clouds_all', 'holiday',
       'month_sin', 'month_cos', 'season_sin', 'season_cos', 'day_sin',
       'day_cos'], dtype=object)

Guardar pipeline

In [25]:
# Persist the fitted feature pipeline next to the project's feature code.
joblib.dump(
    value=pipeline,
    filename="../src/features/features_pipeline.joblib",
)

['../src/features/features_pipeline.joblib']