# Notebook 2 — Ingeniería de Características (Feature Engineering)

Incluye: Carga de Datos, Funciones de Feature Engineering y Pipeline + Guardado.

## Celda de Carga de Datos (Ajusta rutas)

In [2]:
# Mount Google Drive only when running inside Google Colab. Elsewhere the
# `google.colab` package cannot be imported, so skip the mount and let the
# loading cell use a local DATA_PATH instead of failing here.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# === Data loading cell (adjust paths) ===
import pandas as pd

# Path to the input CSV. Adjust it to your environment; suggested
# alternatives are listed below.
DATA_PATH = r"""/content/drive/MyDrive/Colab Notebooks/SEXTO TRIMESTRE/cleaned_modified.csv"""  # <-- ADJUST

# Alternative paths:
# DATA_PATH = "data/cleaned_modified.csv"
# DATA_PATH = "../data/cleaned_modified.csv"

# Read the CSV into `df`, report its (rows, columns) shape and preview the
# first rows as the cell output.
df = pd.read_csv(DATA_PATH)
print("Loaded shape:", df.shape)
df.head()


Loaded shape: (8750, 18)


Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day,year,month,dayofweek,is_weekend,day
0,339.0,10,-3.5,24,1.2,1996,-21.2,0.65,0.0,0.0,Winter,No Holiday,Yes,2017,12,4,0,1
1,360.0,11,-0.5,21,1.3,1936,-20.2,0.94,0.0,0.0,Winter,No Holiday,Yes,2017,12,4,0,1
2,449.0,12,1.7,23,1.4,2000,-17.2,1.11,0.0,0.0,Winter,No Holiday,Yes,2017,12,4,0,1
3,451.0,13,2.4,25,1.6,2000,-15.6,1.16,0.0,0.0,Winter,No Holiday,Yes,2017,12,4,0,1
4,447.0,14,3.0,26,2.0,2000,-14.6,1.01,0.0,0.0,Winter,No Holiday,Yes,2017,12,4,0,1


## Celda de Funciones de Feature Engineering

In [4]:
# === Celda de Funciones de Feature Engineering ===
import numpy as np
import pandas as pd

def parse_dates(df, date_col="Date", dayfirst=False):
    """Derive calendar features from a date column.

    Converts ``date_col`` to datetime (when it is not one already) and adds
    the columns ``year``, ``month``, ``day``, ``dayofweek`` and ``is_weekend``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is never modified in place.
    date_col : str, default "Date"
        Name of the column holding the dates.
    dayfirst : bool, default False
        Forwarded to ``pandas.to_datetime`` when the column has to be parsed.
        Set it to True for day-first strings such as "01/12/2017" meaning
        1 December 2017.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the derived columns, or ``df`` itself when
        ``date_col`` is not one of its columns. Values that cannot be parsed
        become NaT (``errors="coerce"``), so their derived fields are missing.
    """
    if date_col not in df.columns:
        return df

    df = df.copy()
    # is_datetime64_any_dtype understands pandas extension dtypes (tz-aware
    # datetimes, nullable strings, categoricals). np.issubdtype raises
    # TypeError on those because they are not NumPy dtypes.
    if not pd.api.types.is_datetime64_any_dtype(df[date_col]):
        df[date_col] = pd.to_datetime(df[date_col], errors="coerce", dayfirst=dayfirst)

    dates = df[date_col].dt
    df["year"] = dates.year
    df["month"] = dates.month
    df["day"] = dates.day
    df["dayofweek"] = dates.dayofweek
    # pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)
    return df

def add_cyclic_hour(df, hour_col="Hour"):
    """Add a cyclic (sine/cosine) encoding of the hour of day.

    The hour in ``hour_col`` is mapped onto a 24-hour circle and the two
    coordinates are stored in the columns ``hour_sin`` and ``hour_cos``.
    Returns a copy with both columns, or ``df`` itself when ``hour_col`` is
    not present.
    """
    if hour_col not in df.columns:
        return df

    encoded = df.copy()
    for column, trig in (("hour_sin", np.sin), ("hour_cos", np.cos)):
        encoded[column] = trig(2 * np.pi * encoded[hour_col] / 24.0)
    return encoded

def clean_categories(df, cats=None):
    """Cast the dataset's usual categorical columns to ``category`` dtype.

    ``cats`` lists the columns to convert and defaults to
    ``["Seasons", "Holiday", "Functioning Day"]``. Names that are not columns
    of ``df`` are skipped. A copy is always returned, so the input is left
    untouched.
    """
    wanted = ["Seasons", "Holiday", "Functioning Day"] if cats is None else cats
    result = df.copy()
    for name in wanted:
        if name not in result.columns:
            continue
        result[name] = result[name].astype("category")
    return result

def make_lags(df, target_col="Rented Bike Count", lags=(1, 24)):
    """Add lagged copies of the target column.

    For every value ``k`` in ``lags`` a column ``"<target_col>_lag<k>"`` is
    added, holding ``target_col`` shifted down by ``k`` rows, so the first
    ``k`` rows of that column are missing. Returns a copy with the new
    columns, or ``df`` itself when ``target_col`` is not present.
    """
    if target_col not in df.columns:
        return df

    lagged = df.copy()
    for step in lags:
        lagged[f"{target_col}_lag{step}"] = lagged[target_col].shift(step)
    return lagged

def build_feature_table(df, target_col="Rented Bike Count", lags=(1, 24)):
    """Run every feature-engineering step and split the result into X and y.

    Applies, in order, ``parse_dates``, ``add_cyclic_hour``,
    ``clean_categories`` and ``make_lags`` to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input table.
    target_col : str, default "Rented Bike Count"
        Name of the target column.
    lags : iterable of int, default (1, 24)
        Row offsets handed to ``make_lags`` for the lagged target features.

    Returns
    -------
    X : pandas.DataFrame
        Feature table without the raw "Date" column and without the target.
    y : pandas.Series or None
        Copy of the target column, or None when ``target_col`` is absent.
    """
    df = parse_dates(df)
    df = add_cyclic_hour(df)
    df = clean_categories(df)
    df = make_lags(df, target_col=target_col, lags=lags)

    y = df[target_col].copy() if target_col in df.columns else None

    # Drop the raw date and the target from the features. errors="ignore"
    # already skips labels that are absent, so no membership check is needed.
    X = df.drop(columns=["Date", target_col], errors="ignore")
    return X, y

# Prueba rápida (no guarda nada todavía)
# X, y = build_feature_table(df)
# print(X.shape, None if y is None else y.shape)


## Celda de Pipeline y Guardado (Ajusta rutas)

In [5]:
# === Celda de Pipeline y Guardado (Ajusta rutas) ===
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import check_is_fitted
import joblib
import os

# 1) Build the feature table (X) and the target (y) from the loaded frame
TARGET = "Rented Bike Count"
X, y = build_feature_table(df, target_col=TARGET)

# 2) Split the columns by dtype: numeric ones vs. category/object ones.
# NOTE(review): columns of any other dtype (e.g. bool, datetime) end up in
# neither list and are therefore not passed to any transformer below.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=["category", "object"]).columns.tolist()

# 3) Preprocessors
# Numeric columns: fill missing values with the median, then standardize.
# The lag features created by make_lags are numeric and have missing values
# in their first rows, so they are filled here as well.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a dense array; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
])

# 4) Fit and transform
# NOTE(review): the preprocessor is fitted on every row of X. If a train/test
# split is made later, the imputation and scaling statistics will already
# include the test rows -- confirm this is intended.
X_processed = preprocessor.fit_transform(X)

# 5) Rebuild a DataFrame with column names. The order mirrors the transformer
# list above: numeric columns first, then the one-hot encoded columns.
feature_names = []
if num_cols:
    feature_names.extend(num_cols)
if cat_cols:
    # Names generated by the fitted one-hot encoder
    ohe = preprocessor.named_transformers_["cat"].named_steps["onehot"]
    ohe_names = ohe.get_feature_names_out(cat_cols).tolist()
    feature_names.extend(ohe_names)

X_proc_df = pd.DataFrame(X_processed, columns=feature_names, index=X.index)

print("Preprocessed shape:", X_proc_df.shape)

# 6) Saving (adjust paths); the output folders are created if missing
os.makedirs("data", exist_ok=True)
os.makedirs("models", exist_ok=True)

OUT_DATA_PATH = "data/cleaned_enriched_lags.csv"   # <-- ADJUST if desired
PIPE_PATH = "models/fe_preprocessor.joblib"        # <-- ADJUST if desired

# Attach y to the processed X when a target exists. The assignment is
# positional (y.values); X_proc_df was built with X.index, i.e. the same rows
# in the same order as y.
if y is not None:
    out_df = X_proc_df.copy()
    out_df[TARGET] = y.values
else:
    out_df = X_proc_df

# Write the processed table without the index and persist the fitted
# preprocessor for later reuse.
out_df.to_csv(OUT_DATA_PATH, index=False)
joblib.dump(preprocessor, PIPE_PATH)

print(f"Guardado dataset procesado en: {OUT_DATA_PATH}")
print(f"Guardado preprocesador en: {PIPE_PATH}")


Preprocessed shape: (8750, 26)
Guardado dataset procesado en: data/cleaned_enriched_lags.csv
Guardado preprocesador en: models/fe_preprocessor.joblib
