In [1]:
# ===================================================================
# PASO 1: Cargar los datos originales
# ===================================================================
import pandas as pd
import numpy as np

# Read the model-input parquet and order rows by route, then by timestamp.
# Later per-route shift/rolling/fill steps in this cell depend on this ordering.
model_input_path = "../data/model_input.parquet"
df_features = (
    pd.read_parquet(model_input_path)
    .sort_values(by=['bus_route', 'hour_timestamp'])
)


# ===================================================================
# PASO 2: Crear características que NO generan NaNs
# ===================================================================

# --- Cyclical encodings ---
# Map each calendar field onto a circle (sin/cos pair) using its period, so the
# last and first values of a cycle end up next to each other.
for source_column, period in (('day_of_week', 7), ('hour', 24), ('month', 12)):
    angle = 2 * np.pi * df_features[source_column] / period
    df_features[f'{source_column}_sin'] = np.sin(angle)
    df_features[f'{source_column}_cos'] = np.cos(angle)

# --- Refined weather features ---
# Bucket precipitation into four ordered labels. Intervals are right-closed:
# (-1, 0], (0, 1], (1, 5], (5, inf). Values outside them (or NaN) become NaN.
bin_edges = [-1, 0, 1, 5, float('inf')]
bin_labels = ['0_no_rain', '1_light_rain', '2_moderate_rain', '3_heavy_rain']
df_features['precipitation_category'] = pd.cut(
    x=df_features['precipitation'],
    bins=bin_edges,
    labels=bin_labels,
)

# Temperature scaled down by wind speed; the +1 keeps the denominator non-zero
# when wind speed is 0.
wind_plus_one = df_features['wind_speed_10m'] + 1
df_features['temp_wind_interaction'] = df_features['temperature_2m'] / wind_plus_one

# --- Holiday distance features ---
# For every row, compute whole days since the most recent holiday row and whole
# days until the next holiday row, looking only at rows of the SAME bus_route.
#
# The frame is sorted by (bus_route, hour_timestamp), so filling across the whole
# frame would carry a holiday timestamp from the end of one route into the start
# of the next (and vice versa), producing negative day counts at route
# boundaries. Filling inside each route avoids that.
#
# Relies on rows being ordered by hour_timestamp within each bus_route.
holidays = df_features.loc[df_features['is_holiday'], 'hour_timestamp']

# Timestamp on holiday rows, NaT everywhere else (same index as df_features).
ts = df_features['hour_timestamp'].where(df_features['is_holiday'])

# Propagate the nearest holiday timestamp backwards/forwards within each route.
last_holiday = ts.groupby(df_features['bus_route'], observed=False).ffill()
next_holiday = ts.groupby(df_features['bus_route'], observed=False).bfill()

# .dt.days floors to whole days. Rows with no previous/next holiday inside their
# route stay NaN and are replaced by the sentinel 999.
df_features['days_since_last_holiday'] = (
    (df_features['hour_timestamp'] - last_holiday).dt.days.fillna(999)
)
df_features['days_until_next_holiday'] = (
    (next_holiday - df_features['hour_timestamp']).dt.days.fillna(999)
)


# ===================================================================
# PASO 3: Crear características que SÍ generan NaNs
# ===================================================================

# --- Lag features ---
# Earlier ridership_total values within each bus_route. Offsets are counted in
# rows; they equal 1 hour / 24 hours / 1 week only if every route has exactly
# one row per hour with no gaps — TODO confirm against the input data.
for lag_column, lag_rows in (
    ('ridership_lag_1h', 1),
    ('ridership_lag_24h', 24),
    ('ridership_lag_1w', 24 * 7),
):
    df_features[lag_column] = (
        df_features.groupby('bus_route', observed=False)['ridership_total'].shift(lag_rows)
    )

# --- Rolling window ---
# Mean of ridership_total over the previous WINDOW_SIZE_7D rows of the same
# bus_route. The series is shifted by one row before rolling so the window ends
# at the PREVIOUS row: the current row's ridership_total (the value the lag
# features are built to predict) must not leak into its own feature.
# The first WINDOW_SIZE_7D rows of each route are NaN, matching ridership_lag_1w.
# Window size is in rows; it equals 7 days only for gap-free hourly data.
WINDOW_SIZE_7D = 24 * 7
df_features['ridership_roll_mean_7d'] = (
    df_features.groupby('bus_route', observed=False)['ridership_total']
    .transform(lambda s: s.shift(1).rolling(window=WINDOW_SIZE_7D).mean())
)


# ===================================================================
# PASO 4: Limpiar todos los NaNs de una sola vez
# ===================================================================
# Report the row count, drop every row that has at least one NaN in any column,
# then report the row count again.
print(f"Filas antes de la limpieza final: {len(df_features)}")
df_features = df_features.dropna(axis=0, how='any')
print(f"Filas después de la limpieza final: {len(df_features)}")


# ===================================================================
# PASO 5: Comprobar el resultado
# ===================================================================
# Eyeball the first rows of the holiday-related columns after cleaning.
holiday_check_columns = [
    'hour_timestamp',
    'is_holiday',
    'days_since_last_holiday',
    'days_until_next_holiday',
]
print("\nComprobación de las características de festivos (resultado corregido):")
print(df_features[holiday_check_columns].head())

Filas antes de la limpieza final: 19632
Filas después de la limpieza final: 19128

Comprobación de las características de festivos (resultado corregido):
         hour_timestamp  is_holiday  days_since_last_holiday  \
168 2023-01-08 00:00:00       False                        5   
169 2023-01-08 01:00:00       False                        5   
170 2023-01-08 02:00:00       False                        5   
171 2023-01-08 03:00:00       False                        5   
172 2023-01-08 04:00:00       False                        5   

     days_until_next_holiday  
168                      8.0  
169                      7.0  
170                      7.0  
171                      7.0  
172                      7.0  


In [2]:
# DataFrame clean-up: remove the columns this notebook treats as redundant
# before the dtype pass.

# Columns to remove
columns_to_drop = [
    'hour_timestamp',
    'operating_buses',
    'transfers_total',
    'hour',
    'day_of_week',
    'month',
    'year',
    'precipitation',
    'rain',
    'snowfall',
]

# Frame that will be handed to the model (df_features itself is left as is)
df_model = df_features.drop(columns=columns_to_drop)

print("DataFrame después de eliminar columnas redundantes:")
df_model.info()

DataFrame después de eliminar columnas redundantes:
<class 'pandas.core.frame.DataFrame'>
Index: 19128 entries, 168 to 19631
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   bus_route                19128 non-null  category
 1   ridership_total          19128 non-null  uint32  
 2   is_weekend               19128 non-null  bool    
 3   is_holiday               19128 non-null  bool    
 4   temperature_2m           19128 non-null  float32 
 5   relative_humidity_2m     19128 non-null  float32 
 6   weather_code             19128 non-null  uint8   
 7   wind_speed_10m           19128 non-null  float32 
 8   day_of_week_sin          19128 non-null  float64 
 9   day_of_week_cos          19128 non-null  float64 
 10  hour_sin                 19128 non-null  float64 
 11  hour_cos                 19128 non-null  float64 
 12  month_sin                19128 non-null  float64 
 13  month_cos   

In [4]:
# --- Final dtype optimisation ---

# Target dtype for every column except bus_route, which is not listed here and
# therefore keeps its existing dtype.
final_schema = dict(
    ridership_total='uint32',
    is_weekend='uint8',
    is_holiday='uint8',
    temperature_2m='float32',
    relative_humidity_2m='float32',
    weather_code='uint8',
    wind_speed_10m='float32',
    day_of_week_sin='float32',
    day_of_week_cos='float32',
    hour_sin='float32',
    hour_cos='float32',
    month_sin='float32',
    month_cos='float32',
    precipitation_category='category',
    temp_wind_interaction='float32',
    days_since_last_holiday='uint16',
    days_until_next_holiday='uint16',
    ridership_lag_1h='float32',
    ridership_lag_24h='float32',
    ridership_lag_1w='float32',
    ridership_roll_mean_7d='float32',
)

# Boolean flags become 0/1.
for flag_column in ('is_weekend', 'is_holiday'):
    df_model[flag_column] = df_model[flag_column].astype('uint8')

# Day counters may be float; go through int64 first, then let the schema
# narrow them. NOTE(review): the final cast to uint16 does not range-check, so
# negative or >65535 values would wrap silently — confirm inputs are in range.
for day_column in ('days_since_last_holiday', 'days_until_next_holiday'):
    df_model[day_column] = df_model[day_column].astype('int64')

# Apply the whole schema in one pass.
df_model = df_model.astype(final_schema)

print("\nDataFrame con tipos de datos optimizados:")
df_model.info()


DataFrame con tipos de datos optimizados:
<class 'pandas.core.frame.DataFrame'>
Index: 19128 entries, 168 to 19631
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   bus_route                19128 non-null  category
 1   ridership_total          19128 non-null  uint32  
 2   is_weekend               19128 non-null  uint8   
 3   is_holiday               19128 non-null  uint8   
 4   temperature_2m           19128 non-null  float32 
 5   relative_humidity_2m     19128 non-null  float32 
 6   weather_code             19128 non-null  uint8   
 7   wind_speed_10m           19128 non-null  float32 
 8   day_of_week_sin          19128 non-null  float32 
 9   day_of_week_cos          19128 non-null  float32 
 10  hour_sin                 19128 non-null  float32 
 11  hour_cos                 19128 non-null  float32 
 12  month_sin                19128 non-null  float32 
 13  month_cos            

In [5]:
# --- STEP 8: Save the final dataset ---

# Destination of the new file
final_features_path = "../data/model_features_v1.parquet"

# Write df_model as Parquet; index=False leaves the DataFrame index out of the file.
df_model.to_parquet(
    path=final_features_path,
    index=False,
)

print(f"\n✅ ¡Éxito! Dataset final con {len(df_model.columns)} características guardado en: {final_features_path}")


✅ ¡Éxito! Dataset final con 22 características guardado en: ../data/model_features_v1.parquet
