In [5]:
# 04_transform_ts_data_into_features_and_targets_all_months_with_id_with_name.py

import pandas as pd
import numpy as np
from pathlib import Path

def transform_ts_data_into_features_and_targets_all_months(
    input_dir="../data/processed/timeseries", 
    output_dir="../data/processed/feature_eng_all_id"
):
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # List all months to process separately
    months = [
        (2024, 1), (2024, 2), (2024, 3), (2024, 4),
        (2024, 5), (2024, 6), (2024, 7), (2024, 8),
        (2024, 9), (2024, 10), (2024, 11), (2024, 12),
        (2025, 1), (2025, 2), (2025, 3)
    ]

    for year, month in months:
        file_path = input_path / f"rides_{year}_{month:02}.parquet"

        if not file_path.exists():
            print(f"⚠️ Skipping {year}-{month:02} (File not found)")
            continue

        print(f"\n🔵 Loading file: {file_path}")
        df = pd.read_parquet(file_path)

        # ✅ Focus only on top 5 busiest stations for this month
        top_station_ids = (
            df.groupby("start_station_id")["ride_count"]
            .sum()
            .sort_values(ascending=False)
            .head(5)
            .index.tolist()
        )

        # Find corresponding station names
        id_to_name = df[df["start_station_id"].isin(top_station_ids)].groupby("start_station_id")["start_station_name"].first()

        print("✅ Using top 5 stations for", f"{year}-{month:02}:")
        for station_id in top_station_ids:
            station_name = id_to_name.get(station_id, "Unknown")
            print(f"ID: {station_id} → Station Name: {station_name}")

        # Filter only top stations
        df = df[df["start_station_id"].isin(top_station_ids)].copy()

        # Sort by time
        df = df.sort_values("hour_ts").reset_index(drop=True)

        # 3-hour rolling mean (shifted globally)
        df["ride_count_roll3"] = df["ride_count"].shift(1).rolling(3, min_periods=1).mean()

        # Target variable (8 hours ahead ride_count)
        df["target_ride_count"] = df["ride_count"].shift(-8)

        # Drop missing rows (caused by rolling and shifting)
        df = df.dropna(subset=["ride_count_roll3", "target_ride_count"])

        print(f"✅ Final shape for {year}-{month:02}: {df.shape}")

        # Save final dataset for this month
        final_save_path = output_path / f"citibike_features_targets_8hours_{year}_{month:02}.parquet"
        df.to_parquet(final_save_path, index=False)

        print(f"✅ Saved monthly feature dataset at: {final_save_path}")

# Run the function
if __name__ == "__main__":
    transform_ts_data_into_features_and_targets_all_months()



🔵 Loading file: ..\data\processed\timeseries\rides_2024_01.parquet
✅ Using top 5 stations for 2024-01:
ID: 6140 → Station Name: W 21 St & 6 Ave
ID: 6450 → Station Name: 8 Ave & W 31 St
ID: 5788 → Station Name: Lafayette St & E 8 St
ID: 5905 → Station Name: University Pl & E 14 St
ID: 5329 → Station Name: West St & Chambers St
✅ Final shape for 2024-01: (3706, 16)
✅ Saved monthly feature dataset at: ..\data\processed\feature_eng_all_id\citibike_features_targets_8hours_2024_01.parquet

🔵 Loading file: ..\data\processed\timeseries\rides_2024_02.parquet
✅ Using top 5 stations for 2024-02:
ID: 6140 → Station Name: W 21 St & 6 Ave
ID: 6450 → Station Name: 8 Ave & W 31 St
ID: 5788 → Station Name: Lafayette St & E 8 St
ID: 5905 → Station Name: University Pl & E 14 St
ID: 5329 → Station Name: West St & Chambers St
✅ Final shape for 2024-02: (3461, 16)
✅ Saved monthly feature dataset at: ..\data\processed\feature_eng_all_id\citibike_features_targets_8hours_2024_02.parquet

🔵 Loading file: ..\da

In [8]:
import pandas as pd

# Load the saved parquet file
df_saved = pd.read_parquet("../data\processed/feature_eng_all_id/citibike_features_targets_8hours_2024_02.parquet")

# Check shape
print(f"✅ Loaded saved dataframe shape: {df_saved.shape}")

# Check columns
print(f"✅ Number of columns: {len(df_saved.columns)}")
print(f"✅ Some columns: {df_saved.columns[:10].tolist()} ...")


✅ Loaded saved dataframe shape: (3461, 16)
✅ Number of columns: 16
✅ Some columns: ['hour_ts', 'start_station_name', 'start_station_id', 'ride_count', 'hour', 'hour_sin', 'hour_cos', 'day_of_week', 'is_holiday_or_weekend', 'month'] ...


  df_saved = pd.read_parquet("../data\processed/feature_eng_all_id/citibike_features_targets_8hours_2024_02.parquet")
