In [None]:
import genet  
from genet import read_gtfs
from genet import read_matsim

# Read in the GTFS feed using genet

Genet expects specific columns such as `route_color` to be present in the GTFS feed. We add this column as it does not exist in the original GTFS feed. 

In [29]:
import pandas as pd
import zipfile
import os
import shutil

def assign_colors_by_agency_number(routes_df):
    """
    Assign unique colors to routes based on agency_id using sequential numbering.
    Only adds route_color (not route_text_color).
    """
    
    # Color palette - 6-digit hex codes (no # symbol for GTFS)
    color_palette = [
        '0078D4',  # Blue
        'FF6B35',  # Orange  
        '10B981',  # Green
        'EF4444',  # Red
        '8B5CF6',  # Purple
        'F59E0B',  # Amber
        'EC4899',  # Pink
        '00BCF2',  # Light Blue
        '8B5A2B',  # Brown
        '6B7280',  # Gray
    ]
    
    # Get unique agencies and sort them
    unique_agencies = sorted(routes_df['agency_id'].unique())
    print(f"Found {len(unique_agencies)} agencies: {unique_agencies}")
    
    # Create agency to color mapping
    agency_color_map = {}
    
    for i, agency in enumerate(unique_agencies):
        agency_number = i + 1
        color_index = i % len(color_palette)
        route_color = color_palette[color_index]
        agency_color_map[agency] = route_color
        print(f"  Agency {agency_number}: {agency} -> #{route_color}")
    
    # Apply colors to routes
    routes_df['route_color'] = routes_df['agency_id'].map(agency_color_map)
    
    return routes_df

def prepare_gtfs_with_auto_colors(zip_path, extract_dir='./gtfs_extracted'):
    """
    Extract GTFS zip file and automatically assign route_color to agencies by number.
    """
    
    # Clean up and create extraction directory
    if os.path.exists(extract_dir):
        shutil.rmtree(extract_dir)
    os.makedirs(extract_dir)
    
    # Extract zip file
    print(f"Extracting {zip_path} to {extract_dir}")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
        print(f"Extracted {len(zip_ref.namelist())} files")
    
    # Fix routes.txt - add route_color based on agency numbering
    routes_file = os.path.join(extract_dir, 'routes.txt')
    if os.path.exists(routes_file):
        routes_df = pd.read_csv(routes_file)
        
        print(f"\nOriginal routes.txt:")
        print(f"  Columns: {routes_df.columns.tolist()}")
        print(f"  Number of routes: {len(routes_df)}")
        
        # Add colors based on automatic agency numbering
        routes_df = assign_colors_by_agency_number(routes_df)
        
        # Save the updated file
        routes_df.to_csv(routes_file, index=False)
        print(f"\nUpdated routes.txt saved with route_color column")
        
        # Show a sample of the updated data
        print("\nSample of updated routes:")
        sample_cols = ['route_id', 'agency_id', 'route_short_name', 'route_color']
        print(routes_df[sample_cols].head(10).to_string(index=False))
    
    print(f"\nGTFS data prepared successfully in: {extract_dir}")
    return extract_dir

# COMPLETE WORKFLOW: Extract, fix, and load GTFS
zip_path = '../data/external/study_area_gtfs_bus.zip'

# Step 1: Prepare the GTFS data (extract and add colors)
gtfs_dir = prepare_gtfs_with_auto_colors(zip_path)

# Step 2: Load with genet
print("\nLoading GTFS with genet...")
feed = genet.read_gtfs(gtfs_dir, '20230814')
print("✅ GTFS loaded successfully!")

Extracting ../data/external/study_area_gtfs_bus.zip to ./gtfs_extracted


2025-07-30 15:17:40,043 - Reading GTFS from ./gtfs_extracted
2025-07-30 15:17:40,044 - Reading the calendar for GTFS
2025-07-30 15:17:40,045 - Reading GTFS data into usable format
2025-07-30 15:17:40,045 - Reading stops
2025-07-30 15:17:40,057 - Reading stop times


Extracted 10 files

Original routes.txt:
  Columns: ['route_id', 'agency_id', 'route_short_name', 'route_long_name', 'route_type']
  Number of routes: 187
Found 24 agencies: ['OP134', 'OP192', 'OP2350', 'OP318', 'OP5050', 'OP5051', 'OP564', 'OP658', 'OP662', 'OP664', 'OP665', 'OP666', 'OP671', 'OP672', 'OP6801', 'OP8944', 'OP8945', 'OP929', 'OP930', 'OP931', 'OP932', 'OP933', 'OP937', 'OP938']
  Agency 1: OP134 -> #0078D4
  Agency 2: OP192 -> #FF6B35
  Agency 3: OP2350 -> #10B981
  Agency 4: OP318 -> #EF4444
  Agency 5: OP5050 -> #8B5CF6
  Agency 6: OP5051 -> #F59E0B
  Agency 7: OP564 -> #EC4899
  Agency 8: OP658 -> #00BCF2
  Agency 9: OP662 -> #8B5A2B
  Agency 10: OP664 -> #6B7280
  Agency 11: OP665 -> #0078D4
  Agency 12: OP666 -> #FF6B35
  Agency 13: OP671 -> #10B981
  Agency 14: OP672 -> #EF4444
  Agency 15: OP6801 -> #8B5CF6
  Agency 16: OP8944 -> #F59E0B
  Agency 17: OP8945 -> #EC4899
  Agency 18: OP929 -> #00BCF2
  Agency 19: OP930 -> #8B5A2B
  Agency 20: OP931 -> #6B7280
  Agen

2025-07-30 15:17:40,986 - Reading routes
2025-07-30 15:17:40,989 - Reading trips
2025-07-30 15:17:44,150 - Your GTFS has (a) looooop edge(s)! A zero link between a node and itself, edge affected 
This edge will not be considered for computation, the stop will be deleted and the schedule will be changed. Affected stops: ['450012625']
2025-07-30 15:17:44,207 - Your GTFS has (a) looooop edge(s)! A zero link between a node and itself, edge affected 
This edge will not be considered for computation, the stop will be deleted and the schedule will be changed. Affected stops: ['450026240']
2025-07-30 15:17:44,251 - Your GTFS has (a) looooop edge(s)! A zero link between a node and itself, edge affected 
This edge will not be considered for computation, the stop will be deleted and the schedule will be changed. Affected stops: ['450014725']
2025-07-30 15:17:44,304 - Your GTFS has (a) looooop edge(s)! A zero link between a node and itself, edge affected 
This edge will not be considered for compu

✅ GTFS loaded successfully!


# Prepare data for optimisation

In [46]:
import pandas as pd
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional
from datetime import datetime, timedelta
from abc import ABC, abstractmethod

# =============================================================================
# CORE DATA STRUCTURES
# =============================================================================

@dataclass
class RouteConfig:
    """
    Immutable configuration for a single transit route/service.
    
    This class represents a single transit route with time-varying headways
    across different periods of the day. Each route is divided into time
    intervals (e.g., 3-hour windows) with potentially different headway values.
    
    Attributes:
        service_id (str): Unique GTFS service identifier
        route_name (str): Human-readable route name for display
        agency_id (str): Transit agency that operates this route
        headways_by_interval (np.ndarray): Headway in minutes for each time interval
                                         (np.nan indicates no service in that interval)
        min_headway (float): Minimum operationally feasible headway (minutes)
        max_headway (float): Maximum operationally reasonable headway (minutes)
        operating_hours (Tuple[int, int]): Typical service hours (start_hour, end_hour)
        route_color (str): Hex color code for visualization (without #)
        interval_hours (int): Duration of each time interval in hours
        round_trip_time (float): Complete round-trip time in minutes (for vehicle calculations)
    
    Example:
        A route with 3-hour intervals might have:
        headways_by_interval = [30, 15, 15, 20, 30, 45, np.nan, np.nan]
        This means: 30min headway 00-03h, 15min 03-06h, ..., no service 18-24h
    """
    service_id: str
    route_name: str
    agency_id: str
    headways_by_interval: np.ndarray
    min_headway: float
    max_headway: float
    operating_hours: Tuple[int, int]
    route_color: str
    interval_hours: int
    round_trip_time: float

# =============================================================================
# GTFS DATA EXTRACTION LAYER
# =============================================================================

class GTFSDataExtractor:
    """
    Handles extraction and processing of transit data from GTFS feeds via Genet.
    
    This class is responsible for:
    1. Extracting headway statistics from GTFS data
    2. Calculating time-varying headways across different time periods
    3. Computing route metadata (names, agencies, colors)
    4. Estimating round-trip times from trip data
    5. Handling fallback scenarios when data is incomplete
    
    The extraction process works as follows:
    1. Use genet's headway_stats() to get basic route information
    2. For each route, analyze trips_to_dataframe() to calculate time-varying headways
    3. Split the day into intervals (e.g., 3-hour windows)
    4. Calculate average headway within each interval based on actual trip departures
    5. Handle edge cases (no trips, single trip, data errors)
    """
    
    def __init__(self, 
                 genet_feed,
                 gtfs_day: str = "20230814",
                 interval_hours: int = 3,
                 fallback_headway: float = 30.0,
                 default_round_trip_time: float = 60.0):
        """
        Initialize the GTFS data extractor.
        
        Args:
            genet_feed: Genet feed object containing GTFS data
            gtfs_day: Date for analysis in YYYYMMDD format
            interval_hours: Duration of each time interval (must divide 24 evenly)
            fallback_headway: Default headway when calculation fails
            default_round_trip_time: Default round-trip time when calculation fails
        """
        if 24 % interval_hours != 0:
            raise ValueError(f"interval_hours ({interval_hours}) must divide 24 evenly")
            
        self.feed = genet_feed
        self.gtfs_day = gtfs_day
        self.interval_hours = interval_hours
        self.n_intervals = 24 // interval_hours
        self.fallback_headway = fallback_headway
        self.default_round_trip_time = default_round_trip_time
    
    def extract_all_routes(self) -> List[Dict]:
        """
        Extract headway and metadata for all routes in the GTFS feed.
        
        This is the main entry point for data extraction. It:
        1. Gets basic headway statistics from genet
        2. Filters to valid routes (those with meaningful headway data)
        3. For each route, calculates detailed time-varying headways
        4. Extracts metadata (names, agencies, colors)
        5. Estimates round-trip times for vehicle calculations
        
        Returns:
            List[Dict]: List of dictionaries containing route data
                       Each dict has keys: service_id, route_id, avg_headway,
                       headways_by_interval, route_name, agency_id, route_color,
                       round_trip_time
        
        Raises:
            Exception: If extraction fails completely, falls back to simple uniform headways
        """
        try:
            print(f"Extracting GTFS data with {self.interval_hours}-hour intervals...")
            
            # Step 1: Get basic headway statistics from genet
            headway_df = self._get_basic_headway_stats()
            
            # Step 2: Process each route to get detailed time-varying data
            route_data_list = []
            for _, row in headway_df.iterrows():
                route_data = self._process_single_route(row)
                if route_data:  # Only add if processing succeeded
                    route_data_list.append(route_data)
            
            print(f"✅ Successfully extracted {len(route_data_list)} routes")
            return route_data_list
            
        except Exception as e:
            print(f"❌ GTFS extraction failed: {e}")
            print("Using fallback uniform headways...")
            return self._create_fallback_routes()
    
    def _get_basic_headway_stats(self) -> pd.DataFrame:
        """
        Get basic headway statistics from the genet feed.
        
        Uses genet's built-in headway_stats() method to get daily average
        headways for all routes. Filters out routes without valid headway data.
        
        Returns:
            pd.DataFrame: Routes with columns including service_id, route_id, 
                         mean_headway_mins, trip_count
        """
        headway_df = self.feed.headway_stats(gtfs_day=self.gtfs_day)
        
        # Include ALL routes with non-null headways (no arbitrary filtering)
        valid_routes = headway_df[headway_df['mean_headway_mins'].notna()].copy()
        
        print(f"Found {len(valid_routes)} routes with headway data")
        return valid_routes
    
    def _process_single_route(self, row: pd.Series) -> Optional[Dict]:
        """
        Process a single route to extract detailed time-varying headway data.
        
        For each route, this method:
        1. Calculates time-varying headways by analyzing trip departure times
        2. Extracts route metadata (name, agency, color)
        3. Estimates round-trip time from trip durations
        4. Packages everything into a dictionary for later processing
        
        Args:
            row: Pandas Series containing basic route info from headway_stats
            
        Returns:
            Dict or None: Route data dictionary, or None if processing failed
        """
        try:
            service_id = row['service_id']
            route_id = row.get('route_id', service_id)
            avg_headway = row['mean_headway_mins']
            
            # Calculate time-varying headways (core functionality)
            headways_by_interval = self._calculate_time_varying_headways(service_id, avg_headway)
            
            # Extract metadata
            route_name, agency_id, route_color = self._extract_route_metadata(route_id)
            
            # Calculate round-trip time for vehicle planning
            round_trip_time = self._calculate_round_trip_time(service_id)
            
            return {
                'service_id': service_id,
                'route_id': route_id,
                'avg_headway': avg_headway,
                'headways_by_interval': headways_by_interval,
                'route_name': route_name,
                'agency_id': agency_id,
                'route_color': route_color,
                'round_trip_time': round_trip_time
            }
            
        except Exception as e:
            print(f"⚠️  Failed to process route {row.get('service_id', 'unknown')}: {e}")
            return None
    
    def _calculate_time_varying_headways(self, service_id: str, avg_headway: float) -> np.ndarray:
        """
        Calculate headway values for each time interval throughout the day.
        
        This is the core algorithm for time-varying headway calculation:
        
        1. Get all trips for this service from GTFS data
        2. Extract departure hour from each trip
        3. For each time interval (e.g., 0-3h, 3-6h, etc.):
           a. Find all trips departing in that interval
           b. Sort trips by departure time
           c. Calculate time differences between consecutive trips
           d. Average these intervals to get headway for that period
        4. Handle special cases:
           - No trips → np.nan (no service)
           - One trip → interval duration (infrequent service)
           - Multiple trips → calculated average interval
        
        Args:
            service_id: GTFS service identifier
            avg_headway: Daily average headway as fallback
            
        Returns:
            np.ndarray: Headway values for each time interval (np.nan = no service)
            
        Example:
            For 3-hour intervals, returns array of length 8:
            [headway_00-03h, headway_03-06h, ..., headway_21-24h]
        """
        headways = np.full(self.n_intervals, np.nan)
        
        try:
            # Get trip data for this service
            trips_df = self.feed.trips_to_dataframe(gtfs_day=self.gtfs_day)
            service_trips = trips_df[trips_df['service_id'] == service_id].copy()
            
            # Handle edge cases
            if len(service_trips) == 0:
                return headways  # All np.nan - no service
            elif len(service_trips) == 1:
                # Single trip - assume it repeats every interval
                headways.fill(self.interval_hours * 60)  # Convert hours to minutes
                return headways
            
            # Extract hour component for interval assignment
            service_trips['departure_hour'] = service_trips['trip_departure_time'].dt.hour
            
            # Calculate headway for each time interval
            for interval in range(self.n_intervals):
                start_hour = interval * self.interval_hours
                end_hour = (interval + 1) * self.interval_hours
                
                # Find trips in this time window
                interval_trips = service_trips[
                    (service_trips['departure_hour'] >= start_hour) &
                    (service_trips['departure_hour'] < end_hour)
                ].copy()
                
                # Calculate headway for this specific interval
                headways[interval] = self._calculate_interval_headway(interval_trips, avg_headway)
            
            return headways
            
        except Exception as e:
            print(f"Failed to calculate time-varying headways for {service_id}: {e}")
            # Fallback: uniform headway across all intervals
            headways.fill(avg_headway)
            return headways
    
    def _calculate_interval_headway(self, interval_trips: pd.DataFrame, avg_headway: float) -> float:
        """
        Calculate headway for a specific time interval based on trip departures.
        
        This method handles the actual headway calculation for a single time period:
        
        For multiple trips:
        1. Sort trips by departure time
        2. Calculate time difference to next trip for each trip
        3. Filter out invalid intervals (≤0, outliers)
        4. Return average of valid intervals
        
        For single trip:
        - Return interval duration (assumes infrequent service)
        
        For no trips:
        - Return np.nan (no service)
        
        Args:
            interval_trips: DataFrame of trips in this time interval
            avg_headway: Fallback headway if calculation fails
            
        Returns:
            float: Calculated headway in minutes, or np.nan for no service
        """
        if len(interval_trips) >= 2:
            # Multiple trips - calculate actual intervals
            interval_trips = interval_trips.sort_values('trip_departure_time')
            
            # Calculate time to next departure for each trip
            interval_trips['next_departure'] = interval_trips['trip_departure_time'].shift(-1)
            interval_trips['interval_mins'] = (
                interval_trips['next_departure'] - interval_trips['trip_departure_time']
            ).dt.total_seconds() / 60
            
            # Filter valid intervals (positive, non-zero)
            valid_intervals = interval_trips['interval_mins'].dropna()
            valid_intervals = valid_intervals[valid_intervals > 0]
            
            if len(valid_intervals) > 0:
                return valid_intervals.mean()
            else:
                return avg_headway  # Fallback if no valid intervals
                
        elif len(interval_trips) == 1:
            # Single trip - assume it represents infrequent service
            return self.interval_hours * 60  # Convert hours to minutes
        else:
            # No trips - no service in this interval
            return np.nan
    
    def _extract_route_metadata(self, route_id: str) -> Tuple[str, str, str]:
        """
        Extract route metadata from the GTFS feed.
        
        Attempts to get route name, agency, and color from the genet route object.
        Provides sensible defaults if data is missing.
        
        Args:
            route_id: GTFS route identifier
            
        Returns:
            Tuple[str, str, str]: (route_name, agency_id, route_color)
        """
        try:
            route = self.feed.route(route_id)
            route_name = getattr(route, 'route_short_name', str(route_id))
            agency_id = getattr(route, 'agency_id', 'Unknown')
            route_color = getattr(route, 'route_color', '000000')
        except Exception as e:
            print(f"⚠️  Could not extract metadata for route {route_id}: {e}")
            route_name = str(route_id)
            agency_id = 'Unknown'
            route_color = '000000'
            
        return route_name, agency_id, route_color
    
    def _calculate_round_trip_time(self, service_id: str) -> float:
        """
        Estimate round-trip time by analyzing trip durations.
        
        This calculation is important for vehicle scheduling:
        vehicles_needed = round_trip_time / headway
        
        Process:
        1. Get all trips for this service
        2. Calculate duration of each trip (end_time - start_time)
        3. Take median duration as representative one-way time
        4. Estimate round-trip as 2 × one-way + 10% buffer for turnaround
        
        Args:
            service_id: GTFS service identifier
            
        Returns:
            float: Estimated round-trip time in minutes
        """
        try:
            trips_df = self.feed.trips_to_dataframe(gtfs_day=self.gtfs_day)
            service_trips = trips_df[trips_df['service_id'] == service_id].copy()
            
            if len(service_trips) == 0:
                return self.default_round_trip_time
            
            # Calculate individual trip durations
            service_trips['trip_duration'] = (
                service_trips['trip_end_time'] - service_trips['trip_departure_time']
            ).dt.total_seconds() / 60  # Convert to minutes
            
            # Filter to valid durations
            valid_durations = service_trips['trip_duration'].dropna()
            valid_durations = valid_durations[valid_durations > 0]
            
            if len(valid_durations) > 0:
                # Use median one-way time (robust to outliers)
                one_way_time = valid_durations.median()
                # Round-trip = 2 × one-way + 10% buffer for turnaround time
                round_trip_time = one_way_time * 2 * 1.1
                return round_trip_time
            else:
                return self.default_round_trip_time
                
        except Exception as e:
            print(f"⚠️  Failed to calculate round-trip time for {service_id}: {e}")
            return self.default_round_trip_time
    
    def _create_fallback_routes(self) -> List[Dict]:
        """
        Create simple uniform headway routes when extraction fails.
        
        This ensures the system can still operate even if detailed GTFS
        analysis fails. Creates routes with uniform headways across all intervals.
        
        Returns:
            List[Dict]: Simple route data with uniform headways
        """
        services_list = list(self.feed.services())
        fallback_routes = []
        
        for service in services_list:
            headways = np.full(self.n_intervals, self.fallback_headway)
            
            route_data = {
                'service_id': service.id,
                'route_id': service.id,
                'avg_headway': self.fallback_headway,
                'headways_by_interval': headways,
                'route_name': getattr(service, 'route_short_name', service.id),
                'agency_id': getattr(service, 'agency_id', 'Unknown'),
                'route_color': getattr(service, 'route_color', '000000'),
                'round_trip_time': self.default_round_trip_time
            }
            
            fallback_routes.append(route_data)
        
        return fallback_routes

# =============================================================================
# OPTIMIZATION CONSTRAINT MANAGER
# =============================================================================

class OptimizationConstraints:
    """
    Manages optimization bounds and constraints for headway variables.
    
    This class handles:
    1. Setting bounds for individual route headway variables
    2. Enforcing user-specified global constraints (min/max headways)
    3. Calculating data-driven bounds based on existing service patterns
    4. Converting between different headway representations (np.nan ↔ large values)
    
    The constraint system works as follows:
    - Each route has per-interval headway variables
    - User can set global min/max headway limits
    - Bounds can also be calculated from existing data (e.g., 50%-200% of current)
    - No-service periods (np.nan) are converted to large values for optimization
    """
    
    def __init__(self,
                 user_min_headway: float = 5.0,
                 user_max_headway: float = 120.0,
                 min_headway_multiplier: float = 0.5,
                 max_headway_multiplier: float = 2.0,
                 no_service_headway_value: float = 9999.0):
        """
        Initialize constraint manager with user preferences.
        
        Args:
            user_min_headway: Absolute minimum headway (minutes)
            user_max_headway: Absolute maximum headway (minutes)
            min_headway_multiplier: Factor for data-driven lower bounds
            max_headway_multiplier: Factor for data-driven upper bounds
            no_service_headway_value: Large value representing no service
        """
        self.user_min_headway = user_min_headway
        self.user_max_headway = user_max_headway
        self.min_headway_multiplier = min_headway_multiplier
        self.max_headway_multiplier = max_headway_multiplier
        self.no_service_headway_value = no_service_headway_value
    
    def calculate_route_bounds(self, headways_by_interval: np.ndarray) -> Tuple[float, float]:
        """
        Calculate optimization bounds for a single route.
        
        Combines user-specified constraints with data-driven bounds:
        1. Calculate data-driven bounds from existing headways
        2. Enforce user-specified global limits
        3. Ensure min ≤ max
        
        Args:
            headways_by_interval: Current headway values for the route
            
        Returns:
            Tuple[float, float]: (min_headway, max_headway) for this route
        """
        valid_headways = headways_by_interval[~np.isnan(headways_by_interval)]
        
        if len(valid_headways) > 0:
            # Calculate bounds based on existing data
            data_min = np.min(valid_headways) * self.min_headway_multiplier
            data_max = np.max(valid_headways) * self.max_headway_multiplier
            
            # Combine with user constraints (most restrictive wins)
            min_headway = max(self.user_min_headway, data_min)
            max_headway = max(min_headway, min(self.user_max_headway, data_max))
        else:
            # No existing service - use user bounds
            min_headway = self.user_min_headway
            max_headway = self.user_max_headway
            
        return min_headway, max_headway
    
    def create_optimization_bounds(self, routes: List[RouteConfig]) -> Tuple[np.ndarray, np.ndarray]:
        """
        Create bounds arrays for all optimization variables.
        
        The optimization vector is structured as:
        [route1_interval1, route1_interval2, ..., route1_intervalN,
         route2_interval1, route2_interval2, ..., route2_intervalN, ...]
        
        This method creates corresponding min/max bound arrays.
        
        Args:
            routes: List of route configurations
            
        Returns:
            Tuple[np.ndarray, np.ndarray]: (min_bounds, max_bounds) arrays
        """
        min_bounds = []
        max_bounds = []
        
        for route in routes:
            for interval in range(len(route.headways_by_interval)):
                # All intervals use the same route-level bounds
                # (optimization can decide whether to provide service)
                min_bounds.append(route.min_headway)
                max_bounds.append(route.max_headway)
        
        return np.array(min_bounds), np.array(max_bounds)
    
    def convert_to_optimization_vector(self, routes: List[RouteConfig]) -> np.ndarray:
        """
        Convert route headways to flat optimization vector.
        
        Handles the np.nan → large value conversion needed for optimization.
        Most optimization algorithms can't handle NaN values, so we convert
        no-service periods to a large headway value.
        
        Args:
            routes: List of route configurations
            
        Returns:
            np.ndarray: Flattened headway vector for optimization
        """
        all_headways = []
        for route in routes:
            for headway in route.headways_by_interval:
                if np.isnan(headway):
                    all_headways.append(self.no_service_headway_value)
                else:
                    all_headways.append(headway)
        return np.array(all_headways)
    
    def convert_from_optimization_vector(self, headway_vector: np.ndarray, 
                                       routes: List[RouteConfig]) -> None:
        """
        Update route configurations from optimization vector.
        
        Converts large headway values back to np.nan and updates the
        route configurations in-place.
        
        Args:
            headway_vector: Optimized headway values
            routes: List of route configurations to update
        """
        expected_length = sum(len(route.headways_by_interval) for route in routes)
        if len(headway_vector) != expected_length:
            raise ValueError(f"Vector length mismatch: {len(headway_vector)} vs {expected_length}")
        
        idx = 0
        for route in routes:
            for interval in range(len(route.headways_by_interval)):
                new_headway = headway_vector[idx]
                
                # Convert large values back to no service
                if new_headway >= self.no_service_headway_value * 0.9:
                    route.headways_by_interval[interval] = np.nan
                else:
                    route.headways_by_interval[interval] = new_headway
                    
                idx += 1

# =============================================================================
# VEHICLE CONSTRAINT CALCULATOR
# =============================================================================

class VehicleConstraintCalculator:
    """
    Calculates vehicle requirements and enforces fleet constraints.
    
    This class handles the relationship between headways and vehicle needs:
    vehicles_needed = round_trip_time / headway
    
    Key constraints supported:
    1. Total fleet size limits
    2. Percentage increase limits (vs baseline)
    3. Per-agency fleet limits
    4. Time-interval specific limits
    
    The calculation works as follows:
    1. For each route and time interval, calculate vehicles needed
    2. Sum across routes to get total vehicles by interval
    3. Peak interval determines total fleet size needed
    4. Check against various constraint types
    """
    
    def __init__(self, routes: List[RouteConfig], no_service_threshold: float = 9999.0):
        """
        Initialize vehicle calculator with route data.
        
        Args:
            routes: List of route configurations
            no_service_threshold: Threshold above which headway = no service
        """
        self.routes = routes
        self.no_service_threshold = no_service_threshold
        self.n_intervals = len(routes[0].headways_by_interval) if routes else 0
        
        # Calculate baseline vehicle requirements
        self.baseline_vehicles_by_interval = self._calculate_baseline_vehicles()
        self.baseline_total_vehicles = np.max(self.baseline_vehicles_by_interval)
    
    def _calculate_baseline_vehicles(self) -> np.ndarray:
        """
        Calculate current vehicle requirements from existing headways.
        
        This establishes the baseline for percentage-based constraints.
        
        Returns:
            np.ndarray: Vehicles needed for each time interval (current schedule)
        """
        vehicles_by_interval = np.zeros(self.n_intervals)
        
        for route in self.routes:
            for interval in range(self.n_intervals):
                headway = route.headways_by_interval[interval]
                if not np.isnan(headway) and headway > 0:
                    vehicles_needed = route.round_trip_time / headway
                    vehicles_by_interval[interval] += vehicles_needed
        
        return vehicles_by_interval
    
    def calculate_vehicles_needed(self, headway_vector: np.ndarray) -> np.ndarray:
        """
        Calculate vehicle requirements for each time interval.
        
        This is the core vehicle calculation:
        For each route and interval:
        - If service operates: vehicles = round_trip_time / headway
        - If no service: vehicles = 0
        Sum across all routes for each interval.
        
        Args:
            headway_vector: Flattened optimization vector
            
        Returns:
            np.ndarray: Total vehicles needed for each time interval
        """
        vehicles_by_interval = np.zeros(self.n_intervals)
        idx = 0
        
        for route in self.routes:
            for interval in range(self.n_intervals):
                headway = headway_vector[idx]
                
                # Only count vehicles for active service
                if headway < self.no_service_threshold * 0.9:
                    vehicles_needed = route.round_trip_time / headway
                    vehicles_by_interval[interval] += vehicles_needed
                
                idx += 1
        
        return vehicles_by_interval
    
    def constraint_total_fleet_size(self, headway_vector: np.ndarray, 
                                  max_fleet_size: float) -> float:
        """
        Constraint: total fleet size ≤ max_fleet_size.
        
        Args:
            headway_vector: Optimization vector
            max_fleet_size: Maximum allowed total vehicles
            
        Returns:
            float: Constraint value (≥ 0 for feasible solutions)
        """
        vehicles_by_interval = self.calculate_vehicles_needed(headway_vector)
        peak_vehicles = np.max(vehicles_by_interval)
        return max_fleet_size - peak_vehicles
    
    def constraint_percentage_increase(self, headway_vector: np.ndarray, 
                                     max_increase_percent: float) -> float:
        """
        Constraint: fleet size increase ≤ max_increase_percent.
        
        Useful for budget-constrained optimization where you want to limit
        the increase in vehicle requirements relative to current operations.
        
        Args:
            headway_vector: Optimization vector
            max_increase_percent: Maximum allowed increase (e.g., 20.0 for 20%)
            
        Returns:
            float: Constraint value (≥ 0 for feasible solutions)
        """
        vehicles_by_interval = self.calculate_vehicles_needed(headway_vector)
        peak_vehicles = np.max(vehicles_by_interval)
        
        max_allowed = self.baseline_total_vehicles * (1 + max_increase_percent / 100)
        return max_allowed - peak_vehicles
    
    def constraint_agency_specific(self, headway_vector: np.ndarray, 
                                 agency_limits: Dict[str, float]) -> List[float]:
        """
        Constraint: per-agency fleet limits.
        
        Useful when different agencies have different vehicle budgets
        or operational constraints.
        
        Args:
            headway_vector: Optimization vector
            agency_limits: Maximum vehicles per agency {agency_id: limit}
            
        Returns:
            List[float]: Constraint values for each agency (≥ 0 for feasible)
        """
        # Calculate vehicles by agency and interval
        agency_vehicles = {}
        idx = 0
        
        for route in self.routes:
            agency_id = route.agency_id
            if agency_id not in agency_vehicles:
                agency_vehicles[agency_id] = np.zeros(self.n_intervals)
            
            for interval in range(self.n_intervals):
                headway = headway_vector[idx]
                
                if headway < self.no_service_threshold * 0.9:
                    vehicles_needed = route.round_trip_time / headway
                    agency_vehicles[agency_id][interval] += vehicles_needed
                
                idx += 1
        
        # Check constraints for each agency
        constraints = []
        for agency_id, limit in agency_limits.items():
            if agency_id in agency_vehicles:
                peak_vehicles = np.max(agency_vehicles[agency_id])
                constraints.append(limit - peak_vehicles)
            else:
                constraints.append(limit)  # No vehicles used = always feasible
        
        return constraints

# =============================================================================
# MAIN OPTIMIZATION DATA STRUCTURE
# =============================================================================

class HeadwayOptimizationData:
    """
    Main class that coordinates all components for headway optimization.
    
    This class brings together:
    - GTFS data extraction (GTFSDataExtractor)
    - Route configuration management (RouteConfig)
    - Optimization constraints (OptimizationConstraints)
    - Vehicle calculations (VehicleConstraintCalculator)
    
    Responsibilities:
    1. Initialize and coordinate all sub-components
    2. Provide a clean interface for optimization algorithms
    3. Handle data conversion between different representations
    4. Generate summary reports and diagnostics
    
    Usage:
        opt_data = HeadwayOptimizationData(feed, gtfs_day="20230814")
        
        # Get data for optimization
        x = opt_data.get_optimization_vector()
        bounds = opt_data.get_bounds()
        
        # Use with optimization algorithm
        result = optimizer.minimize(objective, x, bounds=bounds)
        
        # Update with results
        opt_data.set_headways(result.x)
    """
    
    def __init__(self, 
                 genet_feed, 
                 gtfs_day: str = "20230814", 
                 interval_hours: int = 3,
                 user_min_headway: float = 5.0,
                 user_max_headway: float = 120.0,
                 min_headway_multiplier: float = 0.5,
                 max_headway_multiplier: float = 2.0,
                 default_operating_hours: Tuple[int, int] = (6, 22),
                 fallback_headway: float = 30.0,
                 no_service_headway_value: float = 9999.0,
                 default_round_trip_time: float = 60.0):
        """
        Initialize the complete optimization data structure.
        
        This constructor coordinates the initialization of all sub-components
        and builds the complete data structure needed for optimization.
        
        Args:
            genet_feed: Genet feed object containing GTFS data
            gtfs_day: Analysis date in YYYYMMDD format
            interval_hours: Duration of each time interval (must divide 24 evenly)
            user_min_headway: Absolute minimum headway constraint (minutes)
            user_max_headway: Absolute maximum headway constraint (minutes)
            min_headway_multiplier: Factor for data-driven lower bounds
            max_headway_multiplier: Factor for data-driven upper bounds
            default_operating_hours: Default service hours (start, end)
            fallback_headway: Default headway when calculation fails
            no_service_headway_value: Large value representing no service
            default_round_trip_time: Default round-trip time (minutes)
        """
        # Store configuration
        self.gtfs_day = gtfs_day
        self.interval_hours = interval_hours
        self.n_intervals = 24 // interval_hours
        self.default_operating_hours = default_operating_hours
        
        # Initialize sub-components
        print("=== INITIALIZING HEADWAY OPTIMIZATION DATA STRUCTURE ===")
        
        # 1. GTFS Data Extraction
        print("1. Extracting GTFS data...")
        self.extractor = GTFSDataExtractor(
            genet_feed=genet_feed,
            gtfs_day=gtfs_day,
            interval_hours=interval_hours,
            fallback_headway=fallback_headway,
            default_round_trip_time=default_round_trip_time
        )
        
        # 2. Extract route data
        route_data_list = self.extractor.extract_all_routes()
        
        # 3. Constraint Management
        print("2. Setting up optimization constraints...")
        self.constraints = OptimizationConstraints(
            user_min_headway=user_min_headway,
            user_max_headway=user_max_headway,
            min_headway_multiplier=min_headway_multiplier,
            max_headway_multiplier=max_headway_multiplier,
            no_service_headway_value=no_service_headway_value
        )
        
        # 4. Create RouteConfig objects
        print("3. Creating route configurations...")
        self.routes = []
        for route_data in route_data_list:
            min_headway, max_headway = self.constraints.calculate_route_bounds(
                route_data['headways_by_interval']
            )
            
            route_config = RouteConfig(
                service_id=route_data['service_id'],
                route_name=route_data['route_name'],
                agency_id=route_data['agency_id'],
                headways_by_interval=route_data['headways_by_interval'],
                min_headway=min_headway,
                max_headway=max_headway,
                operating_hours=default_operating_hours,
                route_color=route_data['route_color'],
                interval_hours=interval_hours,
                round_trip_time=route_data['round_trip_time']
            )
            
            self.routes.append(route_config)
        
        # 5. Vehicle Constraint Calculator
        print("4. Initializing vehicle constraint calculator...")
        self.vehicle_calculator = VehicleConstraintCalculator(
            routes=self.routes,
            no_service_threshold=no_service_headway_value
        )
        
        print(f"✅ Successfully initialized optimization data for {len(self.routes)} routes")
        print(f"   Baseline fleet size: {self.vehicle_calculator.baseline_total_vehicles:.1f} vehicles")
        print()
    
    # =============================================================================
    # CORE OPTIMIZATION INTERFACE
    # =============================================================================
    
    def get_optimization_vector(self) -> np.ndarray:
        """
        Get current headways as a flattened optimization vector.
        
        This is the main interface for optimization algorithms.
        Converts np.nan values to large numbers that optimizers can handle.
        
        Returns:
            np.ndarray: Flattened headway vector for optimization
        """
        return self.constraints.convert_to_optimization_vector(self.routes)
    
    def get_bounds(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        Get optimization bounds for all headway variables.
        
        Returns:
            Tuple[np.ndarray, np.ndarray]: (lower_bounds, upper_bounds)
        """
        return self.constraints.create_optimization_bounds(self.routes)
    
    def set_headways(self, headway_vector: np.ndarray):
        """
        Update route headways from optimization results.
        
        Args:
            headway_vector: Optimized headway values
        """
        self.constraints.convert_from_optimization_vector(headway_vector, self.routes)
    
    # =============================================================================
    # VEHICLE CONSTRAINT INTERFACE
    # =============================================================================
    
    def calculate_vehicles_needed(self, headway_vector: np.ndarray) -> np.ndarray:
        """Calculate vehicle requirements for each time interval."""
        return self.vehicle_calculator.calculate_vehicles_needed(headway_vector)
    
    def vehicle_constraint_total(self, headway_vector: np.ndarray, max_fleet_size: float) -> float:
        """Vehicle constraint: total fleet size ≤ max_fleet_size."""
        return self.vehicle_calculator.constraint_total_fleet_size(headway_vector, max_fleet_size)
    
    def vehicle_constraint_percent_increase(self, headway_vector: np.ndarray, 
                                          max_increase_percent: float) -> float:
        """Vehicle constraint: fleet size increase ≤ max_increase_percent."""
        return self.vehicle_calculator.constraint_percentage_increase(headway_vector, max_increase_percent)
    
    def vehicle_constraint_agency_specific(self, headway_vector: np.ndarray, 
                                         agency_limits: Dict[str, float]) -> List[float]:
        """Vehicle constraint: per-agency fleet limits."""
        return self.vehicle_calculator.constraint_agency_specific(headway_vector, agency_limits)
    
    # =============================================================================
    # SUMMARY AND REPORTING
    # =============================================================================
    
    def get_route_summary(self, include_interval_details: bool = True) -> pd.DataFrame:
        """
        Generate summary table of all routes with their configurations.
        
        Args:
            include_interval_details: Whether to include individual interval columns
            
        Returns:
            pd.DataFrame: Route summary with headway and constraint information
        """
        data = []
        for route in self.routes:
            active_headways = route.headways_by_interval[~np.isnan(route.headways_by_interval)]
            avg_headway = np.mean(active_headways) if len(active_headways) > 0 else np.nan
            
            row_data = {
                'service_id': route.service_id,
                'route_name': route.route_name,
                'agency': route.agency_id,
                'avg_headway': avg_headway,
                'min_headway': route.min_headway,
                'max_headway': route.max_headway,
                'intervals_with_service': np.sum(~np.isnan(route.headways_by_interval)),
                'route_color': route.route_color,
                'round_trip_time': route.round_trip_time
            }
            
            if include_interval_details:
                for i in range(self.n_intervals):
                    start_hour = i * self.interval_hours
                    end_hour = (i + 1) * self.interval_hours
                    headway_val = route.headways_by_interval[i]
                    row_data[f'headway_{start_hour:02d}-{end_hour:02d}h'] = headway_val
            
            data.append(row_data)
        
        return pd.DataFrame(data)
    
    def get_interval_labels(self) -> List[str]:
        """Get human-readable labels for time intervals."""
        labels = []
        for i in range(self.n_intervals):
            start_hour = i * self.interval_hours
            end_hour = (i + 1) * self.interval_hours
            labels.append(f"{start_hour:02d}-{end_hour:02d}h")
        return labels
    
    def get_optimization_info(self) -> Dict:
        """
        Get comprehensive information about the optimization setup.
        
        Returns:
            Dict: Detailed optimization configuration and statistics
        """
        current_vector = self.get_optimization_vector()
        valid_headways = current_vector[current_vector < self.constraints.no_service_headway_value * 0.9]
        no_service_count = len(current_vector) - len(valid_headways)
        
        return {
            'n_routes': len(self.routes),
            'n_intervals': self.n_intervals,
            'interval_hours': self.interval_hours,
            'total_variables': len(current_vector),
            'service_periods': len(valid_headways),
            'no_service_periods': no_service_count,
            'service_coverage_pct': 100 * len(valid_headways) / len(current_vector) if len(current_vector) > 0 else 0,
            'mean_headway': valid_headways.mean() if len(valid_headways) > 0 else np.nan,
            'min_headway': valid_headways.min() if len(valid_headways) > 0 else np.nan,
            'max_headway': valid_headways.max() if len(valid_headways) > 0 else np.nan,
            'std_headway': valid_headways.std() if len(valid_headways) > 0 else np.nan,
            'interval_labels': self.get_interval_labels(),
            'user_min_headway': self.constraints.user_min_headway,
            'user_max_headway': self.constraints.user_max_headway,
            'baseline_total_vehicles': self.vehicle_calculator.baseline_total_vehicles,
            'baseline_vehicles_by_interval': self.vehicle_calculator.baseline_vehicles_by_interval
        }

# =============================================================================
# EXAMPLE USAGE
# =============================================================================

# Create the optimization data structure
print("=== CREATING HEADWAY OPTIMIZATION DATA STRUCTURE ===")

opt_data = HeadwayOptimizationData(
    genet_feed=feed,
    gtfs_day="20230814",
    interval_hours=3,
    user_min_headway=5.0,
    user_max_headway=120.0,
    min_headway_multiplier=0.5,
    max_headway_multiplier=2.0,
    default_operating_hours=(6, 22),
    fallback_headway=30.0,
    no_service_headway_value=9999.0,
    default_round_trip_time=60.0
)

# Show core route summary
summary_df = opt_data.get_route_summary(include_interval_details=False)
print("Core route summary:")
print(summary_df[['service_id', 'route_name', 'agency', 'avg_headway', 'round_trip_time']].round(1).head(10))
print()

# Show optimization setup information
opt_info = opt_data.get_optimization_info()
print("Optimization setup:")
print(f"  Total decision variables: {opt_info['total_variables']} ({opt_info['n_routes']} routes × {opt_info['n_intervals']} intervals)")
print(f"  Time intervals: {opt_info['interval_labels']}")
print(f"  Service periods: {opt_info['service_periods']}/{opt_info['total_variables']} ({opt_info['service_coverage_pct']:.1f}%)")
print(f"  User constraints: {opt_info['user_min_headway']:.1f} - {opt_info['user_max_headway']:.1f} minutes")
print(f"  Baseline fleet size: {opt_info['baseline_total_vehicles']:.1f} vehicles")
print()

# Test the core optimization interface
current_vector = opt_data.get_optimization_vector()
min_bounds, max_bounds = opt_data.get_bounds()

print("Core optimization interface test:")
print(f"  Optimization vector length: {len(current_vector)}")
print(f"  Bounds vector length: {len(min_bounds)}")
print(f"  Sample headways: {current_vector[:12].round(1)}")
print(f"  Sample bounds: [{min_bounds[0]:.1f}, {max_bounds[0]:.1f}]")
print()

# Test vehicle constraint functions
vehicles_needed = opt_data.calculate_vehicles_needed(current_vector)
print("Vehicle constraint test:")
print(f"  Current vehicles by interval: {vehicles_needed.round(1)}")
print(f"  Peak vehicles needed: {np.max(vehicles_needed):.1f}")
print(f"  20% increase constraint: {opt_data.vehicle_constraint_percent_increase(current_vector, 20.0):.1f}")
print(f"  Total fleet constraint (100 vehicles): {opt_data.vehicle_constraint_total(current_vector, 100.0):.1f}")

=== CREATING HEADWAY OPTIMIZATION DATA STRUCTURE ===
=== INITIALIZING HEADWAY OPTIMIZATION DATA STRUCTURE ===
1. Extracting GTFS data...
Extracting GTFS data with 3-hour intervals...
Found 358 routes with headway data
⚠️  Failed to calculate round-trip time for 11855: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11855: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11878: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11878: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11878: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11878: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11878: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11896: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11896: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11896: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11896: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 1

In [47]:
import pandas as pd
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional
from datetime import datetime, timedelta
from abc import ABC, abstractmethod

# =============================================================================
# CORE DATA STRUCTURES
# =============================================================================

@dataclass
class RouteConfig:
    """
    Immutable configuration for a single transit route/service.
    
    This class represents a single transit route with time-varying headways
    across different periods of the day. Each route is divided into time
    intervals (e.g., 3-hour windows) with potentially different headway values.
    
    Attributes:
        service_id (str): Unique GTFS service identifier
        route_name (str): Human-readable route name for display
        agency_id (str): Transit agency that operates this route
        headways_by_interval (np.ndarray): Headway in minutes for each time interval
                                         (np.nan indicates no service in that interval)
        min_headway (float): Minimum operationally feasible headway (minutes)
        max_headway (float): Maximum operationally reasonable headway (minutes)
        operating_hours (Tuple[int, int]): Typical service hours (start_hour, end_hour)
        route_color (str): Hex color code for visualization (without #)
        interval_hours (int): Duration of each time interval in hours
        round_trip_time (float): Complete round-trip time in minutes (for vehicle calculations)
    
    Example:
        A route with 3-hour intervals might have:
        headways_by_interval = [30, 15, 15, 20, 30, 45, np.nan, np.nan]
        This means: 30min headway 00-03h, 15min 03-06h, ..., no service 18-24h
    """
    service_id: str
    route_name: str
    agency_id: str
    headways_by_interval: np.ndarray
    min_headway: float
    max_headway: float
    operating_hours: Tuple[int, int]
    route_color: str
    interval_hours: int
    round_trip_time: float

# =============================================================================
# GTFS DATA EXTRACTION LAYER
# =============================================================================

class GTFSDataExtractor:
    """
    Handles extraction and processing of transit data from GTFS feeds via Genet.
    
    This class is responsible for:
    1. Extracting headway statistics from GTFS data
    2. Calculating time-varying headways across different time periods
    3. Computing route metadata (names, agencies, colors)
    4. Estimating round-trip times from trip data
    5. Handling fallback scenarios when data is incomplete
    
    The extraction process works as follows:
    1. Use genet's headway_stats() to get basic route information
    2. For each route, analyze trips_to_dataframe() to calculate time-varying headways
    3. Split the day into intervals (e.g., 3-hour windows)
    4. Calculate average headway within each interval based on actual trip departures
    5. Handle edge cases (no trips, single trip, data errors)
    """
    
    def __init__(self, 
                 genet_feed,
                 gtfs_day: str = "20230814",
                 interval_hours: int = 3,
                 fallback_headway: float = 30.0,
                 default_round_trip_time: float = 60.0):
        """
        Initialize the GTFS data extractor.
        
        Args:
            genet_feed: Genet feed object containing GTFS data
            gtfs_day: Date for analysis in YYYYMMDD format
            interval_hours: Duration of each time interval (must divide 24 evenly)
            fallback_headway: Default headway when calculation fails
            default_round_trip_time: Default round-trip time when calculation fails
        """
        if 24 % interval_hours != 0:
            raise ValueError(f"interval_hours ({interval_hours}) must divide 24 evenly")
            
        self.feed = genet_feed
        self.gtfs_day = gtfs_day
        self.interval_hours = interval_hours
        self.n_intervals = 24 // interval_hours
        self.fallback_headway = fallback_headway
        self.default_round_trip_time = default_round_trip_time
    
    def extract_all_routes(self) -> List[Dict]:
        """
        Extract headway and metadata for all routes in the GTFS feed.
        
        This is the main entry point for data extraction. It:
        1. Gets basic headway statistics from genet
        2. Filters to valid routes (those with meaningful headway data)
        3. For each route, calculates detailed time-varying headways
        4. Extracts metadata (names, agencies, colors)
        5. Estimates round-trip times for vehicle calculations
        
        Returns:
            List[Dict]: List of dictionaries containing route data
                       Each dict has keys: service_id, route_id, avg_headway,
                       headways_by_interval, route_name, agency_id, route_color,
                       round_trip_time
        
        Raises:
            Exception: If extraction fails completely, falls back to simple uniform headways
        """
        try:
            print(f"Extracting GTFS data with {self.interval_hours}-hour intervals...")
            
            # Step 1: Get basic headway statistics from genet
            headway_df = self._get_basic_headway_stats()
            
            # Step 2: Process each route to get detailed time-varying data
            route_data_list = []
            for _, row in headway_df.iterrows():
                route_data = self._process_single_route(row)
                if route_data:  # Only add if processing succeeded
                    route_data_list.append(route_data)
            
            print(f"✅ Successfully extracted {len(route_data_list)} routes")
            return route_data_list
            
        except Exception as e:
            print(f"❌ GTFS extraction failed: {e}")
            print("Using fallback uniform headways...")
            return self._create_fallback_routes()
    
    def _get_basic_headway_stats(self) -> pd.DataFrame:
        """
        Get basic headway statistics from the genet feed.
        
        Uses genet's built-in headway_stats() method to get daily average
        headways for all routes. Filters out routes without valid headway data.
        
        Returns:
            pd.DataFrame: Routes with columns including service_id, route_id, 
                         mean_headway_mins, trip_count
        """
        headway_df = self.feed.headway_stats(gtfs_day=self.gtfs_day)
        
        # Include ALL routes with non-null headways (no arbitrary filtering)
        valid_routes = headway_df[headway_df['mean_headway_mins'].notna()].copy()
        
        print(f"Found {len(valid_routes)} routes with headway data")
        return valid_routes
    
    def _process_single_route(self, row: pd.Series) -> Optional[Dict]:
        """
        Process a single route to extract detailed time-varying headway data.
        
        For each route, this method:
        1. Calculates time-varying headways by analyzing trip departure times
        2. Extracts route metadata (name, agency, color)
        3. Estimates round-trip time from trip durations
        4. Packages everything into a dictionary for later processing
        
        Args:
            row: Pandas Series containing basic route info from headway_stats
            
        Returns:
            Dict or None: Route data dictionary, or None if processing failed
        """
        try:
            service_id = row['service_id']
            route_id = row.get('route_id', service_id)
            avg_headway = row['mean_headway_mins']
            
            # Calculate time-varying headways (core functionality)
            headways_by_interval = self._calculate_time_varying_headways(service_id, avg_headway)
            
            # Extract metadata
            route_name, agency_id, route_color = self._extract_route_metadata(route_id)
            
            # Calculate round-trip time for vehicle planning
            round_trip_time = self._calculate_round_trip_time(service_id)
            
            return {
                'service_id': service_id,
                'route_id': route_id,
                'avg_headway': avg_headway,
                'headways_by_interval': headways_by_interval,
                'route_name': route_name,
                'agency_id': agency_id,
                'route_color': route_color,
                'round_trip_time': round_trip_time
            }
            
        except Exception as e:
            print(f"⚠️  Failed to process route {row.get('service_id', 'unknown')}: {e}")
            return None
    
    def _calculate_time_varying_headways(self, service_id: str, avg_headway: float) -> np.ndarray:
        """
        Calculate headway values for each time interval throughout the day.
        
        This is the core algorithm for time-varying headway calculation:
        
        1. Get all trips for this service from GTFS data
        2. Extract departure hour from each trip
        3. For each time interval (e.g., 0-3h, 3-6h, etc.):
           a. Find all trips departing in that interval
           b. Sort trips by departure time
           c. Calculate time differences between consecutive trips
           d. Average these intervals to get headway for that period
        4. Handle special cases:
           - No trips → np.nan (no service)
           - One trip → interval duration (infrequent service)
           - Multiple trips → calculated average interval
        
        Args:
            service_id: GTFS service identifier
            avg_headway: Daily average headway as fallback
            
        Returns:
            np.ndarray: Headway values for each time interval (np.nan = no service)
            
        Example:
            For 3-hour intervals, returns array of length 8:
            [headway_00-03h, headway_03-06h, ..., headway_21-24h]
        """
        headways = np.full(self.n_intervals, np.nan)
        
        try:
            # Get trip data for this service
            trips_df = self.feed.trips_to_dataframe(gtfs_day=self.gtfs_day)
            service_trips = trips_df[trips_df['service_id'] == service_id].copy()
            
            # Handle edge cases
            if len(service_trips) == 0:
                return headways  # All np.nan - no service
            elif len(service_trips) == 1:
                # Single trip - assume it repeats every interval
                headways.fill(self.interval_hours * 60)  # Convert hours to minutes
                return headways
            
            # Extract hour component for interval assignment
            service_trips['departure_hour'] = service_trips['trip_departure_time'].dt.hour
            
            # Calculate headway for each time interval
            for interval in range(self.n_intervals):
                start_hour = interval * self.interval_hours
                end_hour = (interval + 1) * self.interval_hours
                
                # Find trips in this time window
                interval_trips = service_trips[
                    (service_trips['departure_hour'] >= start_hour) &
                    (service_trips['departure_hour'] < end_hour)
                ].copy()
                
                # Calculate headway for this specific interval
                headways[interval] = self._calculate_interval_headway(interval_trips, avg_headway)
            
            return headways
            
        except Exception as e:
            print(f"Failed to calculate time-varying headways for {service_id}: {e}")
            # Fallback: uniform headway across all intervals
            headways.fill(avg_headway)
            return headways
    
    def _calculate_interval_headway(self, interval_trips: pd.DataFrame, avg_headway: float) -> float:
        """
        Calculate headway for a specific time interval based on trip departures.
        
        This method handles the actual headway calculation for a single time period:
        
        For multiple trips:
        1. Sort trips by departure time
        2. Calculate time difference to next trip for each trip
        3. Filter out invalid intervals (≤0, outliers)
        4. Return average of valid intervals
        
        For single trip:
        - Return interval duration (assumes infrequent service)
        
        For no trips:
        - Return np.nan (no service)
        
        Args:
            interval_trips: DataFrame of trips in this time interval
            avg_headway: Fallback headway if calculation fails
            
        Returns:
            float: Calculated headway in minutes, or np.nan for no service
        """
        if len(interval_trips) >= 2:
            # Multiple trips - calculate actual intervals
            interval_trips = interval_trips.sort_values('trip_departure_time')
            
            # Calculate time to next departure for each trip
            interval_trips['next_departure'] = interval_trips['trip_departure_time'].shift(-1)
            interval_trips['interval_mins'] = (
                interval_trips['next_departure'] - interval_trips['trip_departure_time']
            ).dt.total_seconds() / 60
            
            # Filter valid intervals (positive, non-zero)
            valid_intervals = interval_trips['interval_mins'].dropna()
            valid_intervals = valid_intervals[valid_intervals > 0]
            
            if len(valid_intervals) > 0:
                return valid_intervals.mean()
            else:
                return avg_headway  # Fallback if no valid intervals
                
        elif len(interval_trips) == 1:
            # Single trip - assume it represents infrequent service
            return self.interval_hours * 60  # Convert hours to minutes
        else:
            # No trips - no service in this interval
            return np.nan
    
    def _extract_route_metadata(self, route_id: str) -> Tuple[str, str, str]:
        """
        Extract route metadata from the GTFS feed.
        
        Attempts to get route name, agency, and color from the genet route object.
        Provides sensible defaults if data is missing.
        
        Args:
            route_id: GTFS route identifier
            
        Returns:
            Tuple[str, str, str]: (route_name, agency_id, route_color)
        """
        try:
            route = self.feed.route(route_id)
            route_name = getattr(route, 'route_short_name', str(route_id))
            agency_id = getattr(route, 'agency_id', 'Unknown')
            route_color = getattr(route, 'route_color', '000000')
        except Exception as e:
            print(f"⚠️  Could not extract metadata for route {route_id}: {e}")
            route_name = str(route_id)
            agency_id = 'Unknown'
            route_color = '000000'
            
        return route_name, agency_id, route_color
    
    def _calculate_round_trip_time(self, service_id: str) -> float:
        """
        Estimate round-trip time by analyzing trip durations.
        
        This calculation is important for vehicle scheduling:
        vehicles_needed = round_trip_time / headway
        
        Process:
        1. Get all trips for this service
        2. Calculate duration of each trip (end_time - start_time)
        3. Take median duration as representative one-way time
        4. Estimate round-trip as 2 × one-way + 10% buffer for turnaround
        
        Args:
            service_id: GTFS service identifier
            
        Returns:
            float: Estimated round-trip time in minutes
        """
        try:
            trips_df = self.feed.trips_to_dataframe(gtfs_day=self.gtfs_day)
            service_trips = trips_df[trips_df['service_id'] == service_id].copy()
            
            if len(service_trips) == 0:
                return self.default_round_trip_time
            
            # Calculate individual trip durations
            service_trips['trip_duration'] = (
                service_trips['trip_end_time'] - service_trips['trip_departure_time']
            ).dt.total_seconds() / 60  # Convert to minutes
            
            # Filter to valid durations
            valid_durations = service_trips['trip_duration'].dropna()
            valid_durations = valid_durations[valid_durations > 0]
            
            if len(valid_durations) > 0:
                # Use median one-way time (robust to outliers)
                one_way_time = valid_durations.median()
                # Round-trip = 2 × one-way + 10% buffer for turnaround time
                round_trip_time = one_way_time * 2 * 1.1
                return round_trip_time
            else:
                return self.default_round_trip_time
                
        except Exception as e:
            print(f"⚠️  Failed to calculate round-trip time for {service_id}: {e}")
            return self.default_round_trip_time
    
    def _create_fallback_routes(self) -> List[Dict]:
        """
        Create simple uniform headway routes when extraction fails.
        
        This ensures the system can still operate even if detailed GTFS
        analysis fails. Creates routes with uniform headways across all intervals.
        
        Returns:
            List[Dict]: Simple route data with uniform headways
        """
        services_list = list(self.feed.services())
        fallback_routes = []
        
        for service in services_list:
            headways = np.full(self.n_intervals, self.fallback_headway)
            
            route_data = {
                'service_id': service.id,
                'route_id': service.id,
                'avg_headway': self.fallback_headway,
                'headways_by_interval': headways,
                'route_name': getattr(service, 'route_short_name', service.id),
                'agency_id': getattr(service, 'agency_id', 'Unknown'),
                'route_color': getattr(service, 'route_color', '000000'),
                'round_trip_time': self.default_round_trip_time
            }
            
            fallback_routes.append(route_data)
        
        return fallback_routes

# =============================================================================
# OPTIMIZATION CONSTRAINT MANAGER
# =============================================================================

class OptimizationConstraints:
    """
    Manages optimization bounds and constraints for headway variables.
    
    This class handles:
    1. Setting bounds for individual route headway variables
    2. Enforcing user-specified global constraints (min/max headways)
    3. Calculating data-driven bounds based on existing service patterns
    4. Converting between different headway representations (np.nan ↔ large values)
    
    The constraint system works as follows:
    - Each route has per-interval headway variables
    - User can set global min/max headway limits
    - Bounds can also be calculated from existing data (e.g., 50%-200% of current)
    - No-service periods (np.nan) are converted to large values for optimization
    """
    
    def __init__(self,
                 user_min_headway: float = 5.0,
                 user_max_headway: float = 120.0,
                 min_headway_multiplier: float = 0.5,
                 max_headway_multiplier: float = 2.0,
                 no_service_headway_value: float = 9999.0):
        """
        Initialize constraint manager with user preferences.
        
        Args:
            user_min_headway: Absolute minimum headway (minutes)
            user_max_headway: Absolute maximum headway (minutes)
            min_headway_multiplier: Factor for data-driven lower bounds
            max_headway_multiplier: Factor for data-driven upper bounds
            no_service_headway_value: Large value representing no service
        """
        self.user_min_headway = user_min_headway
        self.user_max_headway = user_max_headway
        self.min_headway_multiplier = min_headway_multiplier
        self.max_headway_multiplier = max_headway_multiplier
        self.no_service_headway_value = no_service_headway_value
    
    def calculate_route_bounds(self, headways_by_interval: np.ndarray) -> Tuple[float, float]:
        """
        Calculate optimization bounds for a single route.
        
        Combines user-specified constraints with data-driven bounds:
        1. Calculate data-driven bounds from existing headways
        2. Enforce user-specified global limits
        3. Ensure min ≤ max
        
        Args:
            headways_by_interval: Current headway values for the route
            
        Returns:
            Tuple[float, float]: (min_headway, max_headway) for this route
        """
        valid_headways = headways_by_interval[~np.isnan(headways_by_interval)]
        
        if len(valid_headways) > 0:
            # Calculate bounds based on existing data
            data_min = np.min(valid_headways) * self.min_headway_multiplier
            data_max = np.max(valid_headways) * self.max_headway_multiplier
            
            # Combine with user constraints (most restrictive wins)
            min_headway = max(self.user_min_headway, data_min)
            max_headway = max(min_headway, min(self.user_max_headway, data_max))
        else:
            # No existing service - use user bounds
            min_headway = self.user_min_headway
            max_headway = self.user_max_headway
            
        return min_headway, max_headway
    
    def create_optimization_bounds(self, routes: List[RouteConfig]) -> Tuple[np.ndarray, np.ndarray]:
        """
        Create bounds arrays for all optimization variables.
        
        The optimization vector is structured as:
        [route1_interval1, route1_interval2, ..., route1_intervalN,
         route2_interval1, route2_interval2, ..., route2_intervalN, ...]
        
        This method creates corresponding min/max bound arrays.
        
        Args:
            routes: List of route configurations
            
        Returns:
            Tuple[np.ndarray, np.ndarray]: (min_bounds, max_bounds) arrays
        """
        min_bounds = []
        max_bounds = []
        
        for route in routes:
            for interval in range(len(route.headways_by_interval)):
                # All intervals use the same route-level bounds
                # (optimization can decide whether to provide service)
                min_bounds.append(route.min_headway)
                max_bounds.append(route.max_headway)
        
        return np.array(min_bounds), np.array(max_bounds)
    
    def convert_to_optimization_vector(self, routes: List[RouteConfig]) -> np.ndarray:
        """
        Convert route headways to flat optimization vector.
        
        Handles the np.nan → large value conversion needed for optimization.
        Most optimization algorithms can't handle NaN values, so we convert
        no-service periods to a large headway value.
        
        Args:
            routes: List of route configurations
            
        Returns:
            np.ndarray: Flattened headway vector for optimization
        """
        all_headways = []
        for route in routes:
            for headway in route.headways_by_interval:
                if np.isnan(headway):
                    all_headways.append(self.no_service_headway_value)
                else:
                    all_headways.append(headway)
        return np.array(all_headways)
    
    def convert_from_optimization_vector(self, headway_vector: np.ndarray, 
                                       routes: List[RouteConfig]) -> None:
        """
        Update route configurations from optimization vector.
        
        Converts large headway values back to np.nan and updates the
        route configurations in-place.
        
        Args:
            headway_vector: Optimized headway values
            routes: List of route configurations to update
        """
        expected_length = sum(len(route.headways_by_interval) for route in routes)
        if len(headway_vector) != expected_length:
            raise ValueError(f"Vector length mismatch: {len(headway_vector)} vs {expected_length}")
        
        idx = 0
        for route in routes:
            for interval in range(len(route.headways_by_interval)):
                new_headway = headway_vector[idx]
                
                # Convert large values back to no service
                if new_headway >= self.no_service_headway_value * 0.9:
                    route.headways_by_interval[interval] = np.nan
                else:
                    route.headways_by_interval[interval] = new_headway
                    
                idx += 1

# =============================================================================
# VEHICLE CONSTRAINT CALCULATOR
# =============================================================================

class VehicleConstraintCalculator:
    """
    Calculates vehicle requirements and enforces fleet constraints.
    
    This class handles the relationship between headways and vehicle needs:
    vehicles_needed = round_trip_time / headway
    
    Key constraints supported:
    1. Total fleet size limits
    2. Percentage increase limits (vs baseline)
    3. Per-agency fleet limits
    4. Time-interval specific limits
    
    The calculation works as follows:
    1. For each route and time interval, calculate vehicles needed
    2. Sum across routes to get total vehicles by interval
    3. Peak interval determines total fleet size needed
    4. Check against various constraint types
    """
    
    def __init__(self, routes: List[RouteConfig], no_service_threshold: float = 9999.0):
        """
        Initialize vehicle calculator with route data.
        
        Args:
            routes: List of route configurations
            no_service_threshold: Threshold above which headway = no service
        """
        self.routes = routes
        self.no_service_threshold = no_service_threshold
        self.n_intervals = len(routes[0].headways_by_interval) if routes else 0
        
        # Calculate baseline vehicle requirements
        self.baseline_vehicles_by_interval = self._calculate_baseline_vehicles()
        self.baseline_total_vehicles = np.max(self.baseline_vehicles_by_interval)
    
    def _calculate_baseline_vehicles(self) -> np.ndarray:
        """
        Calculate current vehicle requirements from existing headways.
        
        This establishes the baseline for percentage-based constraints.
        
        Returns:
            np.ndarray: Vehicles needed for each time interval (current schedule)
        """
        vehicles_by_interval = np.zeros(self.n_intervals)
        
        for route in self.routes:
            for interval in range(self.n_intervals):
                headway = route.headways_by_interval[interval]
                if not np.isnan(headway) and headway > 0:
                    vehicles_needed = route.round_trip_time / headway
                    vehicles_by_interval[interval] += vehicles_needed
        
        return vehicles_by_interval
    
    def calculate_vehicles_needed(self, headway_vector: np.ndarray) -> np.ndarray:
        """
        Calculate vehicle requirements for each time interval.
        
        This is the core vehicle calculation:
        For each route and interval:
        - If service operates: vehicles = round_trip_time / headway
        - If no service: vehicles = 0
        Sum across all routes for each interval.
        
        Args:
            headway_vector: Flattened optimization vector
            
        Returns:
            np.ndarray: Total vehicles needed for each time interval
        """
        vehicles_by_interval = np.zeros(self.n_intervals)
        idx = 0
        
        for route in self.routes:
            for interval in range(self.n_intervals):
                headway = headway_vector[idx]
                
                # Only count vehicles for active service
                if headway < self.no_service_threshold * 0.9:
                    vehicles_needed = route.round_trip_time / headway
                    vehicles_by_interval[interval] += vehicles_needed
                
                idx += 1
        
        return vehicles_by_interval
    
    def constraint_total_fleet_size(self, headway_vector: np.ndarray, 
                                  max_fleet_size: float) -> float:
        """
        Constraint: total fleet size ≤ max_fleet_size.
        
        Args:
            headway_vector: Optimization vector
            max_fleet_size: Maximum allowed total vehicles
            
        Returns:
            float: Constraint value (≥ 0 for feasible solutions)
        """
        vehicles_by_interval = self.calculate_vehicles_needed(headway_vector)
        peak_vehicles = np.max(vehicles_by_interval)
        return max_fleet_size - peak_vehicles
    
    def constraint_percentage_increase(self, headway_vector: np.ndarray, 
                                     max_increase_percent: float) -> float:
        """
        Constraint: fleet size increase ≤ max_increase_percent.
        
        Useful for budget-constrained optimization where you want to limit
        the increase in vehicle requirements relative to current operations.
        
        Args:
            headway_vector: Optimization vector
            max_increase_percent: Maximum allowed increase (e.g., 20.0 for 20%)
            
        Returns:
            float: Constraint value (≥ 0 for feasible solutions)
        """
        vehicles_by_interval = self.calculate_vehicles_needed(headway_vector)
        peak_vehicles = np.max(vehicles_by_interval)
        
        max_allowed = self.baseline_total_vehicles * (1 + max_increase_percent / 100)
        return max_allowed - peak_vehicles
    
    def constraint_agency_specific(self, headway_vector: np.ndarray, 
                                 agency_limits: Dict[str, float]) -> List[float]:
        """
        Constraint: per-agency fleet limits.
        
        Useful when different agencies have different vehicle budgets
        or operational constraints.
        
        Args:
            headway_vector: Optimization vector
            agency_limits: Maximum vehicles per agency {agency_id: limit}
            
        Returns:
            List[float]: Constraint values for each agency (≥ 0 for feasible)
        """
        # Calculate vehicles by agency and interval
        agency_vehicles = {}
        idx = 0
        
        for route in self.routes:
            agency_id = route.agency_id
            if agency_id not in agency_vehicles:
                agency_vehicles[agency_id] = np.zeros(self.n_intervals)
            
            for interval in range(self.n_intervals):
                headway = headway_vector[idx]
                
                if headway < self.no_service_threshold * 0.9:
                    vehicles_needed = route.round_trip_time / headway
                    agency_vehicles[agency_id][interval] += vehicles_needed
                
                idx += 1
        
        # Check constraints for each agency
        constraints = []
        for agency_id, limit in agency_limits.items():
            if agency_id in agency_vehicles:
                peak_vehicles = np.max(agency_vehicles[agency_id])
                constraints.append(limit - peak_vehicles)
            else:
                constraints.append(limit)  # No vehicles used = always feasible
        
        return constraints

# =============================================================================
# MAIN OPTIMIZATION DATA STRUCTURE
# =============================================================================

class HeadwayOptimizationData:
    """
    Main class that coordinates all components for headway optimization.
    
    This class brings together:
    - GTFS data extraction (GTFSDataExtractor)
    - Route configuration management (RouteConfig)
    - Optimization constraints (OptimizationConstraints)
    - Vehicle calculations (VehicleConstraintCalculator)
    
    Responsibilities:
    1. Initialize and coordinate all sub-components
    2. Provide a clean interface for optimization algorithms
    3. Handle data conversion between different representations
    4. Generate summary reports and diagnostics
    
    Usage:
        opt_data = HeadwayOptimizationData(feed, gtfs_day="20230814")
        
        # Get data for optimization
        x = opt_data.get_optimization_vector()
        bounds = opt_data.get_bounds()
        
        # Use with optimization algorithm
        result = optimizer.minimize(objective, x, bounds=bounds)
        
        # Update with results
        opt_data.set_headways(result.x)
    """
    
    def __init__(self, 
                 genet_feed, 
                 gtfs_day: str = "20230814", 
                 interval_hours: int = 3,
                 user_min_headway: float = 5.0,
                 user_max_headway: float = 120.0,
                 min_headway_multiplier: float = 0.5,
                 max_headway_multiplier: float = 2.0,
                 default_operating_hours: Tuple[int, int] = (6, 22),
                 fallback_headway: float = 30.0,
                 no_service_headway_value: float = 9999.0,
                 default_round_trip_time: float = 60.0):
        """
        Initialize the complete optimization data structure.
        
        This constructor coordinates the initialization of all sub-components
        and builds the complete data structure needed for optimization.
        
        Args:
            genet_feed: Genet feed object containing GTFS data
            gtfs_day: Analysis date in YYYYMMDD format
            interval_hours: Duration of each time interval (must divide 24 evenly)
            user_min_headway: Absolute minimum headway constraint (minutes)
            user_max_headway: Absolute maximum headway constraint (minutes)
            min_headway_multiplier: Factor for data-driven lower bounds
            max_headway_multiplier: Factor for data-driven upper bounds
            default_operating_hours: Default service hours (start, end)
            fallback_headway: Default headway when calculation fails
            no_service_headway_value: Large value representing no service
            default_round_trip_time: Default round-trip time (minutes)
        """
        # Store configuration
        self.gtfs_day = gtfs_day
        self.interval_hours = interval_hours
        self.n_intervals = 24 // interval_hours
        self.default_operating_hours = default_operating_hours
        
        # Initialize sub-components
        print("=== INITIALIZING HEADWAY OPTIMIZATION DATA STRUCTURE ===")
        
        # 1. GTFS Data Extraction
        print("1. Extracting GTFS data...")
        self.extractor = GTFSDataExtractor(
            genet_feed=genet_feed,
            gtfs_day=gtfs_day,
            interval_hours=interval_hours,
            fallback_headway=fallback_headway,
            default_round_trip_time=default_round_trip_time
        )
        
        # 2. Extract route data
        route_data_list = self.extractor.extract_all_routes()
        
        # 3. Constraint Management
        print("2. Setting up optimization constraints...")
        self.constraints = OptimizationConstraints(
            user_min_headway=user_min_headway,
            user_max_headway=user_max_headway,
            min_headway_multiplier=min_headway_multiplier,
            max_headway_multiplier=max_headway_multiplier,
            no_service_headway_value=no_service_headway_value
        )
        
        # 4. Create RouteConfig objects
        print("3. Creating route configurations...")
        self.routes = []
        for route_data in route_data_list:
            min_headway, max_headway = self.constraints.calculate_route_bounds(
                route_data['headways_by_interval']
            )
            
            route_config = RouteConfig(
                service_id=route_data['service_id'],
                route_name=route_data['route_name'],
                agency_id=route_data['agency_id'],
                headways_by_interval=route_data['headways_by_interval'],
                min_headway=min_headway,
                max_headway=max_headway,
                operating_hours=default_operating_hours,
                route_color=route_data['route_color'],
                interval_hours=interval_hours,
                round_trip_time=route_data['round_trip_time']
            )
            
            self.routes.append(route_config)
        
        # 5. Vehicle Constraint Calculator
        print("4. Initializing vehicle constraint calculator...")
        self.vehicle_calculator = VehicleConstraintCalculator(
            routes=self.routes,
            no_service_threshold=no_service_headway_value
        )
        
        print(f"✅ Successfully initialized optimization data for {len(self.routes)} routes")
        print(f"   Baseline fleet size: {self.vehicle_calculator.baseline_total_vehicles:.1f} vehicles")
        print()
    
    # =============================================================================
    # CORE OPTIMIZATION INTERFACE
    # =============================================================================
    
    def get_optimization_vector(self) -> np.ndarray:
        """
        Get current headways as a flattened optimization vector.
        
        This is the main interface for optimization algorithms.
        Converts np.nan values to large numbers that optimizers can handle.
        
        Returns:
            np.ndarray: Flattened headway vector for optimization
        """
        return self.constraints.convert_to_optimization_vector(self.routes)
    
    def get_bounds(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        Get optimization bounds for all headway variables.
        
        Returns:
            Tuple[np.ndarray, np.ndarray]: (lower_bounds, upper_bounds)
        """
        return self.constraints.create_optimization_bounds(self.routes)
    
    def set_headways(self, headway_vector: np.ndarray):
        """
        Update route headways from optimization results.
        
        Args:
            headway_vector: Optimized headway values
        """
        self.constraints.convert_from_optimization_vector(headway_vector, self.routes)
    
    # =============================================================================
    # VEHICLE CONSTRAINT INTERFACE
    # =============================================================================
    
    def calculate_vehicles_needed(self, headway_vector: np.ndarray) -> np.ndarray:
        """Calculate vehicle requirements for each time interval."""
        return self.vehicle_calculator.calculate_vehicles_needed(headway_vector)
    
    def vehicle_constraint_total(self, headway_vector: np.ndarray, max_fleet_size: float) -> float:
        """Vehicle constraint: total fleet size ≤ max_fleet_size."""
        return self.vehicle_calculator.constraint_total_fleet_size(headway_vector, max_fleet_size)
    
    def vehicle_constraint_percent_increase(self, headway_vector: np.ndarray, 
                                          max_increase_percent: float) -> float:
        """Vehicle constraint: fleet size increase ≤ max_increase_percent."""
        return self.vehicle_calculator.constraint_percentage_increase(headway_vector, max_increase_percent)
    
    def vehicle_constraint_agency_specific(self, headway_vector: np.ndarray, 
                                         agency_limits: Dict[str, float]) -> List[float]:
        """Vehicle constraint: per-agency fleet limits."""
        return self.vehicle_calculator.constraint_agency_specific(headway_vector, agency_limits)
    
    # =============================================================================
    # SUMMARY AND REPORTING
    # =============================================================================
    
    def get_route_summary(self, include_interval_details: bool = True) -> pd.DataFrame:
        """
        Generate summary table of all routes with their configurations.
        
        Args:
            include_interval_details: Whether to include individual interval columns
            
        Returns:
            pd.DataFrame: Route summary with headway and constraint information
        """
        data = []
        for route in self.routes:
            active_headways = route.headways_by_interval[~np.isnan(route.headways_by_interval)]
            avg_headway = np.mean(active_headways) if len(active_headways) > 0 else np.nan
            
            row_data = {
                'service_id': route.service_id,
                'route_name': route.route_name,
                'agency': route.agency_id,
                'avg_headway': avg_headway,
                'min_headway': route.min_headway,
                'max_headway': route.max_headway,
                'intervals_with_service': np.sum(~np.isnan(route.headways_by_interval)),
                'route_color': route.route_color,
                'round_trip_time': route.round_trip_time
            }
            
            if include_interval_details:
                for i in range(self.n_intervals):
                    start_hour = i * self.interval_hours
                    end_hour = (i + 1) * self.interval_hours
                    headway_val = route.headways_by_interval[i]
                    row_data[f'headway_{start_hour:02d}-{end_hour:02d}h'] = headway_val
            
            data.append(row_data)
        
        return pd.DataFrame(data)
    
    def get_interval_labels(self) -> List[str]:
        """Get human-readable labels for time intervals."""
        labels = []
        for i in range(self.n_intervals):
            start_hour = i * self.interval_hours
            end_hour = (i + 1) * self.interval_hours
            labels.append(f"{start_hour:02d}-{end_hour:02d}h")
        return labels
    
    def get_optimization_info(self) -> Dict:
        """
        Get comprehensive information about the optimization setup.
        
        Returns:
            Dict: Detailed optimization configuration and statistics
        """
        current_vector = self.get_optimization_vector()
        valid_headways = current_vector[current_vector < self.constraints.no_service_headway_value * 0.9]
        no_service_count = len(current_vector) - len(valid_headways)
        
        return {
            'n_routes': len(self.routes),
            'n_intervals': self.n_intervals,
            'interval_hours': self.interval_hours,
            'total_variables': len(current_vector),
            'service_periods': len(valid_headways),
            'no_service_periods': no_service_count,
            'service_coverage_pct': 100 * len(valid_headways) / len(current_vector) if len(current_vector) > 0 else 0,
            'mean_headway': valid_headways.mean() if len(valid_headways) > 0 else np.nan,
            'min_headway': valid_headways.min() if len(valid_headways) > 0 else np.nan,
            'max_headway': valid_headways.max() if len(valid_headways) > 0 else np.nan,
            'std_headway': valid_headways.std() if len(valid_headways) > 0 else np.nan,
            'interval_labels': self.get_interval_labels(),
            'user_min_headway': self.constraints.user_min_headway,
            'user_max_headway': self.constraints.user_max_headway,
            'baseline_total_vehicles': self.vehicle_calculator.baseline_total_vehicles,
            'baseline_vehicles_by_interval': self.vehicle_calculator.baseline_vehicles_by_interval
        }

# =============================================================================
# EXAMPLE USAGE
# =============================================================================

# Create the optimization data structure
print("=== CREATING HEADWAY OPTIMIZATION DATA STRUCTURE ===")

opt_data = HeadwayOptimizationData(
    genet_feed=feed,
    gtfs_day="20230814",
    interval_hours=3,
    user_min_headway=5.0,
    user_max_headway=120.0,
    min_headway_multiplier=0.5,
    max_headway_multiplier=2.0,
    default_operating_hours=(6, 22),
    fallback_headway=30.0,
    no_service_headway_value=9999.0,
    default_round_trip_time=60.0
)

# Show core route summary
summary_df = opt_data.get_route_summary(include_interval_details=False)
print("Core route summary:")
print(summary_df[['service_id', 'route_name', 'agency', 'avg_headway', 'round_trip_time']].round(1).head(10))
print()

# Show optimization setup information
opt_info = opt_data.get_optimization_info()
print("Optimization setup:")
print(f"  Total decision variables: {opt_info['total_variables']} ({opt_info['n_routes']} routes × {opt_info['n_intervals']} intervals)")
print(f"  Time intervals: {opt_info['interval_labels']}")
print(f"  Service periods: {opt_info['service_periods']}/{opt_info['total_variables']} ({opt_info['service_coverage_pct']:.1f}%)")
print(f"  User constraints: {opt_info['user_min_headway']:.1f} - {opt_info['user_max_headway']:.1f} minutes")
print(f"  Baseline fleet size: {opt_info['baseline_total_vehicles']:.1f} vehicles")
print()

# Test the core optimization interface
current_vector = opt_data.get_optimization_vector()
min_bounds, max_bounds = opt_data.get_bounds()

print("Core optimization interface test:")
print(f"  Optimization vector length: {len(current_vector)}")
print(f"  Bounds vector length: {len(min_bounds)}")
print(f"  Sample headways: {current_vector[:12].round(1)}")
print(f"  Sample bounds: [{min_bounds[0]:.1f}, {max_bounds[0]:.1f}]")
print()

# Test vehicle constraint functions
vehicles_needed = opt_data.calculate_vehicles_needed(current_vector)
print("Vehicle constraint test:")
print(f"  Current vehicles by interval: {vehicles_needed.round(1)}")
print(f"  Peak vehicles needed: {np.max(vehicles_needed):.1f}")
print(f"  20% increase constraint: {opt_data.vehicle_constraint_percent_increase(current_vector, 20.0):.1f}")
print(f"  Total fleet constraint (100 vehicles): {opt_data.vehicle_constraint_total(current_vector, 100.0):.1f}")

=== CREATING HEADWAY OPTIMIZATION DATA STRUCTURE ===
=== INITIALIZING HEADWAY OPTIMIZATION DATA STRUCTURE ===
1. Extracting GTFS data...
Extracting GTFS data with 3-hour intervals...
Found 358 routes with headway data
⚠️  Failed to calculate round-trip time for 11855: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11855: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11878: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11878: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11878: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11878: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11878: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11896: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11896: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11896: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 11896: 'trip_end_time'
⚠️  Failed to calculate round-trip time for 1

In [48]:
import pandas as pd
import numpy as np
from typing import Tuple, Dict, List

def calculate_route_round_trip_time_from_stops(genet_feed, 
                                             service_id: str, 
                                             gtfs_day: str = "20230814",
                                             turnaround_buffer: float = 1.15,
                                             default_round_trip_time: float = 60.0,
                                             verbose: bool = True) -> Tuple[float, Dict]:
    """
    Calculate round-trip time for a route using actual stop times data.
    
    This method is much more accurate than using trip-level start/end times because:
    1. It uses actual scheduled stop times from stop_times.txt
    2. It calculates the full journey time from first to last stop
    3. It accounts for real operational patterns, not assumptions
    
    Args:
        genet_feed: Genet feed object
        service_id: GTFS service identifier  
        gtfs_day: Analysis date
        turnaround_buffer: Multiplier for round-trip (e.g., 1.15 = 15% buffer)
        default_round_trip_time: Fallback value if calculation fails
        verbose: Whether to print diagnostic information
        
    Returns:
        Tuple[float, Dict]: (calculated_round_trip_time, diagnostic_info)
    """
    
    diagnostics = {
        'service_id': service_id,
        'method': 'stop_times_analysis',
        'success': False,
        'trips_found': 0,
        'trips_with_valid_stops': 0,
        'valid_durations_count': 0,
        'median_trip_duration': None,
        'min_trip_duration': None,
        'max_trip_duration': None,
        'calculated_round_trip': None,
        'turnaround_buffer_used': turnaround_buffer,
        'fallback_used': False,
        'error_message': None
    }
    
    try:
        if verbose:
            print(f"\n=== CALCULATING ROUND-TRIP TIME FOR SERVICE {service_id} ===")
        
        # Step 1: Get all trips for this service
        trips_df = genet_feed.trips_to_dataframe(gtfs_day=gtfs_day)
        service_trips = trips_df[trips_df['service_id'] == service_id]
        diagnostics['trips_found'] = len(service_trips)
        
        if verbose:
            print(f"Found {len(service_trips)} trips for service {service_id}")
        
        if len(service_trips) == 0:
            diagnostics['error_message'] = 'No trips found for service'
            if verbose:
                print("❌ No trips found - using default")
            diagnostics['fallback_used'] = True
            return default_round_trip_time, diagnostics
        
        # Step 2: Get stop times data for all trips
        if verbose:
            print("Analyzing stop times for each trip...")
        
        trip_durations = []
        trips_processed = 0
        
        for trip_id in service_trips['trip_id']:
            try:
                # Get stop times for this specific trip
                stop_times_df = genet_feed.stop_times_to_dataframe(trip_ids=[trip_id])
                
                if len(stop_times_df) < 2:
                    if verbose:
                        print(f"  Trip {trip_id}: Only {len(stop_times_df)} stops - skipping")
                    continue
                
                # Sort by stop sequence to ensure correct order
                stop_times_df = stop_times_df.sort_values('stop_sequence')
                
                # Get first and last stop times
                first_stop_time = stop_times_df.iloc[0]['departure_time']
                last_stop_time = stop_times_df.iloc[-1]['arrival_time']
                
                # Calculate trip duration in minutes
                duration_minutes = (last_stop_time - first_stop_time).total_seconds() / 60
                
                if duration_minutes > 0:  # Valid duration
                    trip_durations.append(duration_minutes)
                    trips_processed += 1
                    
                    if verbose and trips_processed <= 3:  # Show first few for debugging
                        print(f"  Trip {trip_id}: {len(stop_times_df)} stops, "
                              f"{duration_minutes:.1f} minutes "
                              f"({first_stop_time.strftime('%H:%M')} → {last_stop_time.strftime('%H:%M')})")
                elif verbose:
                    print(f"  Trip {trip_id}: Invalid duration ({duration_minutes:.1f} min) - skipping")
                    
            except Exception as e:
                if verbose:
                    print(f"  Trip {trip_id}: Error processing stop times - {e}")
                continue
        
        diagnostics['trips_with_valid_stops'] = trips_processed
        diagnostics['valid_durations_count'] = len(trip_durations)
        
        if verbose:
            print(f"Successfully processed {trips_processed} trips with valid stop times")
        
        # Step 3: Calculate statistics and round-trip time
        if len(trip_durations) == 0:
            diagnostics['error_message'] = 'No valid trip durations found'
            if verbose:
                print("❌ No valid trip durations found - using default")
            diagnostics['fallback_used'] = True
            return default_round_trip_time, diagnostics
        
        # Calculate trip duration statistics
        trip_durations = np.array(trip_durations)
        median_duration = np.median(trip_durations)
        min_duration = np.min(trip_durations)
        max_duration = np.max(trip_durations)
        
        # Calculate round-trip time with buffer
        round_trip_time = median_duration * 2 * turnaround_buffer
        
        # Store diagnostics
        diagnostics['median_trip_duration'] = float(median_duration)
        diagnostics['min_trip_duration'] = float(min_duration)
        diagnostics['max_trip_duration'] = float(max_duration)
        diagnostics['calculated_round_trip'] = float(round_trip_time)
        diagnostics['success'] = True
        
        if verbose:
            print(f"\n📊 TRIP DURATION ANALYSIS:")
            print(f"  Valid trips analyzed: {len(trip_durations)}")
            print(f"  Trip duration range: {min_duration:.1f} - {max_duration:.1f} minutes")
            print(f"  Median trip duration: {median_duration:.1f} minutes")
            print(f"  Estimated round-trip: {median_duration:.1f} × 2 × {turnaround_buffer} = {round_trip_time:.1f} minutes")
            print(f"✅ Round-trip calculation successful!")
        
        return round_trip_time, diagnostics
        
    except Exception as e:
        diagnostics['error_message'] = str(e)
        diagnostics['fallback_used'] = True
        if verbose:
            print(f"❌ Error in round-trip calculation: {e}")
            print(f"Using default round-trip time: {default_round_trip_time} minutes")
        return default_round_trip_time, diagnostics


def test_round_trip_calculation(genet_feed, gtfs_day: str = "20230814", max_routes: int = 5):
    """
    Test the round-trip calculation on multiple routes to see how it works.
    """
    print("=== TESTING ROUND-TRIP TIME CALCULATION ===")
    
    # Get some routes to test
    headway_df = genet_feed.headway_stats(gtfs_day=gtfs_day)
    test_routes = headway_df.head(max_routes)
    
    results = []
    
    for _, route_info in test_routes.iterrows():
        service_id = route_info['service_id']
        avg_headway = route_info['mean_headway_mins']
        
        print(f"\n{'='*60}")
        print(f"TESTING SERVICE: {service_id}")
        print(f"Average headway: {avg_headway:.1f} minutes")
        
        # Calculate round-trip time
        round_trip_time, diagnostics = calculate_route_round_trip_time_from_stops(
            genet_feed, service_id, gtfs_day, verbose=True
        )
        
        # Calculate vehicles needed
        if diagnostics['success']:
            vehicles_needed = round_trip_time / avg_headway
            print(f"\n🚌 VEHICLE REQUIREMENT:")
            print(f"  Vehicles needed: {round_trip_time:.1f} ÷ {avg_headway:.1f} = {vehicles_needed:.1f} vehicles")
        
        results.append({
            'service_id': service_id,
            'avg_headway': avg_headway,
            'round_trip_time': round_trip_time,
            'vehicles_needed': round_trip_time / avg_headway if avg_headway > 0 else np.nan,
            'calculation_success': diagnostics['success'],
            'trips_analyzed': diagnostics['valid_durations_count'],
            'median_trip_duration': diagnostics['median_trip_duration']
        })
    
    # Summary table
    print(f"\n{'='*60}")
    print("SUMMARY OF ALL ROUTES TESTED:")
    results_df = pd.DataFrame(results)
    print(results_df.round(1).to_string(index=False))
    
    # Success rate
    success_rate = results_df['calculation_success'].mean() * 100
    print(f"\nCalculation success rate: {success_rate:.1f}%")
    
    return results_df

# TEST THE NEW CALCULATION
print("Testing the improved round-trip calculation method...")
results_df = test_round_trip_calculation(feed, gtfs_day="20230814", max_routes=5)

Testing the improved round-trip calculation method...
=== TESTING ROUND-TRIP TIME CALCULATION ===

TESTING SERVICE: 11855
Average headway: nan minutes

=== CALCULATING ROUND-TRIP TIME FOR SERVICE 11855 ===
Found 82 trips for service 11855
Analyzing stop times for each trip...
  Trip VJa79a2689eef92ae5e0f24ae32b324b43c8f58ced: Error processing stop times - 'Schedule' object has no attribute 'stop_times_to_dataframe'
  Trip VJ039542b7a26132c6c50b77f760fa8aa838b1fafd: Error processing stop times - 'Schedule' object has no attribute 'stop_times_to_dataframe'
  Trip VJ06f4e90aa3ce001a40260ef60fa9ad728b9a2020: Error processing stop times - 'Schedule' object has no attribute 'stop_times_to_dataframe'
  Trip VJ0c500e46af16ae241e984cf66e755390d079bbde: Error processing stop times - 'Schedule' object has no attribute 'stop_times_to_dataframe'
  Trip VJ0ea68314b6135a0907a563ac9bfdb0005b8a3f23: Error processing stop times - 'Schedule' object has no attribute 'stop_times_to_dataframe'
  Trip VJ11b2

In [54]:
def calculate_route_round_trip_time_fixed(genet_feed, 
                                        service_id: str, 
                                        gtfs_day: str = "20230814",
                                        turnaround_buffer: float = 1.15,
                                        default_round_trip_time: float = 60.0,
                                        verbose: bool = True) -> Tuple[float, Dict]:
    """
    Calculate round-trip time using genet's trips_with_stops_to_dataframe method.
    This method provides stop-level timing data for accurate calculations.
    """
    
    diagnostics = {
        'service_id': service_id,
        'method': 'trips_with_stops_analysis',
        'success': False,
        'trips_found': 0,
        'trips_with_valid_stops': 0,
        'valid_durations_count': 0,
        'median_trip_duration': None,
        'calculated_round_trip': None,
        'turnaround_buffer_used': turnaround_buffer,
        'fallback_used': False,
        'error_message': None
    }
    
    try:
        if verbose:
            print(f"\n=== CALCULATING ROUND-TRIP TIME FOR SERVICE {service_id} ===")
        
        # Step 1: Get trips with stops data
        trips_with_stops_df = genet_feed.trips_with_stops_to_dataframe(gtfs_day=gtfs_day)
        
        if verbose:
            print(f"Retrieved trips_with_stops data: {len(trips_with_stops_df)} records")
            if len(trips_with_stops_df) > 0:
                print(f"Columns available: {list(trips_with_stops_df.columns)}")
        
        # Step 2: Filter to our service
        service_data = trips_with_stops_df[trips_with_stops_df['service_id'] == service_id]
        unique_trips = service_data['trip_id'].unique() if len(service_data) > 0 else []
        diagnostics['trips_found'] = len(unique_trips)
        
        if verbose:
            print(f"Found {len(service_data)} stop records for service {service_id}")
            print(f"Covering {len(unique_trips)} unique trips")
        
        if len(service_data) == 0:
            diagnostics['error_message'] = 'No stop data found for service'
            if verbose:
                print("❌ No stop data found - using default")
            diagnostics['fallback_used'] = True
            return default_round_trip_time, diagnostics
        
        # Step 3: Calculate trip durations
        trip_durations = []
        
        for trip_id in unique_trips:
            trip_stops = service_data[service_data['trip_id'] == trip_id].copy()
            
            if len(trip_stops) < 2:
                if verbose and len(trip_durations) < 3:
                    print(f"  Trip {trip_id}: Only {len(trip_stops)} stops - skipping")
                continue
                
            # Sort by departure time to get chronological order
            trip_stops = trip_stops.sort_values('departure_time')
            
            # Get first and last stop times
            first_stop = trip_stops.iloc[0]
            last_stop = trip_stops.iloc[-1]
            
            # Use departure_time for first stop, arrival_time for last stop
            start_time = first_stop['departure_time']
            end_time = last_stop['arrival_time']
            
            # Calculate duration in minutes
            duration_minutes = (end_time - start_time).total_seconds() / 60
            
            if duration_minutes > 0:
                trip_durations.append(duration_minutes)
                
                if verbose and len(trip_durations) <= 3:  # Show first few for debugging
                    print(f"  Trip {trip_id}: {len(trip_stops)} stops, "
                          f"{duration_minutes:.1f} minutes "
                          f"({start_time.strftime('%H:%M')} → {end_time.strftime('%H:%M')})")
            elif verbose and len(trip_durations) < 3:
                print(f"  Trip {trip_id}: Invalid duration ({duration_minutes:.1f} min) - skipping")
        
        diagnostics['trips_with_valid_stops'] = len(trip_durations)
        diagnostics['valid_durations_count'] = len(trip_durations)
        
        if verbose:
            print(f"Successfully calculated durations for {len(trip_durations)} trips")
        
        # Step 4: Calculate round-trip time
        if len(trip_durations) == 0:
            diagnostics['error_message'] = 'No valid trip durations calculated'
            if verbose:
                print("❌ No valid trip durations - using default")
            diagnostics['fallback_used'] = True
            return default_round_trip_time, diagnostics
        
        # Use median duration (robust to outliers) and apply turnaround buffer
        trip_durations_array = np.array(trip_durations)
        median_duration = np.median(trip_durations_array)
        round_trip_time = median_duration * 2 * turnaround_buffer
        
        diagnostics['median_trip_duration'] = float(median_duration)
        diagnostics['calculated_round_trip'] = float(round_trip_time)
        diagnostics['success'] = True
        
        if verbose:
            print(f"\n📊 TRIP DURATION ANALYSIS:")
            print(f"  Valid trips analyzed: {len(trip_durations)}")
            print(f"  Duration range: {np.min(trip_durations_array):.1f} - {np.max(trip_durations_array):.1f} minutes")
            print(f"  Median trip duration: {median_duration:.1f} minutes")
            print(f"  Round-trip calculation: {median_duration:.1f} × 2 × {turnaround_buffer} = {round_trip_time:.1f} minutes")
            print(f"✅ Round-trip calculation successful!")
        
        return round_trip_time, diagnostics
        
    except Exception as e:
        diagnostics['error_message'] = str(e)
        diagnostics['fallback_used'] = True
        if verbose:
            print(f"❌ Error in round-trip calculation: {e}")
            print(f"Using default round-trip time: {default_round_trip_time} minutes")
        return default_round_trip_time, diagnostics


def test_fixed_round_trip_calculation(genet_feed, gtfs_day: str = "20230814", max_routes: int = 5):
    """
    Test the fixed round-trip calculation method.
    """
    print("=== TESTING FIXED ROUND-TRIP TIME CALCULATION ===")
    
    # Get some routes to test
    headway_df = genet_feed.headway_stats(gtfs_day=gtfs_day)
    test_routes = headway_df.head(max_routes)
    
    results = []
    
    for _, route_info in test_routes.iterrows():
        service_id = route_info['service_id']
        avg_headway = route_info['mean_headway_mins']
        
        print(f"\n{'='*60}")
        print(f"TESTING SERVICE: {service_id}")
        print(f"Average headway: {avg_headway:.1f} minutes")
        
        # Calculate round-trip time using the fixed method
        round_trip_time, diagnostics = calculate_route_round_trip_time_fixed(
            genet_feed, service_id, gtfs_day, verbose=True
        )
        
        # Calculate vehicles needed
        if diagnostics['success'] and avg_headway > 0:
            vehicles_needed = round_trip_time / avg_headway
            print(f"\n🚌 VEHICLE REQUIREMENT:")
            print(f"  Vehicles needed: {round_trip_time:.1f} ÷ {avg_headway:.1f} = {vehicles_needed:.1f} vehicles")
        
        results.append({
            'service_id': service_id,
            'avg_headway': avg_headway,
            'round_trip_time': round_trip_time,
            'vehicles_needed': round_trip_time / avg_headway if avg_headway > 0 else np.nan,
            'calculation_success': diagnostics['success'],
            'trips_analyzed': diagnostics['valid_durations_count'],
            'median_trip_duration': diagnostics['median_trip_duration'],
            'error_message': diagnostics['error_message']
        })
    
    # Summary table
    print(f"\n{'='*60}")
    print("SUMMARY OF ALL ROUTES TESTED:")
    results_df = pd.DataFrame(results)
    display_columns = ['service_id', 'avg_headway', 'round_trip_time', 'vehicles_needed', 
                      'calculation_success', 'trips_analyzed', 'median_trip_duration']
    print(results_df[display_columns].round(1).to_string(index=False))
    
    # Show errors if any
    failed_routes = results_df[~results_df['calculation_success']]
    if len(failed_routes) > 0:
        print(f"\nFailed routes:")
        for _, row in failed_routes.iterrows():
            print(f"  {row['service_id']}: {row['error_message']}")
    
    # Success rate
    success_rate = results_df['calculation_success'].mean() * 100
    print(f"\nCalculation success rate: {success_rate:.1f}%")
    
    return results_df


# TEST THE FIXED CALCULATION
print("Testing the corrected round-trip calculation method...")
results_df = test_fixed_round_trip_calculation(feed, gtfs_day="20230814", max_routes=3)

Testing the corrected round-trip calculation method...
=== TESTING FIXED ROUND-TRIP TIME CALCULATION ===

TESTING SERVICE: 11855
Average headway: nan minutes

=== CALCULATING ROUND-TRIP TIME FOR SERVICE 11855 ===
Retrieved trips_with_stops data: 294071 records
Columns available: ['from_stop', 'arrival_time', 'service_name', 'to_stop_name', 'to_stop', 'route_id', 'service_id', 'departure_time', 'mode', 'from_stop_name', 'route_name', 'trip_id', 'vehicle_id']
Found 6512 stop records for service 11855
Covering 82 unique trips
  Trip VJa79a2689eef92ae5e0f24ae32b324b43c8f58ced: 39 stops, 29.0 minutes (05:33 → 06:02)
  Trip VJ039542b7a26132c6c50b77f760fa8aa838b1fafd: 82 stops, 80.0 minutes (16:45 → 18:05)
  Trip VJ06f4e90aa3ce001a40260ef60fa9ad728b9a2020: 82 stops, 80.0 minutes (09:53 → 11:13)
Successfully calculated durations for 82 trips

📊 TRIP DURATION ANALYSIS:
  Valid trips analyzed: 82
  Duration range: 27.0 - 100.0 minutes
  Median trip duration: 80.0 minutes
  Round-trip calculation

In [None]:
import pandas as pd
import numpy as np
import gtfs_kit as gk
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional
from datetime import datetime, timedelta
from abc import ABC, abstractmethod
import time

# GOOD
# =============================================================================
# CORE DATA STRUCTURES
# =============================================================================

@dataclass
class RouteConfig:
    """
    Immutable configuration for a single transit route/service.
    """
    service_id: str
    route_name: str
    agency_id: str
    headways_by_interval: np.ndarray
    min_headway: float
    max_headway: float
    operating_hours: Tuple[int, int]
    route_color: str
    interval_hours: int
    round_trip_time: float

# =============================================================================
# GTFS DATA EXTRACTION LAYER - FIXED TIME CONVERSION
# =============================================================================

class GTFSDataExtractor:
    """
    Handles extraction and processing of transit data from GTFS feeds via gtfs_kit.
    
    FIXED VERSION with proper time string conversion.
    """
    
    def __init__(self, 
                 gtfs_path: str,
                 date: str = "20230814",
                 interval_hours: int = 3,
                 fallback_headway: float = 30.0,
                 default_round_trip_time: float = 60.0):
        """
        Initialize the GTFS data extractor.
        """
        if 24 % interval_hours != 0:
            raise ValueError(f"interval_hours ({interval_hours}) must divide 24 evenly")
            
        self.gtfs_path = gtfs_path
        self.date = date
        self.interval_hours = interval_hours
        self.n_intervals = 24 // interval_hours
        self.fallback_headway = fallback_headway
        self.default_round_trip_time = default_round_trip_time
        
        # Load GTFS feed with timing
        print(f"⏱️  Loading GTFS feed from {gtfs_path}...")
        start_time = time.time()
        self.feed = gk.read_feed(gtfs_path, dist_units='km')
        load_time = time.time() - start_time
        print(f"✅ GTFS feed loaded in {load_time:.2f} seconds")
        
        # Pre-load and cache frequently used data
        print("⏱️  Pre-loading GTFS tables...")
        start_time = time.time()
        self._preload_data()
        preload_time = time.time() - start_time
        print(f"✅ Data pre-loaded in {preload_time:.2f} seconds")
    
    def _preload_data(self):
        """Pre-load and cache GTFS data with FIXED time conversion."""
        # Cache the main tables we'll use repeatedly
        self.trips_df = self.feed.trips.copy()
        self.stop_times_df = self.feed.stop_times.copy()
        self.routes_df = self.feed.routes.copy()
        
        print(f"   📊 Loaded {len(self.trips_df):,} trips")
        print(f"   📊 Loaded {len(self.stop_times_df):,} stop times")
        print(f"   📊 Loaded {len(self.routes_df):,} routes")
        
        # FIXED: Proper time conversion
        print("   🔄 Converting departure times...")
        start_time = time.time()
        
        # Convert time strings to seconds using gtfs_kit's helper function
        print(f"   🔍 Departure time column type: {self.stop_times_df['departure_time'].dtype}")
        print(f"   🔍 Sample departure times: {self.stop_times_df['departure_time'].head(3).tolist()}")
        
        # Always convert using gtfs_kit's timestr_to_seconds function
        def safe_timestr_to_seconds(time_str):
            try:
                if pd.isna(time_str):
                    return np.nan
                if isinstance(time_str, str):
                    return gk.helpers.timestr_to_seconds(time_str)
                else:
                    # If it's already a number, assume it's seconds
                    return float(time_str)
            except Exception:
                return np.nan
        
        self.stop_times_df['departure_seconds'] = self.stop_times_df['departure_time'].apply(safe_timestr_to_seconds)
        self.stop_times_df['arrival_seconds'] = self.stop_times_df['arrival_time'].apply(safe_timestr_to_seconds)
        
        # Check conversion success
        valid_departures = self.stop_times_df['departure_seconds'].notna().sum()
        total_departures = len(self.stop_times_df)
        print(f"   ✅ Converted {valid_departures:,}/{total_departures:,} departure times ({100*valid_departures/total_departures:.1f}%)")
        
        convert_time = time.time() - start_time
        print(f"   ✅ Time conversion completed in {convert_time:.2f} seconds")
    
    def extract_all_routes(self) -> List[Dict]:
        """
        Extract headway and metadata for all routes in the GTFS feed.
        """
        try:
            print(f"⏱️  Extracting GTFS data with {self.interval_hours}-hour intervals...")
            total_start = time.time()
            
            # Get all unique services
            all_services = self.trips_df['service_id'].unique()
            print(f"📊 Found {len(all_services)} unique services to process")
            
            # Process each service with progress tracking
            route_data_list = []
            successful_extractions = 0
            failed_extractions = 0
            
            for i, service_id in enumerate(all_services):
                if i % 50 == 0:  # Progress update every 50 services (less frequent)
                    elapsed = time.time() - total_start
                    progress_pct = (i / len(all_services)) * 100
                    if i > 0:
                        print(f"   📈 Progress: {i}/{len(all_services)} ({progress_pct:.1f}%) - "
                              f"Elapsed: {elapsed:.1f}s - "
                              f"Success rate: {successful_extractions}/{i} ({100*successful_extractions/i:.1f}%)")
                
                route_data = self._process_single_service(service_id)
                
                if route_data:  # Only add if processing succeeded
                    route_data_list.append(route_data)
                    successful_extractions += 1
                else:
                    failed_extractions += 1
            
            total_time = time.time() - total_start
            print(f"✅ Successfully extracted {len(route_data_list)} routes in {total_time:.2f} seconds")
            print(f"   📊 Success rate: {successful_extractions}/{len(all_services)} ({100*successful_extractions/len(all_services):.1f}%)")
            print(f"   ⚡ Average time per service: {total_time/len(all_services):.3f}s")
            
            if failed_extractions > 0:
                print(f"   ⚠️  {failed_extractions} services failed processing")
            
            return route_data_list
            
        except Exception as e:
            print(f"❌ GTFS extraction failed: {e}")
            print("Using fallback uniform headways...")
            return self._create_fallback_routes()
    
    def _process_single_service(self, service_id: str) -> Optional[Dict]:
        """
        Process a single service to extract detailed time-varying headway data.
        """
        try:
            # Get trips for this service
            service_trips = self.trips_df[self.trips_df['service_id'] == service_id]
            
            if len(service_trips) == 0:
                return None
            
            # Calculate average headway and time-varying headways
            avg_headway = self._calculate_average_headway(service_id, service_trips)
            headways_by_interval = self._calculate_time_varying_headways(service_id, service_trips, avg_headway)
            
            # Extract metadata from first trip's route
            first_trip = service_trips.iloc[0]
            route_id = first_trip['route_id']
            route_name, agency_id, route_color = self._extract_route_metadata(route_id)
            
            # Calculate round-trip time
            round_trip_time = self._calculate_round_trip_time(service_id, service_trips)
            
            return {
                'service_id': service_id,
                'route_id': route_id,
                'avg_headway': avg_headway,
                'headways_by_interval': headways_by_interval,
                'route_name': route_name,
                'agency_id': agency_id,
                'route_color': route_color,
                'round_trip_time': round_trip_time
            }
            
        except Exception as e:
            # Only print errors for first few services to avoid spam
            if len([s for s in self.trips_df['service_id'].unique()][:10]) and service_id in self.trips_df['service_id'].unique()[:10]:
                print(f"⚠️  Failed to process service {service_id}: {e}")
            return None
    
    def _calculate_average_headway(self, service_id: str, service_trips: pd.DataFrame) -> float:
        """
        Calculate average headway for a service across the entire day.
        FIXED VERSION - handles numeric departure times properly.
        """
        try:
            if len(service_trips) <= 1:
                return self.fallback_headway
            
            # Get departure times from pre-loaded stop_times using trip_ids
            trip_ids = service_trips['trip_id'].tolist()
            
            # Use vectorized operations instead of loops
            service_stop_times = self.stop_times_df[
                self.stop_times_df['trip_id'].isin(trip_ids)
            ].copy()
            
            if len(service_stop_times) == 0:
                return self.fallback_headway
            
            # Get first departure for each trip (minimum stop_sequence)
            first_departures = service_stop_times.loc[
                service_stop_times.groupby('trip_id')['stop_sequence'].idxmin()
            ]
            
            # Extract departure times in seconds (now properly converted)
            departure_times = first_departures['departure_seconds'].dropna().values
            
            if len(departure_times) <= 1:
                return self.fallback_headway
            
            # Sort and calculate intervals
            departure_times = np.sort(departure_times)
            intervals = np.diff(departure_times) / 60  # Convert to minutes
            
            # Filter reasonable intervals (avoid overnight gaps)
            reasonable_intervals = intervals[(intervals >= 1) & (intervals <= 180)]
            
            if len(reasonable_intervals) > 0:
                return np.mean(reasonable_intervals)
            else:
                return self.fallback_headway
                
        except Exception as e:
            return self.fallback_headway
    
    def _calculate_time_varying_headways(self, service_id: str, service_trips: pd.DataFrame, avg_headway: float) -> np.ndarray:
        """
        Calculate headway values for each time interval throughout the day.
        FIXED VERSION - uses properly converted departure times.
        """
        headways = np.full(self.n_intervals, np.nan)
        
        try:
            if len(service_trips) == 0:
                return headways
            elif len(service_trips) == 1:
                headways.fill(self.interval_hours * 60)
                return headways
            
            # Get stop times for all trips in this service
            trip_ids = service_trips['trip_id'].tolist()
            service_stop_times = self.stop_times_df[
                self.stop_times_df['trip_id'].isin(trip_ids)
            ].copy()
            
            if len(service_stop_times) == 0:
                return headways
            
            # Get first departure for each trip
            first_departures = service_stop_times.loc[
                service_stop_times.groupby('trip_id')['stop_sequence'].idxmin()
            ][['trip_id', 'departure_seconds']].copy()
            
            # Convert departure seconds to hours (now working with numbers)
            first_departures['departure_hour'] = (first_departures['departure_seconds'] // 3600) % 24
            first_departures = first_departures.dropna()
            
            if len(first_departures) == 0:
                return headways
            
            # Calculate headways for each time interval using vectorized operations
            for interval in range(self.n_intervals):
                start_hour = interval * self.interval_hours
                end_hour = (interval + 1) * self.interval_hours
                
                # Find departures in this interval
                interval_departures = first_departures[
                    (first_departures['departure_hour'] >= start_hour) &
                    (first_departures['departure_hour'] < end_hour)
                ]['departure_seconds'].values
                
                headways[interval] = self._calculate_interval_headway_vectorized(interval_departures, avg_headway)
            
            return headways
            
        except Exception as e:
            headways.fill(avg_headway)
            return headways
    
    def _calculate_interval_headway_vectorized(self, departure_times: np.ndarray, avg_headway: float) -> float:
        """
        Calculate headway for a specific time interval using vectorized operations.
        """
        if len(departure_times) >= 2:
            # Sort departure times
            departure_times = np.sort(departure_times)
            
            # Calculate intervals between consecutive departures
            intervals = np.diff(departure_times) / 60  # Convert to minutes
            
            # Filter valid intervals
            valid_intervals = intervals[intervals > 0]
            
            if len(valid_intervals) > 0:
                return np.mean(valid_intervals)
            else:
                return avg_headway
                
        elif len(departure_times) == 1:
            # Single trip - assume infrequent service
            return self.interval_hours * 60
        else:
            # No trips - no service
            return np.nan
    
    def _extract_route_metadata(self, route_id: str) -> Tuple[str, str, str]:
        """
        Extract route metadata from the pre-loaded routes data.
        """
        try:
            route_info = self.routes_df[self.routes_df['route_id'] == route_id]
            
            if len(route_info) > 0:
                route = route_info.iloc[0]
                route_name = route.get('route_short_name', str(route_id))
                agency_id = route.get('agency_id', 'Unknown')
                route_color = route.get('route_color', '000000')
                
                # Clean up route_color (remove # if present)
                if isinstance(route_color, str) and route_color.startswith('#'):
                    route_color = route_color[1:]
                    
                return str(route_name), str(agency_id), str(route_color)
            else:
                return str(route_id), 'Unknown', '000000'
                
        except Exception as e:
            return str(route_id), 'Unknown', '000000'
    
    def _calculate_round_trip_time(self, service_id: str, service_trips: pd.DataFrame) -> float:
        """
        Estimate round-trip time using properly converted departure/arrival times.
        FIXED VERSION - works with numeric time values.
        """
        try:
            if len(service_trips) == 0:
                return self.default_round_trip_time
            
            # Get trip IDs for this service
            trip_ids = service_trips['trip_id'].tolist()
            
            # Get stop times for all trips in this service
            service_stop_times = self.stop_times_df[
                self.stop_times_df['trip_id'].isin(trip_ids)
            ].copy()
            
            if len(service_stop_times) == 0:
                return self.default_round_trip_time
            
            # Calculate trip durations using vectorized operations
            trip_durations = []
            
            # Group by trip_id and calculate duration for each trip
            for trip_id, trip_stops in service_stop_times.groupby('trip_id'):
                if len(trip_stops) >= 2:
                    trip_stops = trip_stops.sort_values('stop_sequence')
                    
                    # Get first departure and last arrival (now numeric)
                    first_departure = trip_stops.iloc[0]['departure_seconds']
                    last_arrival = trip_stops.iloc[-1]['arrival_seconds']
                    
                    if pd.notna(first_departure) and pd.notna(last_arrival):
                        duration_minutes = (last_arrival - first_departure) / 60
                        
                        if duration_minutes > 0:
                            trip_durations.append(duration_minutes)
            
            if len(trip_durations) > 0:
                # Use median one-way time (robust to outliers)
                median_one_way = np.median(trip_durations)
                # Round-trip = 2 × one-way + 15% buffer for turnaround
                round_trip_time = median_one_way * 2 * 1.15
                return round_trip_time
            else:
                return self.default_round_trip_time
                
        except Exception as e:
            return self.default_round_trip_time
    
    def _create_fallback_routes(self) -> List[Dict]:
        """
        Create simple uniform headway routes when extraction fails.
        """
        try:
            # Get unique services from pre-loaded trips
            unique_services = self.trips_df['service_id'].unique()
            fallback_routes = []
            
            for service_id in unique_services:
                headways = np.full(self.n_intervals, self.fallback_headway)
                
                # Try to get basic route info
                service_trips = self.trips_df[self.trips_df['service_id'] == service_id]
                if len(service_trips) > 0:
                    route_id = service_trips.iloc[0]['route_id']
                    route_name, agency_id, route_color = self._extract_route_metadata(route_id)
                else:
                    route_id = service_id
                    route_name = str(service_id)
                    agency_id = 'Unknown'
                    route_color = '000000'
                
                route_data = {
                    'service_id': service_id,
                    'route_id': route_id,
                    'avg_headway': self.fallback_headway,
                    'headways_by_interval': headways,
                    'route_name': route_name,
                    'agency_id': agency_id,
                    'route_color': route_color,
                    'round_trip_time': self.default_round_trip_time
                }
                
                fallback_routes.append(route_data)
            
            return fallback_routes
            
        except Exception as e:
            print(f"❌ Even fallback route creation failed: {e}")
            return []

# =============================================================================
# OPTIMIZATION CONSTRAINT MANAGER (unchanged)
# =============================================================================

class OptimizationConstraints:
    """Manages optimization bounds and constraints for headway variables."""
    
    def __init__(self,
                 user_min_headway: float = 5.0,
                 user_max_headway: float = 120.0,
                 min_headway_multiplier: float = 0.5,
                 max_headway_multiplier: float = 2.0,
                 no_service_headway_value: float = 9999.0):
        self.user_min_headway = user_min_headway
        self.user_max_headway = user_max_headway
        self.min_headway_multiplier = min_headway_multiplier
        self.max_headway_multiplier = max_headway_multiplier
        self.no_service_headway_value = no_service_headway_value
    
    def calculate_route_bounds(self, headways_by_interval: np.ndarray) -> Tuple[float, float]:
        valid_headways = headways_by_interval[~np.isnan(headways_by_interval)]
        
        if len(valid_headways) > 0:
            data_min = np.min(valid_headways) * self.min_headway_multiplier
            data_max = np.max(valid_headways) * self.max_headway_multiplier
            min_headway = max(self.user_min_headway, data_min)
            max_headway = max(min_headway, min(self.user_max_headway, data_max))
        else:
            min_headway = self.user_min_headway
            max_headway = self.user_max_headway
            
        return min_headway, max_headway
    
    def create_optimization_bounds(self, routes: List[RouteConfig]) -> Tuple[np.ndarray, np.ndarray]:
        min_bounds = []
        max_bounds = []
        
        for route in routes:
            for interval in range(len(route.headways_by_interval)):
                min_bounds.append(route.min_headway)
                max_bounds.append(route.max_headway)
        
        return np.array(min_bounds), np.array(max_bounds)
    
    def convert_to_optimization_vector(self, routes: List[RouteConfig]) -> np.ndarray:
        all_headways = []
        for route in routes:
            for headway in route.headways_by_interval:
                if np.isnan(headway):
                    all_headways.append(self.no_service_headway_value)
                else:
                    all_headways.append(headway)
        return np.array(all_headways)
    
    def convert_from_optimization_vector(self, headway_vector: np.ndarray, 
                                       routes: List[RouteConfig]) -> None:
        expected_length = sum(len(route.headways_by_interval) for route in routes)
        if len(headway_vector) != expected_length:
            raise ValueError(f"Vector length mismatch: {len(headway_vector)} vs {expected_length}")
        
        idx = 0
        for route in routes:
            for interval in range(len(route.headways_by_interval)):
                new_headway = headway_vector[idx]
                
                if new_headway >= self.no_service_headway_value * 0.9:
                    route.headways_by_interval[interval] = np.nan
                else:
                    route.headways_by_interval[interval] = new_headway
                    
                idx += 1

# =============================================================================
# VEHICLE CONSTRAINT CALCULATOR (unchanged)
# =============================================================================

class VehicleConstraintCalculator:
    """Calculates vehicle requirements and enforces fleet constraints."""
    
    def __init__(self, routes: List[RouteConfig], no_service_threshold: float = 9999.0):
        self.routes = routes
        self.no_service_threshold = no_service_threshold
        self.n_intervals = len(routes[0].headways_by_interval) if routes else 0
        
        # Calculate baseline vehicle requirements
        self.baseline_vehicles_by_interval = self._calculate_baseline_vehicles()
        self.baseline_total_vehicles = np.max(self.baseline_vehicles_by_interval)
    
    def _calculate_baseline_vehicles(self) -> np.ndarray:
        vehicles_by_interval = np.zeros(self.n_intervals)
        
        for route in self.routes:
            for interval in range(self.n_intervals):
                headway = route.headways_by_interval[interval]
                if not np.isnan(headway) and headway > 0:
                    vehicles_needed = route.round_trip_time / headway
                    vehicles_by_interval[interval] += vehicles_needed
        
        return vehicles_by_interval
    
    def calculate_vehicles_needed(self, headway_vector: np.ndarray) -> np.ndarray:
        vehicles_by_interval = np.zeros(self.n_intervals)
        idx = 0
        
        for route in self.routes:
            for interval in range(self.n_intervals):
                headway = headway_vector[idx]
                
                if headway < self.no_service_threshold * 0.9:
                    vehicles_needed = route.round_trip_time / headway
                    vehicles_by_interval[interval] += vehicles_needed
                
                idx += 1
        
        return vehicles_by_interval
    
    def constraint_total_fleet_size(self, headway_vector: np.ndarray, 
                                  max_fleet_size: float) -> float:
        vehicles_by_interval = self.calculate_vehicles_needed(headway_vector)
        peak_vehicles = np.max(vehicles_by_interval)
        return max_fleet_size - peak_vehicles
    
    def constraint_percentage_increase(self, headway_vector: np.ndarray, 
                                     max_increase_percent: float) -> float:
        vehicles_by_interval = self.calculate_vehicles_needed(headway_vector)
        peak_vehicles = np.max(vehicles_by_interval)
        
        max_allowed = self.baseline_total_vehicles * (1 + max_increase_percent / 100)
        return max_allowed - peak_vehicles
    
    def constraint_agency_specific(self, headway_vector: np.ndarray, 
                                 agency_limits: Dict[str, float]) -> List[float]:
        agency_vehicles = {}
        idx = 0
        
        for route in self.routes:
            agency_id = route.agency_id
            if agency_id not in agency_vehicles:
                agency_vehicles[agency_id] = np.zeros(self.n_intervals)
            
            for interval in range(self.n_intervals):
                headway = headway_vector[idx]
                
                if headway < self.no_service_threshold * 0.9:
                    vehicles_needed = route.round_trip_time / headway
                    agency_vehicles[agency_id][interval] += vehicles_needed
                
                idx += 1
        
        constraints = []
        for agency_id, limit in agency_limits.items():
            if agency_id in agency_vehicles:
                peak_vehicles = np.max(agency_vehicles[agency_id])
                constraints.append(limit - peak_vehicles)
            else:
                constraints.append(limit)
        
        return constraints

# =============================================================================
# MAIN OPTIMIZATION DATA STRUCTURE (simplified initialization)
# =============================================================================

class HeadwayOptimizationData:
    """Main class that coordinates all components for headway optimization."""
    
    def __init__(self, 
                 gtfs_path: str,
                 date: str = "20230814", 
                 interval_hours: int = 3,
                 user_min_headway: float = 5.0,
                 user_max_headway: float = 120.0,
                 min_headway_multiplier: float = 0.5,
                 max_headway_multiplier: float = 2.0,
                 default_operating_hours: Tuple[int, int] = (6, 22),
                 fallback_headway: float = 30.0,
                 no_service_headway_value: float = 9999.0,
                 default_round_trip_time: float = 60.0):
        """Initialize the complete optimization data structure."""
        # Store configuration
        self.gtfs_path = gtfs_path
        self.date = date
        self.interval_hours = interval_hours
        self.n_intervals = 24 // interval_hours
        self.default_operating_hours = default_operating_hours
        
        # Initialize sub-components with timing
        print("=== INITIALIZING HEADWAY OPTIMIZATION DATA STRUCTURE ===")
        total_start = time.time()
        
        # 1. GTFS Data Extraction
        print("1. Creating GTFS data extractor...")
        extractor_start = time.time()
        self.extractor = GTFSDataExtractor(
            gtfs_path=gtfs_path,
            date=date,
            interval_hours=interval_hours,
            fallback_headway=fallback_headway,
            default_round_trip_time=default_round_trip_time
        )
        extractor_time = time.time() - extractor_start
        print(f"   ✅ Extractor created in {extractor_time:.2f} seconds")
        
        # 2. Extract route data
        print("2. Extracting route data...")
        extraction_start = time.time()
        route_data_list = self.extractor.extract_all_routes()
        extraction_time = time.time() - extraction_start
        print(f"   ✅ Route extraction completed in {extraction_time:.2f} seconds")
        
        # 3. Constraint Management
        print("3. Setting up optimization constraints...")
        constraints_start = time.time()
        self.constraints = OptimizationConstraints(
            user_min_headway=user_min_headway,
            user_max_headway=user_max_headway,
            min_headway_multiplier=min_headway_multiplier,
            max_headway_multiplier=max_headway_multiplier,
            no_service_headway_value=no_service_headway_value
        )
        constraints_time = time.time() - constraints_start
        print(f"   ✅ Constraints setup in {constraints_time:.3f} seconds")
        
        # 4. Create RouteConfig objects
        print("4. Creating route configurations...")
        config_start = time.time()
        self.routes = []
        for route_data in route_data_list:
            min_headway, max_headway = self.constraints.calculate_route_bounds(
                route_data['headways_by_interval']
            )
            
            route_config = RouteConfig(
                service_id=route_data['service_id'],
                route_name=route_data['route_name'],
                agency_id=route_data['agency_id'],
                headways_by_interval=route_data['headways_by_interval'],
                min_headway=min_headway,
                max_headway=max_headway,
                operating_hours=default_operating_hours,
                route_color=route_data['route_color'],
                interval_hours=interval_hours,
                round_trip_time=route_data['round_trip_time']
            )
            
            self.routes.append(route_config)
        config_time = time.time() - config_start
        print(f"   ✅ Route configs created in {config_time:.3f} seconds")
        
        # 5. Vehicle Constraint Calculator
        print("5. Initializing vehicle constraint calculator...")
        vehicle_start = time.time()
        self.vehicle_calculator = VehicleConstraintCalculator(
            routes=self.routes,
            no_service_threshold=no_service_headway_value
        )
        vehicle_time = time.time() - vehicle_start
        print(f"   ✅ Vehicle calculator initialized in {vehicle_time:.3f} seconds")
        
        total_time = time.time() - total_start
        print(f"✅ Successfully initialized optimization data for {len(self.routes)} routes")
        print(f"   📊 Total initialization time: {total_time:.2f} seconds")
        print(f"   🚌 Baseline fleet size: {self.vehicle_calculator.baseline_total_vehicles:.1f} vehicles")
        print()
        
        # Timing breakdown
        print("⏱️  TIMING BREAKDOWN:")
        print(f"   Extractor creation: {extractor_time:.2f}s ({100*extractor_time/total_time:.1f}%)")
        print(f"   Route extraction: {extraction_time:.2f}s ({100*extraction_time/total_time:.1f}%)")
        print(f"   Constraints setup: {constraints_time:.3f}s ({100*constraints_time/total_time:.1f}%)")
        print(f"   Route configs: {config_time:.3f}s ({100*config_time/total_time:.1f}%)")
        print(f"   Vehicle calculator: {vehicle_time:.3f}s ({100*vehicle_time/total_time:.1f}%)")
        print()
    
    # Core optimization interface methods (unchanged)
    def get_optimization_vector(self) -> np.ndarray:
        return self.constraints.convert_to_optimization_vector(self.routes)
    
    def get_bounds(self) -> Tuple[np.ndarray, np.ndarray]:
        return self.constraints.create_optimization_bounds(self.routes)
    
    def set_headways(self, headway_vector: np.ndarray):
        self.constraints.convert_from_optimization_vector(headway_vector, self.routes)
    
    def calculate_vehicles_needed(self, headway_vector: np.ndarray) -> np.ndarray:
        return self.vehicle_calculator.calculate_vehicles_needed(headway_vector)
    
    def vehicle_constraint_total(self, headway_vector: np.ndarray, max_fleet_size: float) -> float:
        return self.vehicle_calculator.constraint_total_fleet_size(headway_vector, max_fleet_size)
    
    def vehicle_constraint_percent_increase(self, headway_vector: np.ndarray, 
                                          max_increase_percent: float) -> float:
        return self.vehicle_calculator.constraint_percentage_increase(headway_vector, max_increase_percent)
    
    def vehicle_constraint_agency_specific(self, headway_vector: np.ndarray, 
                                         agency_limits: Dict[str, float]) -> List[float]:
        return self.vehicle_calculator.constraint_agency_specific(headway_vector, agency_limits)
    
    def get_route_summary(self, include_interval_details: bool = True) -> pd.DataFrame:
        data = []
        for route in self.routes:
            active_headways = route.headways_by_interval[~np.isnan(route.headways_by_interval)]
            avg_headway = np.mean(active_headways) if len(active_headways) > 0 else np.nan
            
            row_data = {
                'service_id': route.service_id,
                'route_name': route.route_name,
                'agency': route.agency_id,
                'avg_headway': avg_headway,
                'min_headway': route.min_headway,
                'max_headway': route.max_headway,
                'intervals_with_service': np.sum(~np.isnan(route.headways_by_interval)),
                'route_color': route.route_color,
                'round_trip_time': route.round_trip_time
            }
            
            if include_interval_details:
                for i in range(self.n_intervals):
                    start_hour = i * self.interval_hours
                    end_hour = (i + 1) * self.interval_hours
                    headway_val = route.headways_by_interval[i]
                    row_data[f'headway_{start_hour:02d}-{end_hour:02d}h'] = headway_val
            
            data.append(row_data)
        
        return pd.DataFrame(data)
    
    def get_interval_labels(self) -> List[str]:
        labels = []
        for i in range(self.n_intervals):
            start_hour = i * self.interval_hours
            end_hour = (i + 1) * self.interval_hours
            labels.append(f"{start_hour:02d}-{end_hour:02d}h")
        return labels
    
    def get_optimization_info(self) -> Dict:
        current_vector = self.get_optimization_vector()
        valid_headways = current_vector[current_vector < self.constraints.no_service_headway_value * 0.9]
        no_service_count = len(current_vector) - len(valid_headways)
        
        return {
            'n_routes': len(self.routes),
            'n_intervals': self.n_intervals,
            'interval_hours': self.interval_hours,
            'total_variables': len(current_vector),
            'service_periods': len(valid_headways),
            'no_service_periods': no_service_count,
            'service_coverage_pct': 100 * len(valid_headways) / len(current_vector) if len(current_vector) > 0 else 0,
            'mean_headway': valid_headways.mean() if len(valid_headways) > 0 else np.nan,
            'min_headway': valid_headways.min() if len(valid_headways) > 0 else np.nan,
            'max_headway': valid_headways.max() if len(valid_headways) > 0 else np.nan,
            'std_headway': valid_headways.std() if len(valid_headways) > 0 else np.nan,
            'interval_labels': self.get_interval_labels(),
            'user_min_headway': self.constraints.user_min_headway,
            'user_max_headway': self.constraints.user_max_headway,
            'baseline_total_vehicles': self.vehicle_calculator.baseline_total_vehicles,
            'baseline_vehicles_by_interval': self.vehicle_calculator.baseline_vehicles_by_interval
        }

# =============================================================================
# EXAMPLE USAGE WITH TIMING
# =============================================================================

if __name__ == "__main__":
    print("=== CREATING HEADWAY OPTIMIZATION DATA STRUCTURE (FIXED VERSION) ===")
    
    gtfs_path = '../data/external/study_area_gtfs_bus.zip'
    
    total_start = time.time()
    opt_data = HeadwayOptimizationData(
        gtfs_path=gtfs_path,
        date="20230814",
        interval_hours=3,
        user_min_headway=5.0,
        user_max_headway=120.0,
        min_headway_multiplier=0.5,
        max_headway_multiplier=2.0,
        default_operating_hours=(6, 22),
        fallback_headway=30.0,
        no_service_headway_value=9999.0,
        default_round_trip_time=60.0
    )
    total_time = time.time() - total_start
    
    print(f"🎯 TOTAL PIPELINE TIME: {total_time:.2f} seconds")
    print(f"⚡ Average time per route: {total_time/len(opt_data.routes):.3f} seconds")
    print()
    
    # Show summary
    summary_df = opt_data.get_route_summary(include_interval_details=False)
    print("Core route summary:")
    print(summary_df[['service_id', 'route_name', 'agency', 'avg_headway', 'round_trip_time']].round(1).head(10))
    print()
    
    opt_info = opt_data.get_optimization_info()
    print("Optimization setup:")
    print(f"  Total decision variables: {opt_info['total_variables']} ({opt_info['n_routes']} routes × {opt_info['n_intervals']} intervals)")
    print(f"  Time intervals: {opt_info['interval_labels']}")
    print(f"  Service periods: {opt_info['service_periods']}/{opt_info['total_variables']} ({opt_info['service_coverage_pct']:.1f}%)")
    print(f"  User constraints: {opt_info['user_min_headway']:.1f} - {opt_info['user_max_headway']:.1f} minutes")
    print(f"  Baseline fleet size: {opt_info['baseline_total_vehicles']:.1f} vehicles")

=== CREATING HEADWAY OPTIMIZATION DATA STRUCTURE (FIXED VERSION) ===
=== INITIALIZING HEADWAY OPTIMIZATION DATA STRUCTURE ===
1. Creating GTFS data extractor...
⏱️  Loading GTFS feed from ../data/external/study_area_gtfs_bus.zip...
✅ GTFS feed loaded in 3.20 seconds
⏱️  Pre-loading GTFS tables...
   📊 Loaded 13,974 trips
   📊 Loaded 703,721 stop times
   📊 Loaded 187 routes
   🔄 Converting departure times...
   🔍 Departure time column type: string
   🔍 Sample departure times: ['05:30:00', '05:34:00', '05:35:00']
   ✅ Converted 703,721/703,721 departure times (100.0%)
   ✅ Time conversion completed in 1.85 seconds
✅ Data pre-loaded in 1.95 seconds
   ✅ Extractor created in 5.15 seconds
2. Extracting route data...
⏱️  Extracting GTFS data with 3-hour intervals...
📊 Found 278 unique services to process
   📈 Progress: 50/278 (18.0%) - Elapsed: 9.5s - Success rate: 50/50 (100.0%)
   📈 Progress: 100/278 (36.0%) - Elapsed: 12.1s - Success rate: 100/100 (100.0%)
   📈 Progress: 150/278 (54.0%) 

In [59]:
summary_df

AttributeError: 'Index' object has no attribute '_format_flat'

    service_id route_name  agency  avg_headway  min_headway  max_headway  \
0         1221         A1  OP6801    52.320346     5.000000        120.0   
1         1302         A1  OP6801    59.623204     5.000000        120.0   
2         1303         A1  OP6801    58.956044     5.769231        120.0   
3         1304         A1  OP6801    47.879630     8.500000        120.0   
4         1305         A1  OP6801    65.729167    11.875000        120.0   
..         ...        ...     ...          ...          ...          ...   
273       7986         29  OP8945   140.000000    30.000000        120.0   
274       7987         29  OP8945    90.000000    30.000000        120.0   
275       8242         81   OP932    28.000000    11.000000         60.0   
276       8243         81   OP932   180.000000    90.000000        120.0   
277       8398          9   OP932   180.000000    90.000000        120.0   

     intervals_with_service route_color  round_trip_time  
0                         7 