In [5]:
import pandas as pd
import sys
import numpy as np
import heapq
import time
from collections import defaultdict, deque
import itertools
import os

In [6]:
same_airports = {
    'DOH': "BAH",
    'SZX': "HKG",
    'CAN': "HKG",
    'STN': 'LHR',
    'LTN': 'LHR'
}

RE-DO Preporcessing solve the Remainder

In [9]:
# =============================================
# Data Reading Functions
# =============================================
def read_capacity(CAPACITY_FILE, airport_substitutions=None):
    capacity = pd.read_csv(CAPACITY_FILE, sep=',')

    capacity['deptime'] = pd.to_datetime(capacity['deptime'])
    capacity['arrtime'] = pd.to_datetime(capacity['arrtime'])

    # Map Weekday_Z to day number (0=Mon, 6=Sun)
    day_map = {'Mon': 0, 'Tue': 1, 'Wed': 2, 'Thu': 3, 'Fri': 4, 'Sat': 5, 'Sun': 6}
    capacity['day'] = capacity['Weekday_Z'].map(day_map)

    # Calculate total minutes from start of week (Monday 00:00)
    # Dep Time = Day * 1440 + Hour * 60 + Minute
    capacity['dep_time'] = capacity['day'] * 1440 + capacity['deptime'].dt.hour * 60 + capacity['deptime'].dt.minute
    capacity['DD_Z'] = capacity['DD_Z'].fillna(0)
    capacity['arr_time'] = capacity['day'] * 1440 + capacity['arrtime'].dt.hour * 60 + capacity['arrtime'].dt.minute + capacity['DD_Z']*1440

    # Check for negative durations which indicate data errors
    if (capacity['arr_time'] < capacity['dep_time']).any():
        print("Warning: Some flights have arrival time earlier than departure time after processing. Check data.")
        # Consider dropping or fixing these rows

    # --- Column Renaming and Selection ---
    rename_columns = {
        'Net Payload': 'cap_kg',
        'Net Volume': 'cap_m3',
        'Orig': 'ori',
        'Dest': 'des',
        'Flight Number': 'flight_number',
        'A/C': 'aircraft_type'
    }
    capacity = capacity.rename(columns=rename_columns)

    # Define desired columns
    columns = ['flight_number', 'ori', 'des', 'aircraft_type', 'dep_time', 'arr_time', 'day', 'cap_kg', 'cap_m3']
    capacity = capacity[[col for col in columns if col in capacity.columns]]

    # CONVERT SAME AIRPORTS FROM DICTIONARY - same_airports - CONVERT EVERY KEY TO ITS VALUE
    for key, value in airport_substitutions.items():
        capacity.loc[capacity['ori'] == key, 'ori'] = value
        capacity.loc[capacity['des'] == key, 'des'] = value
    capacity['key'] = capacity['ori'] + '/'+ capacity['des'] +'/'+ capacity['dep_time'].astype(str)

    capacity['dep_time'] = capacity['dep_time'].astype(int)
    capacity['arr_time'] = capacity['arr_time'].astype(int)
    capacity['day'] = capacity['day'].astype(int)
    print(f"Capacity data read: {len(capacity)} rows.")
    return capacity


def read_market(MARKET_FILE, airport_substitutions=None):
    market = pd.read_csv(MARKET_FILE, sep=';')

    market = market.rename(columns={'origin': 'ori', 'destination': 'des', 'Market CHW': 'demand', 'Day': 'day'})
    if 'product' in market.columns: market.drop(columns=['product'], inplace=True)
    if 'Market Allin Yield' in market.columns: market.drop(columns=['Market Allin Yield'], inplace=True)

    day_map = {'Mon': 0, 'Tue': 1, 'Wed': 2, 'Thu': 3, 'Fri': 4, 'Sat': 5, 'Sun': 6}
    market['day'] = market['day'].map(day_map)

    # convert HH:MM to minutes
    time_minutes = market['time'].str.split(':', expand=True).astype(int).apply(lambda x: x[0] * 60 + x[1], axis=1)
    market['time'] = market['day'] * 1440 + time_minutes

    # SUBSITUTE AIRPORTS FROM same_airports DICTIONERY - CONVERT EVERY KEY TO ITS VALUE
    for key, value in airport_substitutions.items():
        market.loc[market['ori'] == key, 'ori'] = value
        market.loc[market['des'] == key, 'des'] = value

    # Add original key
    market['key'] = market['ori'] + '/' + market['des'] + '/' + market['time'].astype(str)

    # WE CREATED DUPLICATES KEY AS A RESULT OF CONVERTING SAME AIRPORTS
    # MERGE THE DEMAND VALUES FOR THE SAME KEY
    market = market.groupby(['key', 'ori', 'des', 'day', 'time']).agg({'demand': 'sum'}).reset_index()
    # Select relevant columns
    market = market[['ori', 'des', 'demand', 'day', 'time', 'key']].copy()

    # Convert types
    market['time'] = market['time'].astype(int)
    market['day'] = market['day'].astype(int)
    market['demand'] = pd.to_numeric(market['demand'], errors='coerce').fillna(0)
    print(f"Market data read: {len(market)} rows.")
    return market

# Current

In [8]:
import pandas as pd
import numpy as np
import heapq
import time
from collections import defaultdict, deque
import sys
import itertools # Still needed for ID generation if we keep _get_or_create_flight_int_id, but not essential if just iterating
import os

same_airports = {
    'DOH': "BAH",
    'SZX': "HKG",
    'CAN': "HKG",
    'STN': 'LHR',
    'LTN': 'LHR'
}

# --- FlightNetwork Class (Sequential Implementation) ---

class FlightNetwork:
    """
    Represents the flight network and finds K-shortest paths sequentially.
    Uses integer IDs for flights and minute-based times.
    Costs represent wait/layover time.
    """

    def __init__(self, min_connection_minutes=60, max_connection_minutes=7*24*60):
        if not (0 <= min_connection_minutes <= max_connection_minutes):
             raise ValueError("Invalid connection time parameters.")
        self.min_connection_minutes = min_connection_minutes
        self.max_connection_minutes = max_connection_minutes
        self.week_minutes = 7 * 24 * 60
        self.flight_int_to_data = {}
        self.flight_str_to_int = {}
        self._next_flight_int_id = 0
        self.airports = set()
        self.demands = {}
        self.flights_by_origin = defaultdict(list)
        self.flights_by_destination = defaultdict(list)
        self.outgoing_edges = defaultdict(list) # int_flight_id -> [(neighbor_int_flight_id, layover_cost)]
        self.source_connections = defaultdict(list) # demand_id_str -> [(first_int_flight_id, wait_cost)]
        self.demand_destinations = {}
        self.stats = {
            "total_flights": 0, "total_demands": 0, "flight_connections": 0,
            "source_connections": 0, "disconnected_sources": 0,
            "total_paths_found": 0, "demands_with_paths": 0,
            "time_data_loading": 0.0, "time_graph_building": 0.0,
            "time_path_finding": 0.0, # Will now be sequential time
        }

    def _get_or_create_flight_int_id(self, flight_str_id):
        """Assigns a unique integer ID to a flight string ID."""
        if flight_str_id not in self.flight_str_to_int:
            int_id = self._next_flight_int_id
            self.flight_str_to_int[flight_str_id] = int_id
            self.flight_int_to_data[int_id] = {'original_str_id': flight_str_id}
            self._next_flight_int_id += 1
            return int_id
        return self.flight_str_to_int[flight_str_id]

    def load_and_process_data(self, schedule_df, demand_df):
        """Load and process data using integer IDs (Sequential)."""
        # (Identical to the previous version's load_and_process_data)
        print("Processing flight schedule...")
        start_time = time.time(); required_schedule_cols = ['ori', 'des', 'day', 'dep_time', 'arr_time', 'cap_kg', 'cap_m3', 'flight_number']
        if not all(col in schedule_df.columns for col in required_schedule_cols): raise ValueError(f"Schedule DataFrame missing required columns: {[c for c in required_schedule_cols if c not in schedule_df.columns]}")
        schedule_df['flight_str_id'] = schedule_df.apply(lambda r: f"{r['flight_number']}_{r['day']}_{r['ori']}_{int(r['dep_time'])}", axis=1)
        if schedule_df['flight_str_id'].duplicated().any(): print("Warning: Duplicate flight string IDs generated. Keeping first."); schedule_df = schedule_df.drop_duplicates(subset=['flight_str_id'], keep='first')
        for _, r in schedule_df.iterrows():
            int_id = self._get_or_create_flight_int_id(r['flight_str_id'])
            data = {'original_str_id': r['flight_str_id'], 'flight_number': r['flight_number'],'origin': r['ori'], 'destination': r['des'], 'day': int(r['day']),'dep_minutes': int(r['dep_time']), 'arr_minutes': int(r['arr_time']),'capacity_kg': r['cap_kg'], 'capacity_m3': r['cap_m3']}
            self.flight_int_to_data[int_id].update(data)
            self.flights_by_origin[r['ori']].append(int_id); self.flights_by_destination[r['des']].append(int_id)
            self.airports.add(r['ori']); self.airports.add(r['des'])
        self.stats["total_flights"] = len(self.flight_int_to_data); load_time_schedule = time.time() - start_time; print(f"Schedule processed ({load_time_schedule:.2f}s)")
        print("Processing demand data...")
        start_time_demand = time.time(); required_demand_cols = ['ori', 'des', 'day', 'time', 'demand', 'key']
        if not all(col in demand_df.columns for col in required_demand_cols): raise ValueError(f"Demand DataFrame missing required columns: {[c for c in required_demand_cols if c not in demand_df.columns]}")
        demand_df['demand_id'] = demand_df['key']
        if demand_df['demand_id'].duplicated().any(): print("Warning: Duplicate demand_ids detected. Keeping first."); demand_df = demand_df.drop_duplicates(subset=['demand_id'], keep='first')
        for _, r in demand_df.iterrows():
            demand_id = r['demand_id']; data = {'id': demand_id, 'origin': r['ori'], 'destination': r['des'],'day': int(r['day']), 'minutes': int(r['time']),'demand': r['demand'], 'key': r['key']}
            self.demands[demand_id] = data; self.demand_destinations[demand_id] = r['des']
        self.stats["total_demands"] = len(self.demands); load_time_demand = time.time() - start_time_demand; print(f"Demand processed ({load_time_demand:.2f}s)")
        self.stats["time_data_loading"] = load_time_schedule + load_time_demand; print(f"Data loaded. Network: {self.stats['total_flights']} flights (int IDs), {len(self.airports)} airports, {self.stats['total_demands']} demands.")
        self._build_connectivity_graph() # Call graph build


    def _calculate_time_diff_with_rollover(self, later_time, earlier_time):
        """Calculates time difference handling weekly rollover."""
        time_diff = later_time - earlier_time
        if time_diff < 0:
            time_diff += self.week_minutes * ((-time_diff + self.week_minutes - 1) // self.week_minutes)
        return time_diff

    def _build_connectivity_graph(self):
        """Build connectivity using integer flight IDs and optimize connections."""
        # (Identical to the previous version's _build_connectivity_graph)
        print("Building connectivity graph (using integer IDs)..."); start_time = time.time(); flight_connections_count = 0
        for airport in self.airports:
            arriving_ids = self.flights_by_destination.get(airport, []); departing_ids = self.flights_by_origin.get(airport, [])
            if not arriving_ids or not departing_ids: continue
            departing_sorted = sorted(departing_ids, key=lambda i: self.flight_int_to_data[i]['dep_minutes'])
            for arr_id in arriving_ids:
                arr_mins = self.flight_int_to_data[arr_id]['arr_minutes']
                for dep_id in departing_sorted:
                    dep_mins = self.flight_int_to_data[dep_id]['dep_minutes']
                    layover = self._calculate_time_diff_with_rollover(dep_mins, arr_mins)
                    if self.min_connection_minutes <= layover <= self.max_connection_minutes:
                        self.outgoing_edges[arr_id].append((dep_id, layover)); flight_connections_count += 1
                    elif layover > self.max_connection_minutes: break # Optimization
        self.stats["flight_connections"] = flight_connections_count
        source_connections_count = 0; disconnected_sources_count = 0
        for d_id, d in self.demands.items():
            origin = d['origin']; d_mins = d['minutes']; has_conn = False
            departing_ids = self.flights_by_origin.get(origin, [])
            for f_id in departing_ids:
                f_mins = self.flight_int_to_data[f_id]['dep_minutes']
                wait = self._calculate_time_diff_with_rollover(f_mins, d_mins)
                if 0 <= wait <= self.max_connection_minutes:
                    self.source_connections[d_id].append((f_id, wait)); source_connections_count += 1; has_conn = True
            if not has_conn: disconnected_sources_count += 1
        self.stats["source_connections"] = source_connections_count; self.stats["disconnected_sources"] = disconnected_sources_count
        self.stats["time_graph_building"] = time.time() - start_time; print(f"Connectivity graph built ({self.stats['time_graph_building']:.2f}s)")
        print(f"  {self.stats['flight_connections']} flight-to-flight connections"); print(f"  {self.stats['source_connections']} source-to-flight connections")
        if self.stats['disconnected_sources'] > 0: print(f"  WARNING: {self.stats['disconnected_sources']} demands ({ (self.stats['disconnected_sources']/self.stats['total_demands'])*100 if self.stats['total_demands'] > 0 else 0 :.1f}%) have NO initial flight connection.")
        else: print("  All demands have at least one potential initial flight connection.")


    # --- Internal Pathfinding Methods (Sequential) ---

    def _dijkstra(self, start_node_id, target_airport, excluded_nodes=None, excluded_edges=None):
        """
        Internal Dijkstra implementation for the class.
        Uses integer IDs for flights. start_node_id can be demand_id (str) or int_flight_id.
        """
        excluded_nodes = excluded_nodes or set() # Should contain int_flight_ids
        excluded_edges = excluded_edges or set() # Should contain tuples (int_id, int_id) or (str_id, int_id)

        pq = [(0, start_node_id)] # (cost, current_node_id (str or int))
        distances = defaultdict(lambda: float('inf'))
        distances[start_node_id] = 0
        previous_nodes = {} # node_id -> predecessor_node_id

        min_target_cost = float('inf')
        final_target_int_flight_id = None # The int_flight_id of the best target found

        while pq:
            current_cost, current_id = heapq.heappop(pq)

            if current_cost > distances[current_id] or current_cost >= min_target_cost:
                 continue

            is_flight = isinstance(current_id, int)
            if is_flight:
                if current_id in excluded_nodes:
                     continue
                flight_data = self.flight_int_to_data.get(current_id)
                if flight_data and flight_data.get('destination') == target_airport:
                    if current_cost < min_target_cost:
                        min_target_cost = current_cost
                        final_target_int_flight_id = current_id

            # Determine outgoing edges based on node type
            if isinstance(current_id, str): # Must be the starting demand_id
                 edges = self.source_connections.get(current_id, []) # Edges are (int_flight_id, cost)
            elif is_flight:
                 edges = self.outgoing_edges.get(current_id, []) # Edges are (int_flight_id, cost)
            else:
                 edges = []

            for neighbor_int_id, weight in edges:
                edge = (current_id, neighbor_int_id) # Edge represented by (int_id, int_id) or (str_id, int_id)
                if neighbor_int_id in excluded_nodes: continue
                if is_flight and edge in excluded_edges: continue # Check edge exclusion only for flight->flight

                new_cost = current_cost + weight
                if new_cost < distances[neighbor_int_id]:
                    distances[neighbor_int_id] = new_cost
                    previous_nodes[neighbor_int_id] = current_id # Predecessor can be str or int
                    heapq.heappush(pq, (new_cost, neighbor_int_id))

        # Reconstruct path (will contain int_flight_ids)
        if final_target_int_flight_id is not None:
            path = deque()
            curr = final_target_int_flight_id
            while curr != start_node_id:
                 path.appendleft(curr)
                 if curr not in previous_nodes:
                     print(f"Error (Dijkstra): Path reconstruction failed. Predecessor missing for {curr}.")
                     return float('inf'), []
                 curr = previous_nodes[curr]
            return min_target_cost, list(path)
        else:
            return float('inf'), []

    def _calculate_path_cost(self, demand_id_str, path_int_flight_ids):
        """Internal method to calculate cost for a path of integer flight IDs."""
        if demand_id_str not in self.demands: return float('inf')
        if not path_int_flight_ids: return 0
        total_cost = 0.0; current_node_id = demand_id_str
        for i, int_flight_id in enumerate(path_int_flight_ids):
            edge_found = False
            if i == 0: edges_to_search = self.source_connections.get(current_node_id, [])
            else:
                if not isinstance(current_node_id, int): return float('inf') # Should be flight ID now
                edges_to_search = self.outgoing_edges.get(current_node_id, [])
            for neighbor_int_id, weight in edges_to_search:
                if neighbor_int_id == int_flight_id:
                    total_cost += weight; current_node_id = int_flight_id; edge_found = True; break
            if not edge_found:
                print(f"Error (Cost Calc): Connection not found for leg {i+1} of demand {demand_id_str}")
                return float('inf')
        return total_cost

    def find_k_shortest_paths(self, demand_id_str, k):
        """
        Internal method using Yen's algorithm (sequential).
        Calls internal _dijkstra and _calculate_path_cost.
        """
        if demand_id_str not in self.demands: return []
        if k <= 0: return []

        target_airport = self.demand_destinations.get(demand_id_str)
        if not target_airport: return [] # Should not happen

        A = []; B_heap = []; B_paths_set = set() # Use set for faster duplicate check in heap

        cost1, path1_ints = self._dijkstra(demand_id_str, target_airport)
        if cost1 == float('inf'): return [] # No path exists
        A.append((cost1, path1_ints))

        for i in range(1, k): # Yen's loop
            if i - 1 >= len(A): break # Found fewer than k paths so far
            prev_cost, prev_path_ints = A[i-1]
            for spur_node_idx in range(len(prev_path_ints)):
                spur_node_int_id = prev_path_ints[spur_node_idx]
                root_path_ints = prev_path_ints[:spur_node_idx]
                root_cost = self._calculate_path_cost(demand_id_str, root_path_ints)
                if root_cost == float('inf'): continue

                excluded_nodes = set(root_path_ints)
                excluded_edges = set()
                for _, path_a_ints in A:
                    if len(path_a_ints) > spur_node_idx and path_a_ints[:spur_node_idx] == root_path_ints:
                        if path_a_ints[spur_node_idx] == spur_node_int_id:
                            if spur_node_idx + 1 < len(path_a_ints):
                                excluded_edges.add((spur_node_int_id, path_a_ints[spur_node_idx + 1]))

                # Call internal Dijkstra for spur path
                spur_cost, spur_path_segment_ints = self._dijkstra(
                    spur_node_int_id, target_airport, excluded_nodes, excluded_edges
                )

                if spur_cost != float('inf'):
                    full_path_ints = root_path_ints + [spur_node_int_id] + spur_path_segment_ints
                    total_cost = root_cost + spur_cost
                    full_path_tuple = tuple(full_path_ints)
                    if full_path_tuple not in B_paths_set:
                        heapq.heappush(B_heap, (total_cost, full_path_ints))
                        B_paths_set.add(full_path_tuple)

            found_next = False
            while B_heap:
                best_cost, best_path_ints = heapq.heappop(B_heap)
                B_paths_set.discard(tuple(best_path_ints)) # Use discard
                is_in_A = any(p == best_path_ints for c, p in A)
                if not is_in_A:
                    A.append((best_cost, best_path_ints)); found_next = True; break
            if not found_next: break # No more distinct paths found in heap

        # Return list of (cost, path_list_of_int_ids)
        return A

    # --- Main Sequential Path Finding Method ---

    def find_all_k_shortest_paths_sequential(self, k=3):
        """
        Finds up to K shortest paths for all demands sequentially.
        """
        overall_start_time = time.time()
        all_paths_results = {} # demand_id_str -> list[list[int_flight_id]]
        num_demands = len(self.demands)
        if num_demands == 0: return {}

        print(f"\nFinding up to {k} shortest paths for {num_demands} demands (sequentially)...")

        processed_count = 0
        demands_with_paths_count = 0
        total_paths_found_count = 0
        report_interval = max(1, num_demands // 20) # Progress reporting

        for demand_id in self.demands.keys():
            # Call the internal find_k_shortest_paths method
            paths_with_costs = self.find_k_shortest_paths(demand_id, k) # Gets list of (cost, path)

            if paths_with_costs:
                # Store only the paths (list of lists of int IDs)
                paths_list_ints = [path for cost, path in paths_with_costs]
                all_paths_results[demand_id] = paths_list_ints
                demands_with_paths_count += 1
                total_paths_found_count += len(paths_list_ints)

            processed_count += 1
            if processed_count % report_interval == 0 or processed_count == num_demands:
                elapsed = time.time() - overall_start_time
                print(f"  Processed {processed_count}/{num_demands} demands... ({elapsed:.2f}s)")

        # Stats updates and Reporting
        total_time = time.time() - overall_start_time
        self.stats["total_paths_found"] = total_paths_found_count
        self.stats["demands_with_paths"] = demands_with_paths_count
        self.stats["time_path_finding"] = total_time
        print(f"\nSequential path finding finished in {total_time:.2f}s")
        # --- Connectivity Reporting ---
        print("\nConnectivity Summary:")
        print(f"  Total demands processed: {num_demands}")
        disconnected_start = self.stats['disconnected_sources']
        if disconnected_start > 0: pct_disconnected_start = (disconnected_start / num_demands) * 100 if num_demands > 0 else 0; print(f"  Demands with NO initial flight connection: {disconnected_start} ({pct_disconnected_start:.1f}%)")
        else: print("  All demands had at least one potential initial flight connection.")
        demands_without_paths = num_demands - demands_with_paths_count
        pct_with_paths = (demands_with_paths_count / num_demands) * 100 if num_demands > 0 else 0
        pct_without_paths = 100.0 - pct_with_paths
        print(f"  Demands for which at least 1 path was found: {demands_with_paths_count} ({pct_with_paths:.1f}%)")
        print(f"  Demands for which NO path was found (up to K): {demands_without_paths} ({pct_without_paths:.1f}%)")
        print(f"  Total individual paths generated (up to K per demand): {total_paths_found_count}")
        if demands_with_paths_count > 0: avg_paths = total_paths_found_count / demands_with_paths_count; print(f"  Average paths per connected demand: {avg_paths:.2f}")

        return all_paths_results


    def get_paths_as_dataframe(self, paths_dict_ints):
        """Converts dict of paths (int IDs) to DataFrame (Sequential)."""
        # (Identical to the previous version's get_paths_as_dataframe)
        print("\nConverting paths dictionary (int IDs) to DataFrame...")
        start_convert_time = time.time(); data = []
        if not paths_dict_ints: return pd.DataFrame()
        for demand_id, path_list_ints in paths_dict_ints.items():
            if not path_list_ints or demand_id not in self.demands: continue
            demand = self.demands[demand_id]
            for i, path_ints in enumerate(path_list_ints):
                if not path_ints: continue
                # Use internal cost calculation
                path_cost = self._calculate_path_cost(demand_id, path_ints)
                if path_cost == float('inf'): continue
                try:
                    last_id = path_ints[-1]; final_arrival = self.flight_int_to_data[last_id]['arr_minutes']
                    duration = self._calculate_time_diff_with_rollover(final_arrival, demand['minutes'])
                except (KeyError, IndexError): final_arrival = np.nan; duration = np.nan
                row = {"demand_id": demand_id, "demand_key": demand.get("key", demand_id),"path_rank": i + 1, "path_id": f"{demand_id}_p{i+1}","origin": demand["origin"], "destination": demand["destination"],"demand_kg": demand.get("demand", np.nan),"path_cost_minutes": int(path_cost) if not np.isnan(path_cost) else np.nan,"path_duration_minutes": int(duration) if not np.isnan(duration) else np.nan,"num_flights": len(path_ints),"final_arrival_minutes": int(final_arrival) if not np.isnan(final_arrival) else np.nan,"flight_int_ids": path_ints}
                data.append(row)
        if not data: print("No valid paths found to convert to DataFrame."); return pd.DataFrame()
        df = pd.DataFrame(data); cols_order = ["demand_id", "demand_key", "path_rank", "path_id", "origin", "destination","demand_kg", "path_cost_minutes", "path_duration_minutes", "num_flights","final_arrival_minutes", "flight_int_ids"]
        df = df[[col for col in cols_order if col in df.columns]]; print(f"DataFrame created with {len(df)} paths in {time.time() - start_convert_time:.2f}s"); return df


# =============================================
# Main Execution Block
# =============================================

def main():
    """Main function using sequential pathfinding."""
    overall_start_time = time.time()
    K_PATHS = 3; MIN_CONNECT_MINS = 60; MAX_CONNECT_MINS = 6 * 24 * 60
    CAPACITY_FILE = 'files/capacity.csv'; MARKET_FILE = 'files/market.csv'
    # MAX_WORKERS = None # No longer needed

    print("-" * 30)
    schedule_df = read_capacity(CAPACITY_FILE, airport_substitutions=same_airports)
    demand_df = read_market(MARKET_FILE, airport_substitutions=same_airports)
    print("-" * 30)

    if schedule_df.empty or demand_df.empty: return None, None

    print("Initializing Flight Network...")
    network = FlightNetwork(min_connection_minutes=MIN_CONNECT_MINS, max_connection_minutes=MAX_CONNECT_MINS)
    network.load_and_process_data(schedule_df, demand_df)
    print("-" * 30)

    # --- Find K Shortest Paths (Sequential) ---
    all_paths_dict_ints = network.find_all_k_shortest_paths_sequential(k=K_PATHS) # Call sequential method
    # Summary is printed inside the method
    print("-" * 30)

    # --- Process Results ---
    paths_df = network.get_paths_as_dataframe(all_paths_dict_ints)
    if not paths_df.empty:
        print("\nDataFrame Head (First 5 paths):")
        print(paths_df[['demand_id', 'path_rank', 'path_cost_minutes', 'path_duration_minutes', 'num_flights', 'flight_int_ids']].head())
    elif not all_paths_dict_ints: print("\nNo paths were found for any demand.")
    else: print("\nPaths dictionary generated, but DataFrame is empty.") # Should be caught by df empty check

    print(f"\nTotal execution time: {time.time() - overall_start_time:.2f} seconds")
    print("Final Network Stats:")
    for stat, value in network.stats.items():
        if isinstance(value, float): print(f"  {stat}: {value:.2f}s")
        else: print(f"  {stat}: {value}")
    print("-" * 30)
    return network, paths_df


if __name__ == "__main__":
    # Dummy file creation (Unchanged)
    if not os.path.exists('files'): os.makedirs('files')
    if not os.path.exists('files/capacity.csv'):
        pd.DataFrame({'Flight Number': ['FL100', 'FL101', 'FL102'], 'Orig': ['AAA', 'BBB', 'AAA'], 'Dest': ['BBB', 'CCC', 'BBB'],'Weekday_Z': ['Mon', 'Mon', 'Tue'], 'deptime': ['2024-01-01 10:00', '2024-01-01 14:00', '2024-01-02 09:00'],'arrtime': ['2024-01-01 12:00', '2024-01-01 16:00', '2024-01-02 11:00'],'Net Payload': [1000, 1000, 500], 'Net Volume': [10, 10, 5]}).to_csv('files/capacity.csv', index=False)
        print("Created dummy files/capacity.csv")
    if not os.path.exists('files/market.csv'):
         pd.DataFrame({'origin': ['AAA', 'AAA'], 'destination': ['CCC', 'BBB'], 'Market CHW': [100, 50],'Day': ['Mon', 'Tue'], 'Time': ['08:00', '07:00']}).to_csv('files/market.csv', index=False, sep=';')
         print("Created dummy files/market.csv")

    print("Starting K-Shortest Path Calculation (Sequential)...")
    network_obj, paths_df_result = main()

    if network_obj:
        print("\nScript finished.")
        if paths_df_result is not None and not paths_df_result.empty:
             print(f"Total paths generated in DataFrame: {len(paths_df_result)}")
        elif paths_df_result is not None:
             print("DataFrame was generated but is empty.")
        else:
             print("DataFrame generation failed.") # Indicates None was returned from main
    else:
        print("\nScript finished with errors (likely during data loading or network init).")

Starting K-Shortest Path Calculation (Sequential)...
------------------------------
Capacity data read: 2057 rows.
Market data read: 3185 rows.
------------------------------
Initializing Flight Network...
Processing flight schedule...
Schedule processed (0.04s)
Processing demand data...
Demand processed (0.04s)
Data loaded. Network: 2057 flights (int IDs), 109 airports, 3185 demands.
Building connectivity graph (using integer IDs)...
Connectivity graph built (0.05s)
  80665 flight-to-flight connections
  189207 source-to-flight connections
  All demands have at least one potential initial flight connection.
------------------------------

Finding up to 3 shortest paths for 3185 demands (sequentially)...
  Processed 159/3185 demands... (0.31s)
  Processed 318/3185 demands... (0.99s)
  Processed 477/3185 demands... (1.75s)
  Processed 636/3185 demands... (2.20s)
  Processed 795/3185 demands... (2.92s)
  Processed 954/3185 demands... (3.51s)
  Processed 1113/3185 demands... (4.07s)
  Pro

In [5]:
paths_df_result

Unnamed: 0,demand_id,demand_key,path_rank,path_id,origin,destination,demand_kg,path_cost_minutes,path_duration_minutes,num_flights,final_arrival_minutes,flight_int_ids
0,AMS/LAX/1080,AMS/LAX/1080,1,AMS/LAX/1080_p1,AMS,LAX,4366.625,18753,9618,3,618,"[34, 1356, 508]"
1,AMS/LAX/1080,AMS/LAX/1080,2,AMS/LAX/1080_p2,AMS,LAX,4366.625,18960,9830,3,830,"[34, 1356, 524]"
2,AMS/LAX/1080,AMS/LAX/1080,3,AMS/LAX/1080_p3,AMS,LAX,4366.625,19625,410,3,1490,"[34, 1356, 496]"
3,AMS/LAX/2520,AMS/LAX/2520,1,AMS/LAX/2520_p1,AMS,LAX,2481.625,17313,8178,3,618,"[34, 1356, 508]"
4,AMS/LAX/2520,AMS/LAX/2520,2,AMS/LAX/2520_p2,AMS,LAX,2481.625,17520,8390,3,830,"[34, 1356, 524]"
...,...,...,...,...,...,...,...,...,...,...,...,...
8111,YHM/VIT/8280,YHM/VIT/8280,2,YHM/VIT/8280_p2,YHM,VIT,3.400,12810,3490,3,1690,"[2042, 634, 1343]"
8112,YHM/VIT/8280,YHM/VIT/8280,3,YHM/VIT/8280_p3,YHM,VIT,3.400,12810,3490,3,1690,"[2042, 633, 1343]"
8113,YHM/VIT/9720,YHM/VIT/9720,1,YHM/VIT/9720_p1,YHM,VIT,9.600,11370,2050,3,1690,"[2042, 632, 1343]"
8114,YHM/VIT/9720,YHM/VIT/9720,2,YHM/VIT/9720_p2,YHM,VIT,9.600,11370,2050,3,1690,"[2042, 634, 1343]"


1. The time calculations are incorrect, make sure to not use days and time but just the minutes after start, where day 0 time 0 is the start...

# Nicelly working

In [15]:
def read_capacity():
    capacity = pd.read_csv('files/capacity.csv', sep=';')
    # turn STD-HH:MM into dep_time datetime, and START-HH:MM into arr_time datetime , datetime64[ns]!!!
    capacity['dep_time'] = capacity['STD-HH:MM'].str.replace(':', '').astype(int)
    capacity['arr_time'] = capacity['STA-HH:MM'].str.replace(':', '').astype(int)
    # convert Weekday_Z into Day which is just a number, so turn Mon into 0, Tue into 1, ..., Sun into 6
    capacity['day'] = capacity['Weekday_Z'].map({'Mon': 0, 'Tue': 1, 'Wed': 2, 'Thu': 3, 'Fri': 4, 'Sat': 5, 'Sun': 6})
    capacity['date_time'] = capacity['day'].astype(str) + '/' + capacity['dep_time'].astype(str)
    # delete the columns that are not needed, reorder the columns
    rename_columns = {'Net Payload': 'cap_kg', 'Net Volume': 'cap_m3', 'Orig':'ori', 'Dest':'des', 'Flight Number':'flight_number', 'A/C':'aircraft_type'}
    capacity = capacity.rename(columns=rename_columns)
    columns = ['flight_number','ori','des','aircraft_type' ,'dep_time', 'arr_time', 'day', 'date_time','cap_kg', 'cap_m3', 'key']
    capacity['key'] = capacity['ori'] + '/'+ capacity['des'] +'/'+ capacity['date_time']
    capacity = capacity[columns]
    return capacity

def read_market():
    market = pd.read_csv('files/market.csv', sep=';')
    market = market.rename(columns={'origin':'ori', 'destination':'des', 'Market CHW': 'demand', 'Day':'day', 'Time':'time'})
    market.drop(columns=['Market Allin Yield','product'], inplace=True)
    # turn time into just a 4 digit number
    market['time'] = market['time'].str.replace(':', '').astype(int)
    market['day'] = market['day'].map({'Mon': 0, 'Tue': 1, 'Wed': 2, 'Thu': 3, 'Fri': 4, 'Sat': 5, 'Sun': 6})
    market['date_time'] = market['day'].astype(str) + '/' + market['time'].astype(str)
    market['key'] = market['ori'] + '/'+ market['des'] +'/'+ market['date_time']
    return market

In [17]:
import pandas as pd
import numpy as np
import networkx as nx
from collections import defaultdict
import heapq
import time
from itertools import islice

class FlightNetwork:
    """
    An optimized representation of the flight network for efficient path finding.
    """

    def __init__(self, connection_window_hours=48, min_connection_minutes=60):
        """
        Initialize the flight network.
        
        Args:
            connection_window_hours: Maximum time window for connections in hours
            min_connection_minutes: Minimum connection time in minutes
        """
        self.connection_window_minutes = connection_window_hours * 60
        self.min_connection_minutes = min_connection_minutes
        
        # Core data structures
        self.flights = {}  # flight_id -> flight_data
        self.airports = set()  # Set of all airports
        self.demands = {}  # demand_id -> demand_data
        
        # Index structures for fast lookups
        self.flights_by_origin = defaultdict(list)
        self.flights_by_destination = defaultdict(list)
        self.flights_by_day_time = defaultdict(list)  # (day, time) -> [flight_ids]
        
        # Path cache to avoid recomputing
        self.path_cache = {}
        
        # Convert time to total minutes for faster comparison
        self.time_to_minutes_cache = {}
    
    def time_to_minutes(self, day, time):
        """
        Convert day and time to total minutes since start of week.
        Cache results for better performance.
        """
        key = (day, time)
        if key not in self.time_to_minutes_cache:
            # Convert time (HHMM) to hours and minutes
            hours = time // 100
            minutes = time % 100
            
            # Total minutes since start of week
            self.time_to_minutes_cache[key] = (day * 24 * 60) + (hours * 60) + minutes
            
        return self.time_to_minutes_cache[key]
    
    def load_data(self, schedule_df, demand_df):
        """
        Load and preprocess the schedule and demand data efficiently.
        """
        start_time = time.time()
        
        # Process schedule data
        for _, flight in schedule_df.iterrows():
            flight_id = f"{flight['flight_number']}_{flight['day']}_{flight['dep_time']}"
            
            # Precompute flight times in minutes for faster comparisons
            dep_minutes = self.time_to_minutes(flight['day'], flight['dep_time'])
            arr_minutes = self.time_to_minutes(flight['day'], flight['arr_time'])
            
            # Handle overnight flights (arrival time < departure time)
            if arr_minutes < dep_minutes:
                arr_minutes += 24 * 60  # Add a day's worth of minutes
            
            flight_data = {
                'id': flight_id,
                'flight_number': flight['flight_number'],
                'origin': flight['ori'],
                'destination': flight['des'],
                'day': flight['day'],
                'dep_time': flight['dep_time'],
                'arr_time': flight['arr_time'],
                'dep_minutes': dep_minutes,
                'arr_minutes': arr_minutes,
                'capacity_kg': flight['cap_kg'],
                'capacity_m3': flight['cap_m3']
            }
            
            self.flights[flight_id] = flight_data
            self.flights_by_origin[flight['ori']].append(flight_id)
            self.flights_by_destination[flight['des']].append(flight_id)
            self.flights_by_day_time[(flight['day'], flight['dep_time'])].append(flight_id)
            
            self.airports.add(flight['ori'])
            self.airports.add(flight['des'])
        
        # Sort flight lists by departure time for faster sequential access
        for airport in self.flights_by_origin:
            self.flights_by_origin[airport].sort(
                key=lambda f_id: self.flights[f_id]['dep_minutes']
            )
        
        for airport in self.flights_by_destination:
            self.flights_by_destination[airport].sort(
                key=lambda f_id: self.flights[f_id]['arr_minutes']
            )
        
        # Process demand data
        for _, demand in demand_df.iterrows():
            demand_id = f"{demand['ori']}_{demand['des']}_{demand['day']}_{demand['time']}"
            
            demand_minutes = self.time_to_minutes(demand['day'], demand['time'])
            
            demand_data = {
                'id': demand_id,
                'origin': demand['ori'],
                'destination': demand['des'],
                'day': demand['day'],
                'time': demand['time'],
                'minutes': demand_minutes,
                'demand': demand['demand'],
                'key': demand['key']
            }
            
            self.demands[demand_id] = demand_data
        
        print(f"Data loaded and indexed in {time.time() - start_time:.2f} seconds")
        print(f"Network has {len(self.flights)} flights and {len(self.demands)} demands across {len(self.airports)} airports")
    
    def build_adjacency_lists(self):
        """
        Build optimized adjacency lists for fast path finding.
        This precomputes valid connections between flights.
        """
        start_time = time.time()
        
        # Initialize adjacency lists
        self.outgoing_edges = defaultdict(list)
        self.source_connections = defaultdict(list)
        self.sink_connections = defaultdict(dict)
        
        # Build flight-to-flight connections efficiently
        # Group flights by destination for faster processing
        for origin_airport in self.flights_by_origin:
            # Get all flights departing from this origin
            arriving_flights = set()
            
            for flight1_id in self.flights_by_destination[origin_airport]:
                flight1 = self.flights[flight1_id]
                arriving_flights.add(flight1_id)
                
                # Find potential connecting flights efficiently
                # Only consider flights that depart after flight1 arrives + min connection time
                min_dep_minutes = flight1['arr_minutes'] + self.min_connection_minutes
                max_dep_minutes = flight1['arr_minutes'] + self.connection_window_minutes
                
                for flight2_id in self.flights_by_origin[origin_airport]:
                    flight2 = self.flights[flight2_id]
                    
                    # Fast time comparison using precomputed minutes
                    if min_dep_minutes <= flight2['dep_minutes'] <= max_dep_minutes:
                        wait_time = flight2['dep_minutes'] - flight1['arr_minutes']
                        
                        # Add connection to adjacency list
                        self.outgoing_edges[flight1_id].append((flight2_id, wait_time))

        # Build source-to-flight connections (demand to flight)
        for demand_id, demand in self.demands.items():
            origin = demand['origin']
            dest = demand['destination']
            demand_minutes = demand['minutes']
            
            # Optimization: Only consider flights from the same origin
            for flight_id in self.flights_by_origin[origin]:
                flight = self.flights[flight_id]
                
                # Check if flight departs after demand time but within window
                if demand_minutes <= flight['dep_minutes'] <= demand_minutes + self.connection_window_minutes:
                    wait_time = flight['dep_minutes'] - demand_minutes
                    self.source_connections[demand_id].append((flight_id, wait_time))
            
            # Track sink connections for this demand
            self.sink_connections[demand_id] = dest
        
        print(f"Adjacency lists built in {time.time() - start_time:.2f} seconds")
        
        # Analyze connectivity
        total_connections = sum(len(edges) for edges in self.outgoing_edges.values())
        total_source_connections = sum(len(conns) for conns in self.source_connections.values())
        
        print(f"Network has {total_connections} flight-to-flight connections")
        print(f"Network has {total_source_connections} source-to-flight connections")
        
        # Check for sources with no connections
        disconnected_sources = [d_id for d_id in self.demands if not self.source_connections[d_id]]
        print(f"Found {len(disconnected_sources)} disconnected sources ({len(disconnected_sources)/len(self.demands):.1%} of total)")
    
    def find_shortest_path(self, demand_id):
        """
        Find the shortest path for a given demand using optimized Dijkstra's algorithm.
        """
        if demand_id not in self.demands:
            return None
        
        # Check cache first
        if demand_id in self.path_cache:
            return self.path_cache[demand_id]
        
        demand = self.demands[demand_id]
        target_airport = demand['destination']
        
        # No outgoing connections for this source
        if not self.source_connections[demand_id]:
            return None
        
        # Initialize Dijkstra's algorithm
        distances = {demand_id: 0}  # Source to itself has distance 0
        previous = {}
        pq = [(0, demand_id)]  # Priority queue (min-heap) of (distance, node)
        visited = set()
        found_sink = False
        
        while pq and not found_sink:
            dist, current = heapq.heappop(pq)
            
            if current in visited:
                continue
            
            visited.add(current)
            
            # If we've reached a flight going to the target destination, we're done
            if current in self.flights:
                flight = self.flights[current]
                if flight['destination'] == target_airport:
                    found_sink = True
                    break
            
            # Process outgoing edges
            if current == demand_id:
                # Source connections
                edges = self.source_connections[current]
            elif current in self.flights:
                # Flight-to-flight connections
                edges = self.outgoing_edges[current]
            else:
                edges = []
            
            for neighbor, weight in edges:
                if neighbor in visited:
                    continue
                
                alt = distances[current] + weight
                if neighbor not in distances or alt < distances[neighbor]:
                    distances[neighbor] = alt
                    previous[neighbor] = current
                    heapq.heappush(pq, (alt, neighbor))
        
        # Reconstruct path if we found one
        if found_sink:
            # Find the flight to target destination with minimum total distance
            min_dist = float('inf')
            end_node = None
            
            for node in visited:
                if node in self.flights and self.flights[node]['destination'] == target_airport:
                    if distances[node] < min_dist:
                        min_dist = distances[node]
                        end_node = node
            
            path = []
            while end_node:
                path.append(end_node)
                end_node = previous.get(end_node)
            
            path.reverse()  # Reverse to get source-to-sink order
            
            # Cache the result
            self.path_cache[demand_id] = path
            return path
        
        # No path found
        self.path_cache[demand_id] = None
        return None
    
    def find_k_shortest_paths(self, demand_id, k=3):
        """
        Find up to k shortest paths for a demand using Yen's algorithm.
        This is more efficient than NetworkX's implementation for large networks.
        """
        if demand_id not in self.demands:
            return []
        
        demand = self.demands[demand_id]
        target_airport = demand['destination']
        
        # Find first shortest path
        A = [self.find_shortest_path(demand_id)]
        if not A[0]:
            return []
        
        # Initialize potential paths
        B = []
        
        # Find k-1 more paths
        for k_idx in range(1, k):
            # Previous path
            prev_path = A[-1]
            
            # Try deviation at each node in the previous path (except the last)
            for i in range(len(prev_path) - 1):
                spur_node = prev_path[i]
                root_path = prev_path[:i+1]
                
                # Remove edges in root_path from graph temporarily
                excluded_edges = set()
                for path_idx in range(len(A)):
                    curr_path = A[path_idx]
                    if len(curr_path) > i + 1 and curr_path[:i+1] == root_path:
                        # Exclude this edge to find alternative path
                        excluded_edges.add((curr_path[i], curr_path[i+1]))
                
                # Find shortest path from spur node to sink with excluded edges
                spur_path = self._find_spur_path(spur_node, target_airport, excluded_edges)
                if not spur_path:
                    continue
                
                # Build the complete path
                complete_path = root_path + spur_path[1:]
                
                # Add to candidates if not already there
                if complete_path not in B:
                    # Calculate path length (sum of wait times)
                    path_length = self._calculate_path_length(complete_path)
                    heapq.heappush(B, (path_length, complete_path))
            
            # No more candidates
            if not B:
                break
            
            # Add the next best path to A
            _, next_path = heapq.heappop(B)
            A.append(next_path)
        
        return A
    
    def _find_spur_path(self, start_node, target_airport, excluded_edges):
        """Helper method for Yen's k-shortest paths algorithm"""
        # This is a simplified version - in a production system, you'd want to 
        # make a temporary copy of the graph excluding certain edges
        
        # For our purposes, we'll use a modified Dijkstra that avoids excluded edges
        distances = {start_node: 0}
        previous = {}
        pq = [(0, start_node)]
        visited = set()
        found_sink = False
        
        while pq and not found_sink:
            dist, current = heapq.heappop(pq)
            
            if current in visited:
                continue
            
            visited.add(current)
            
            # If we've reached a flight going to the target destination, we're done
            if current in self.flights and self.flights[current]['destination'] == target_airport:
                found_sink = True
                break
            
            # Process outgoing edges
            if current == start_node and start_node in self.demands:
                # Start node is a demand source
                edges = self.source_connections[current]
            elif current in self.flights:
                # Flight-to-flight connections
                edges = self.outgoing_edges[current]
            else:
                edges = []
            
            for neighbor, weight in edges:
                # Skip excluded edges
                if (current, neighbor) in excluded_edges:
                    continue
                    
                if neighbor in visited:
                    continue
                
                alt = distances[current] + weight
                if neighbor not in distances or alt < distances[neighbor]:
                    distances[neighbor] = alt
                    previous[neighbor] = current
                    heapq.heappush(pq, (alt, neighbor))
        
        # Reconstruct path if we found one
        if found_sink:
            # Find the flight to target destination with minimum total distance
            min_dist = float('inf')
            end_node = None
            
            for node in visited:
                if node in self.flights and self.flights[node]['destination'] == target_airport:
                    if distances[node] < min_dist:
                        min_dist = distances[node]
                        end_node = node
            
            path = []
            while end_node:
                path.append(end_node)
                end_node = previous.get(end_node)
            
            path.reverse()  # Reverse to get source-to-sink order
            return path
        
        return None
    
    def _calculate_path_length(self, path):
        """Calculate the total length (wait time) of a path"""
        if not path or len(path) < 2:
            return float('inf')
            
        total_length = 0
        for i in range(len(path) - 1):
            current, next_node = path[i], path[i+1]
            
            # Find the edge weight between these nodes
            if current in self.demands:
                # Source to flight
                for neighbor, weight in self.source_connections[current]:
                    if neighbor == next_node:
                        total_length += weight
                        break
            else:
                # Flight to flight
                for neighbor, weight in self.outgoing_edges[current]:
                    if neighbor == next_node:
                        total_length += weight
                        break
        
        return total_length
    
    def find_all_paths(self, k=3):
        """
        Find k shortest paths for all demands.
        Returns a dictionary {demand_id: [path1, path2, ...]}
        """
        start_time = time.time()
        
        results = {}
        for i, (demand_id, demand) in enumerate(self.demands.items()):
            paths = self.find_k_shortest_paths(demand_id, k)
            results[demand_id] = paths
            
            # Print progress every 100 demands
            if (i + 1) % 100 == 0:
                elapsed = time.time() - start_time
                print(f"Processed {i+1}/{len(self.demands)} demands in {elapsed:.2f}s ({(i+1)/elapsed:.1f} demands/s)")
        
        total_time = time.time() - start_time
        avg_time = total_time / len(self.demands) if self.demands else 0
        
        print(f"Found paths for all {len(self.demands)} demands in {total_time:.2f}s")
        print(f"Average time: {avg_time*1000:.2f}ms per demand")
        
        # Calculate some statistics
        demands_with_paths = sum(1 for paths in results.values() if paths)
        avg_paths_per_demand = sum(len(paths) for paths in results.values()) / len(self.demands)
        
        print(f"{demands_with_paths}/{len(self.demands)} demands have at least one path ({demands_with_paths/len(self.demands):.1%})")
        print(f"Average {avg_paths_per_demand:.2f} paths per demand")
        
        return results
    
    def format_path(self, demand_id, path):
        """Format a path for human-readable display"""
        if not path:
            return "No path found"
        
        result = []
        demand = self.demands[demand_id]
        
        # First node is the source
        result.append(f"Source: {demand['origin']} → {demand['destination']} (Day {demand['day']}, Time {demand['time']})")
        
        # Rest are flights
        for node in path:
            if node in self.flights:
                flight = self.flights[node]
                result.append(f"Flight: {flight['flight_number']} from {flight['origin']} → {flight['destination']} " +
                              f"(Day {flight['day']}, Dep {flight['dep_time']}, Arr {flight['arr_time']})")
        
        return "\n".join(result)





In [18]:
def benchmark_path_finding(network, sample_size=100):
    """Benchmark the path finding performance"""
    import random
    
    # Sample random demands
    if len(network.demands) <= sample_size:
        demand_sample = list(network.demands.keys())
    else:
        demand_sample = random.sample(list(network.demands.keys()), sample_size)
    
    print(f"Benchmarking path finding with {len(demand_sample)} random demands...")
    
    # Benchmark single path finding
    start_time = time.time()
    paths_found = 0
    
    for demand_id in demand_sample:
        path = network.find_shortest_path(demand_id)
        if path:
            paths_found += 1
    
    single_path_time = time.time() - start_time
    print(f"Found {paths_found}/{len(demand_sample)} single shortest paths in {single_path_time:.2f}s")
    print(f"Average: {single_path_time*1000/len(demand_sample):.2f}ms per demand")
    
    # Benchmark k-shortest paths
    k = 3
    start_time = time.time()
    total_paths = 0
    
    for demand_id in demand_sample:
        paths = network.find_k_shortest_paths(demand_id, k)
        total_paths += len(paths)
    
    k_paths_time = time.time() - start_time
    print(f"Found {total_paths} paths (max {k} per demand) in {k_paths_time:.2f}s")
    print(f"Average: {k_paths_time*1000/len(demand_sample):.2f}ms per demand")

In [21]:
def main():
    start_time = time.time()
    
    # Read data
    schedule_df = read_capacity()
    demand_df = read_market()
    
    print(f"Data loaded in {time.time() - start_time:.2f}s")
    print(f"Schedule has {len(schedule_df)} flights")
    print(f"Demand has {len(demand_df)} entries")
    
    # Create and populate our optimized network structure
    network = FlightNetwork(connection_window_hours=48, min_connection_minutes=60)
    network.load_data(schedule_df, demand_df)
    network.build_adjacency_lists()
    
    # Benchmark path finding performance
    benchmark_path_finding(network, sample_size=min(100, len(network.demands)))
    
    # Sample some demands and find their paths
    sample_demands = list(islice(network.demands.items(), 5))
    
    for demand_id, demand in sample_demands:
        print(f"\nFinding paths for demand {demand['key']} ({demand['origin']} → {demand['destination']}, {demand['demand']} kg)")
        
        # Find k shortest paths
        paths = network.find_k_shortest_paths(demand_id, k=3)
        
        print(f"Found {len(paths)} paths:")
        for i, path in enumerate(paths):
            print(f"\nPath {i+1}:")
            print(network.format_path(demand_id, path))
    
    print(f"\nTotal execution time: {time.time() - start_time:.2f}s")
    
    return network, schedule_df, demand_df

if __name__ == "__main__":
    network, schedule_df, demand_df = main()

Data loaded in 0.01s
Schedule has 1731 flights
Demand has 3689 entries
Data loaded and indexed in 0.10 seconds
Network has 1731 flights and 3689 demands across 102 airports
Adjacency lists built in 0.02 seconds
Network has 37197 flight-to-flight connections
Network has 44845 source-to-flight connections
Found 417 disconnected sources (11.3% of total)
Benchmarking path finding with 100 random demands...
Found 58/100 single shortest paths in 0.02s
Average: 0.20ms per demand
Found 170 paths (max 3 per demand) in 0.18s
Average: 1.78ms per demand

Finding paths for demand DXB/SIN/0/1800 (DXB → SIN, 3004.1475 kg)
Found 3 paths:

Path 1:
Source: DXB → SIN (Day 0, Time 1800)
Flight: ES262 from DXB → BAH (Day 0, Dep 1900, Arr 2025)
Flight: ES265 from BAH → DXB (Day 0, Dep 2225, Arr 2350)
Flight: ES547 from DXB → LHE (Day 1, Dep 110, Arr 415)
Flight: ES548 from LHE → BAH (Day 1, Dep 545, Arr 1010)
Flight: ES783 from BAH → SIN (Day 1, Dep 1225, Arr 2040)

Path 2:
Source: DXB → SIN (Day 0, Time 18

# OLD

I have a schedule and demand for intercontinetal flights, now I want to develop a LP program that assigns optimally flow on some paths that i create. Hence I need to develop algorithms for finding such diverse paths to transport the goods in the network. But first I need a solid representation to base those calculations on. Fot that I want to construct a graph from data in schedule and demand files, whose snippets I ahve provided below.

In [14]:
# create a source vertex for each demand start - time specific
# create a sink vertex for each demand end - not time specific, we shall register last flight arrival as end time   
# create a vertex for each flight - time specific

# create an arc between every flight vertex if arr(f1)<dep(f2) - 60min, and des(f1)==ori(f2)
# create an arc from every source to every flight if dep(f1)>=start(s1) and ori(f1)==airport(s1)
# create an arc from every flight to sink if des(f1)==airport(sink)

# PROBLEMS:
# 1. A LOT OF UNUSED EDGES, WHICH ARE USELESS - DAY 1 FLIGHT CAN TRANISITION TO DAY 7 FLIGHT - STUPID, UNREALISTIC - 2 day time window??
# 2. CREATES LOTS OF POTENTIAL PATHS

In [12]:
import networkx as nx
from datetime import datetime, timedelta
from collections import defaultdict
import matplotlib.pyplot as plt 
import random

In [11]:
def build_time_expanded_network(schedule_df, demand_df, connection_window_hours=48):
    """
    Build a time-expanded network for flight scheduling optimization.
    
    Args:
        schedule_df: DataFrame with flight schedule information
        demand_df: DataFrame with demand information
        connection_window_hours: Maximum time window for connections in hours
    
    Returns:
        G: NetworkX DiGraph object representing the time-expanded network
    """
    # Initialize directed graph
    G = nx.DiGraph()
    
    # Create a helper function to calculate time difference considering day wrap
    def time_diff_minutes(day1, time1, day2, time2):
        """Calculate difference in minutes between two times on specific days"""
        # Convert times to minutes since beginning of week
        mins1 = day1 * 24 * 60 + (time1 // 100) * 60 + (time1 % 100)
        mins2 = day2 * 24 * 60 + (time2 // 100) * 60 + (time2 % 100)
        
        # Handle week wrap (if needed)
        if mins2 < mins1:
            mins2 += 7 * 24 * 60  # Add a week's worth of minutes
            
        return mins2 - mins1
    
    # Calculate maximum connection window in minutes
    max_connection_minutes = connection_window_hours * 60
    
    # Step 1: Create flight vertices
    flight_nodes = []
    for _, flight in schedule_df.iterrows():
        flight_node = f"FLIGHT_{flight['flight_number']}_{flight['day']}_{flight['dep_time']}"
        flight_nodes.append(flight_node)
        G.add_node(
            flight_node, 
            type='flight',
            flight_number=flight['flight_number'],
            origin=flight['ori'],
            destination=flight['des'],
            day=flight['day'],
            dep_time=flight['dep_time'],
            arr_time=flight['arr_time'],
            capacity_kg=flight['cap_kg'],
            capacity_m3=flight['cap_m3']
        )
    
    # Step 2: Create source vertices for each demand start - time specific
    source_nodes = []
    for _, demand in demand_df.iterrows():
        source_node = f"SOURCE_{demand['ori']}_{demand['des']}_{demand['day']}_{demand['time']}"
        source_nodes.append(source_node)
        G.add_node(
            source_node, 
            type='source',
            origin=demand['ori'],
            destination=demand['des'],
            day=demand['day'],
            time=demand['time'],
            demand=demand['demand']
        )
    
    # Step 3: Create sink vertices for each destination airport
    sink_nodes = []
    unique_destinations = demand_df['des'].unique()
    for dest in unique_destinations:
        sink_node = f"SINK_{dest}"
        sink_nodes.append(sink_node)
        G.add_node(sink_node, type='sink', airport=dest)
    
    # Step 4: Create arcs between flights (f1 to f2) if arr(f1) + 60min ≤ dep(f2) and des(f1) = ori(f2)
    for i, flight1 in schedule_df.iterrows():
        f1_node = f"FLIGHT_{flight1['flight_number']}_{flight1['day']}_{flight1['dep_time']}"
        
        for j, flight2 in schedule_df.iterrows():
            if i != j:  # Don't connect a flight to itself
                # Check if des(f1) == ori(f2)
                if flight1['des'] == flight2['ori']:
                    # Calculate time difference, allowing connections across days
                    time_diff = time_diff_minutes(
                        flight1['day'], flight1['arr_time'],
                        flight2['day'], flight2['dep_time']
                    )
                    
                    # Minimum connection time: 60 minutes
                    # Maximum connection time: connection_window_hours
                    if 60 <= time_diff <= max_connection_minutes:
                        f2_node = f"FLIGHT_{flight2['flight_number']}_{flight2['day']}_{flight2['dep_time']}"
                        G.add_edge(
                            f1_node, f2_node,
                            type='connection',
                            wait_time=time_diff
                        )
    
    # Step 5: Connect sources to flights
    for _, demand in demand_df.iterrows():
        source_node = f"SOURCE_{demand['ori']}_{demand['des']}_{demand['day']}_{demand['time']}"
        
        for _, flight in schedule_df.iterrows():
            # Check if flight departs from the demand origin
            if flight['ori'] == demand['ori']:
                # Check if flight departure time is after demand time
                time_diff = time_diff_minutes(
                    demand['day'], demand['time'],
                    flight['day'], flight['dep_time']
                )
                
                # Flight must depart after demand time, but within connection window
                if 0 <= time_diff <= max_connection_minutes:
                    flight_node = f"FLIGHT_{flight['flight_number']}_{flight['day']}_{flight['dep_time']}"
                    G.add_edge(
                        source_node, flight_node,
                        type='source_to_flight',
                        wait_time=time_diff
                    )
    
    # Step 6: Connect flights to sinks
    for _, flight in schedule_df.iterrows():
        flight_node = f"FLIGHT_{flight['flight_number']}_{flight['day']}_{flight['dep_time']}"
        sink_node = f"SINK_{flight['des']}"
        
        G.add_edge(
            flight_node, sink_node,
            type='flight_to_sink',
            arrival_time=flight['arr_time'],
            arrival_day=flight['day']
        )
    
    return G

def analyze_graph(G):
    """Analyze the graph structure and print useful statistics"""
    print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
    
    # Count nodes by type
    node_types = {}
    for node, data in G.nodes(data=True):
        node_type = data.get('type', 'unknown')
        node_types[node_type] = node_types.get(node_type, 0) + 1
    
    print("\nNode counts by type:")
    for node_type, count in node_types.items():
        print(f"  {node_type}: {count}")
    
    # Count edges by type
    edge_types = {}
    for u, v, data in G.edges(data=True):
        edge_type = data.get('type', 'unknown')
        edge_types[edge_type] = edge_types.get(edge_type, 0) + 1
    
    print("\nEdge counts by type:")
    for edge_type, count in edge_types.items():
        print(f"  {edge_type}: {count}")
        
    # Check for any sources that have no outgoing paths
    isolated_sources = 0
    for node, data in G.nodes(data=True):
        if data.get('type') == 'source' and G.out_degree(node) == 0:
            isolated_sources += 1
    
    if isolated_sources > 0:
        print(f"\nWARNING: {isolated_sources} source nodes have no outgoing paths")
    
    # Check for connectivity between sources and sinks
    sources = [n for n, d in G.nodes(data=True) if d.get('type') == 'source']
    sinks = [n for n, d in G.nodes(data=True) if d.get('type') == 'sink']
    
    # Sample a few paths to check connectivity
    print("\nChecking connectivity between sources and sinks...")
    sampled_sources = sources[:min(5, len(sources))]
    sampled_sinks = sinks[:min(5, len(sinks))]
    
    paths_found = 0
    paths_attempted = 0
    
    for source in sampled_sources:
        source_data = G.nodes[source]
        # Find sinks that match the demand destination
        target_sinks = [sink for sink in sinks if G.nodes[sink]['airport'] == source_data['destination']]
        
        if target_sinks:
            target_sink = target_sinks[0]
            paths_attempted += 1
            try:
                path = nx.shortest_path(G, source, target_sink)
                paths_found += 1
                print(f"  Path exists from {source} to {target_sink} with {len(path)-2} intermediate flights")
            except nx.NetworkXNoPath:
                print(f"  No path from {source} to {target_sink}")
    
    if paths_attempted > 0:
        print(f"\n{paths_found} out of {paths_attempted} sampled source-sink pairs have valid paths")
    
    return node_types, edge_types

def find_paths_for_demand(G, demand_row, max_paths=3):
    """Find multiple paths for a specific demand"""
    source_node = f"SOURCE_{demand_row['ori']}_{demand_row['des']}_{demand_row['day']}_{demand_row['time']}"
    sink_node = f"SINK_{demand_row['des']}"
    
    if source_node not in G or sink_node not in G:
        print(f"Source or sink node not found for demand {demand_row['key']}")
        return []
    
    paths = []
    try:
        # Try to find k simple paths
        for i, path in enumerate(nx.shortest_simple_paths(G, source_node, sink_node)):
            if i >= max_paths:
                break
            paths.append(path)
            
            # Print path details
            print(f"\nPath {i+1} for demand {demand_row['key']} (demand: {demand_row['demand']} kg):")
            for j in range(len(path)):
                node = path[j]
                node_data = G.nodes[node]
                
                if node_data['type'] == 'source':
                    print(f"  Source: {node_data['origin']} → {node_data['destination']} (Day {node_data['day']}, Time {node_data['time']})")
                elif node_data['type'] == 'flight':
                    print(f"  Flight: {node_data['flight_number']} from {node_data['origin']} → {node_data['destination']} (Day {node_data['day']}, Dep {node_data['dep_time']}, Arr {node_data['arr_time']})")
                elif node_data['type'] == 'sink':
                    print(f"  Sink: {node_data['airport']}")
                
                # If not the last node, print edge details
                if j < len(path) - 1:
                    edge_data = G.edges[path[j], path[j+1]]
                    if edge_data['type'] == 'connection':
                        print(f"    → Connection wait time: {edge_data['wait_time']} minutes")
                    elif edge_data['type'] == 'source_to_flight':
                        print(f"    → Wait time before flight: {edge_data['wait_time']} minutes")
    
    except nx.NetworkXNoPath:
        print(f"No path found for demand {demand_row['key']}")
    
    return paths

def main():
    # Read data
    schedule_df = read_capacity()
    demand_df = read_market()
    
    print(f"Loaded {len(schedule_df)} flights and {len(demand_df)} demand entries")
    
    # Build the time-expanded network with 48-hour connection window
    G = build_time_expanded_network(schedule_df, demand_df, connection_window_hours=48)
    
    # Analyze the graph
    node_types, edge_types = analyze_graph(G)
    
    # Sample a few demands and find paths for them
    sample_demands = demand_df.sample(min(3, len(demand_df)))
    for _, demand in sample_demands.iterrows():
        paths = find_paths_for_demand(G, demand, max_paths=2)
        print(f"Found {len(paths)} paths for demand {demand['key']}")
    
    # Optional: Save the graph for future use
    # import pickle
    # with open('flight_network.pickle', 'wb') as f:
    #     pickle.dump(G, f)
    
    return G, schedule_df, demand_df

if __name__ == "__main__":
    G, schedule_df, demand_df = main()

Loaded 1731 flights and 3689 demand entries
Graph has 5524 nodes and 95465 edges

Node counts by type:
  flight: 1731
  source: 3689
  sink: 80
  unknown: 24

Edge counts by type:
  connection: 42030
  flight_to_sink: 1731
  source_to_flight: 51704


Checking connectivity between sources and sinks...
  Path exists from SOURCE_DXB_SIN_0_1800 to SINK_SIN with 2 intermediate flights
  Path exists from SOURCE_DXB_EMA_0_1800 to SINK_EMA with 2 intermediate flights
  Path exists from SOURCE_DXB_CGN_0_1800 to SINK_CGN with 3 intermediate flights
  Path exists from SOURCE_CVG_CPH_0_1800 to SINK_CPH with 2 intermediate flights
  Path exists from SOURCE_BLR_VIT_0_1800 to SINK_VIT with 2 intermediate flights

5 out of 5 sampled source-sink pairs have valid paths

Path 1 for demand SZX/CGN/3/1800 (demand: 2175.79125 kg):
  Source: SZX → CGN (Day 3, Time 1800)
    → Wait time before flight: 790 minutes
  Flight: I98849 from SZX → LEJ (Day 4, Dep 710, Arr 1930)
    → Connection wait time: 2395 minut