In [1]:
import time
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import re
from pathlib import Path

In [2]:
def simulate_estimated_delay_backward(
    minutes_to_departure,
    final_delay,
    reveal_bias,
    noise_rng
):
    """Simulate the delay estimate visible a given number of minutes before departure.

    The estimate is reconstructed backwards from the known final delay: a
    fraction of it is "revealed" as departure approaches, plus a noise term
    that shrinks to zero at the planned departure time.

    Parameters
    ----------
    minutes_to_departure : number
        Minutes until planned departure; negative once that time has passed.
    final_delay : number
        The delay the flight ends up with (same unit as the returned value).
    reveal_bias : number
        Per-flight offset added to the reveal exponent before it is limited
        to the range [1.2, 5.0]; a larger exponent means a later reveal.
    noise_rng : object with ``normal(loc, scale)``
        Source of the noise term. Exactly one draw is taken whenever the
        early-return guard below does not trigger.

    Returns
    -------
    float
        Estimated delay, bounded to ``[0, final_delay]``. ``0.0`` when there
        is no final delay or departure is more than 360 minutes away.
    """
    # Nothing is shown for on-time flights or more than 6 hours ahead.
    if final_delay <= 0 or minutes_to_departure > 360:
        return 0.0

    # 0 at six hours out, 1 at (or after) planned departure.
    progress = np.clip(1 - (minutes_to_departure / 360), 0, 1)

    # Reveal exponent by delay size: small delays surface late, large ones early.
    base_power = 1.5
    for delay_ceiling, tier_power in ((15, 4.0), (30, 2.5)):
        if final_delay <= delay_ceiling:
            base_power = tier_power
            break
    reveal_power = np.clip(base_power + reveal_bias, 1.2, 5.0)

    expected = final_delay * (progress ** reveal_power)

    # Noise standard deviation fades linearly with progress and is capped at 3.
    noise_scale = (1 - progress) * min(3, final_delay * 0.1)
    estimate = expected + noise_rng.normal(0, noise_scale)

    # After planned departure the estimate is at least the time already elapsed.
    if minutes_to_departure < 0:
        estimate = max(estimate, abs(minutes_to_departure))

    # Final hard bounds: never negative, never above the true final delay.
    return float(np.clip(estimate, 0, final_delay))


In [None]:
def _stable_flight_seed(flight_num, planned_date):
    """Return a 32-bit RNG seed for a flight that is identical in every Python process.

    The built-in ``hash()`` is salted per interpreter run for str/bytes-based
    values, so it cannot be used for seeds that must survive a kernel restart.
    """
    import zlib  # local import so this block does not depend on the import cell

    key = f"{flight_num}|{planned_date.isoformat()}".encode("utf-8")
    return zlib.crc32(key)


def _estimate_delay_for_snapshot(flight_num, planned, final_delay, current_time):
    """Return the rounded delay estimate shown at ``current_time``, or None at/after planned departure."""
    minutes_to_departure = (planned - current_time).total_seconds() / 60

    # Only flights whose planned departure is still in the future get an estimate.
    if minutes_to_departure <= 0:
        return None

    seed = _stable_flight_seed(flight_num, planned.date())

    # Some flights reveal their delay earlier / later; fixed per flight and day.
    reveal_bias = np.random.default_rng(seed).normal(0, 0.4)

    # Re-seeded on every snapshot so the same underlying draw is used each
    # minute; only its scale changes as departure approaches.
    noise_rng = np.random.default_rng(seed + 1)

    est_delay = simulate_estimated_delay_backward(
        minutes_to_departure=minutes_to_departure,
        final_delay=final_delay,  # ground truth from the "Delay" column
        reveal_bias=reveal_bias,
        noise_rng=noise_rng
    )
    return round(est_delay, 0)


def save_daily_snapshots(df, output_dir="flights", output_file="flights_"):
    """Simulate minute-by-minute snapshots for one day and save them to a single CSV.

    The simulated day is the day after the latest ``<output_file>YYYY-MM-DD.csv``
    found in ``output_dir``. When no such file exists yet, the earliest planned
    departure date in ``df`` is used. For every minute of that day, the snapshot
    contains the flights whose planned departure is less than 6 hours ahead and
    whose actual departure is later than the simulation time or missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Planned_Dep_Timestamp`` and
        ``Actual_Dep_Timestamp`` (epoch seconds), ``Flight_Num`` and ``Delay``.
        The frame is not modified.
    output_dir : str or path-like
        Directory holding the daily files; created if missing.
    output_file : str
        File-name prefix placed before the date.

    Returns
    -------
    pandas.DataFrame or None
        All snapshots of the day with the added columns ``Simulation_Timestamp``
        and ``Estimated_Delay``, or None when nothing was visible that day.

    Notes
    -----
    When a day has no visible flights no file is written, so the next call
    simulates the same date again.
    """
    output_dir = Path(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    df = df.copy()
    df["Planned_DT"] = pd.to_datetime(df["Planned_Dep_Timestamp"], unit="s")
    df["Actual_DT"] = pd.to_datetime(df["Actual_Dep_Timestamp"], unit="s", errors="coerce")

    # Find the dates already simulated. The prefix is escaped so characters
    # such as "." or "(" in it are matched literally.
    date_pattern = re.compile(rf"{re.escape(output_file)}(\d{{4}}-\d{{2}}-\d{{2}})\.csv")
    existing_dates = []
    for file in output_dir.glob(f"{output_file}*.csv"):
        match = date_pattern.fullmatch(file.name)
        if match:
            existing_dates.append(datetime.strptime(match.group(1), "%Y-%m-%d").date())

    if existing_dates:
        next_date = max(existing_dates) + timedelta(days=1)
    else:
        # First run: nothing to resume from, start at the earliest planned departure.
        first_planned = df["Planned_DT"].min()
        if pd.isna(first_planned):
            print("No data to save")
            return None
        next_date = first_planned.date()
    print(next_date)

    start_date = datetime.combine(next_date, datetime.min.time())
    end_date = start_date + timedelta(days=1)
    snapshot_name = output_file + next_date.strftime('%Y-%m-%d') + ".csv"

    # Loop-invariant arrays for the per-minute visibility filter.
    planned_values = df["Planned_DT"].to_numpy()
    actual_values = df["Actual_DT"].to_numpy()
    never_departed = df["Actual_DT"].isna().to_numpy()
    lookahead = timedelta(hours=6)

    all_snapshots = []
    current_time = start_date

    # Simulate every minute between start and end dates.
    while current_time < end_date:
        # Visible flights: planned departure < now + 6h, and not yet departed
        # (actual departure in the future or unknown). Comparisons against NaT
        # evaluate to False.
        visible = (planned_values < np.datetime64(current_time + lookahead)) & (
            (actual_values > np.datetime64(current_time)) | never_departed
        )

        if visible.any():
            # Filter first, then copy: only the visible rows are duplicated.
            snapshot = df.loc[visible].copy()
            snapshot["Simulation_Timestamp"] = current_time

            estimates = [
                _estimate_delay_for_snapshot(flight_num, planned, final_delay, current_time)
                for flight_num, planned, final_delay in zip(
                    snapshot["Flight_Num"], snapshot["Planned_DT"], snapshot["Delay"]
                )
            ]
            # Object dtype keeps missing estimates as None rather than NaN.
            snapshot["Estimated_Delay"] = pd.Series(
                estimates, index=snapshot.index, dtype=object
            )

            # Drop helper columns before saving.
            all_snapshots.append(snapshot.drop(columns=["Planned_DT", "Actual_DT"]))

        # Move to next minute.
        current_time += timedelta(minutes=1)

        if current_time.hour == 0 and current_time.minute == 0:
            print(f"Processing: {current_time.strftime('%Y-%m-%d')}")

    if not all_snapshots:
        print("No data to save")
        return None

    # Combine all snapshots into a single DataFrame and save it as one CSV file.
    combined_df = pd.concat(all_snapshots, ignore_index=True)
    filepath = os.path.join(output_dir, snapshot_name)
    combined_df.to_csv(filepath, index=False)
    print(f"Saved all snapshots to: {filepath}")
    print(f"Total rows: {len(combined_df)}")
    print(f"Time range: {start_date} to {end_date}")

    return combined_df

In [None]:
# Input CSV location. Set the FLIGHTS_CSV environment variable to run the
# notebook on another machine; the fallback is the original author's local path.
flights_csv_path = os.environ.get(
    "FLIGHTS_CSV",
    "C:\\Users\\Dell\\Documents\\studia\\Big Data MS\\flights_final.csv",
)
df = pd.read_csv(flights_csv_path)

In [8]:
# Each call simulates a single day (the one after the latest saved file),
# so 20 calls advance the saved history by up to 20 days.
for _ in range(20):
    save_daily_snapshots(df)

2024-11-03
Processing: 2024-11-04
Saved all snapshots to: data_historic\daily\flights_2024-11-03.csv
Total rows: 192907
Time range: 2024-11-03 00:00:00 to 2024-11-04 00:00:00
2024-11-04
Processing: 2024-11-05
Saved all snapshots to: data_historic\daily\flights_2024-11-04.csv
Total rows: 201183
Time range: 2024-11-04 00:00:00 to 2024-11-05 00:00:00
2024-11-05
Processing: 2024-11-06
Saved all snapshots to: data_historic\daily\flights_2024-11-05.csv
Total rows: 176880
Time range: 2024-11-05 00:00:00 to 2024-11-06 00:00:00
2024-11-06
Processing: 2024-11-07
Saved all snapshots to: data_historic\daily\flights_2024-11-06.csv
Total rows: 199352
Time range: 2024-11-06 00:00:00 to 2024-11-07 00:00:00
2024-11-07
Processing: 2024-11-08
Saved all snapshots to: data_historic\daily\flights_2024-11-07.csv
Total rows: 220071
Time range: 2024-11-07 00:00:00 to 2024-11-08 00:00:00
2024-11-08
Processing: 2024-11-09
Saved all snapshots to: data_historic\daily\flights_2024-11-08.csv
Total rows: 215856
Time 