# In [None]:
import os
import glob
import time
import zipfile
import requests
import urllib3
import pandas as pd
import numpy as np

# =============================================================================
# CONFIGURATION
# =============================================================================
# Silence the InsecureRequestWarning that urllib3 would otherwise emit for
# HTTPS requests made without certificate verification.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Dynamic Paths
# `__file__` is not defined when this code runs in a notebook cell or an
# interactive session; fall back to the current working directory there.
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, '..', 'data_raw')           # raw inputs
OUTPUT_DIR = os.path.join(BASE_DIR, '..', 'data_processed')   # Parquet outputs

# Ensure directories exist
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Reporting period: every month of YEAR.
YEAR = 2024
MONTHS = range(1, 13)

# =============================================================================
# STEP 1: INGESTION (BTS Download)
# =============================================================================
def download_flight_data():
    """Download the monthly BTS on-time performance ZIPs for ``YEAR``.

    For every month in ``MONTHS`` the archive is saved as
    ``DATA_DIR/flights_{YEAR}_{month}.zip``. Months whose archive already
    exists are skipped. Each download is attempted up to three times, with a
    3-second pause before every request and a further 5-second pause after a
    network or file error.

    The payload is streamed into a ``.part`` file that is renamed to the
    final name only once the transfer has completed, so an interrupted
    download is never mistaken for a finished archive on the next run. A
    month that cannot be fetched (an error or a non-200 response on every
    attempt) is reported as ``Failed.`` and the loop moves on to the next
    month.

    Returns:
        None. Progress is printed to stdout.
    """
    base_url = "https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_{}_{}.zip"
    max_attempts = 3
    print(f"\n--- [STEP 1] Starting Ingestion for {YEAR} ---")

    for month in MONTHS:
        file_path = os.path.join(DATA_DIR, f"flights_{YEAR}_{month}.zip")
        partial_path = f"{file_path}.part"
        url = base_url.format(YEAR, month)

        if os.path.exists(file_path):
            print(f"Skipping {month}/{YEAR} (Exists)")
            continue

        print(f"Downloading {month}/{YEAR}...", end=" ")
        downloaded = False
        for _ in range(max_attempts):
            time.sleep(3)  # pause before every request to avoid hammering the server
            try:
                # NOTE(review): verify=False turns off TLS certificate checks.
                # It is kept to preserve existing behavior; confirm it is
                # still required for this host.
                with requests.get(url, verify=False, stream=True, timeout=60) as r:
                    if r.status_code != 200:
                        continue  # leaving the `with` closes the response
                    with open(partial_path, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=64 * 1024):
                            f.write(chunk)
                # Publish the archive only after the full body is on disk.
                os.replace(partial_path, file_path)
                downloaded = True
                break
            except (requests.RequestException, OSError):
                time.sleep(5)

        if downloaded:
            print("Success.")
        else:
            # Do not leave a truncated transfer behind.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            print("Failed.")

# =============================================================================
# STEP 2: PROCESSING & FINANCIAL MODELING
# =============================================================================
def process_and_model_data():
    """Build the flight fact table and its dimensions from the raw ZIPs.

    1. Extracts the CSV from every ``flights_{YEAR}_*.zip`` in ``DATA_DIR``
       and concatenates them into one frame.
    2. Cleans 'Mixed Type' columns: text columns get "" for missing values,
       delay-cause columns get 0.0.
    3. Calculates Financial Liability (Vectorized).
    4. Generates Star Schema (Fact + Dimensions) as Parquet files in
       ``OUTPUT_DIR``.

    Archives are read in sorted filename order so the row order of the
    output is reproducible. An archive that contains no ``.csv`` member is
    reported and skipped. If nothing can be loaded the function prints a
    message and returns without writing any file. The dimension tables are
    written without the leftover pandas index.

    Returns:
        None. Results are written to ``OUTPUT_DIR``; progress is printed to
        stdout.
    """
    print("\n--- [STEP 2] Processing & Financial Modeling ---")

    # A. Load Raw Data
    zip_files = sorted(glob.glob(os.path.join(DATA_DIR, f"flights_{YEAR}_*.zip")))
    if not zip_files:
        print(f"No raw archives found in {DATA_DIR}. Skipping Step 2.")
        return

    dataframes = []
    for z_file in zip_files:
        with zipfile.ZipFile(z_file, 'r') as z:
            csv_name = next((name for name in z.namelist() if name.endswith('.csv')), None)
            if csv_name is None:
                print(f"WARNING: No CSV found in {z_file}. Skipping archive.")
                continue
            with z.open(csv_name) as f:
                dataframes.append(pd.read_csv(f, low_memory=False))

    if not dataframes:
        print("No CSV data could be loaded. Skipping Step 2.")
        return

    full_df = pd.concat(dataframes, ignore_index=True)
    print(f"Raw Data Loaded: {len(full_df):,} rows.")

    # B. Clean Mixed Types (The "Zero-Fill" Strategy)
    text_cols = ['CancellationCode', 'Div1Airport', 'Div1TailNum', 'Div2Airport', 'Div2TailNum']
    for col in text_cols:
        if col in full_df.columns:
            full_df[col] = full_df[col].fillna("").astype(str)

    delay_causes = ['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']
    full_df[delay_causes] = full_df[delay_causes].fillna(0.0)

    # C. Financial Modeling (Vectorized)
    print("Calculating Liability Metrics...")
    # Cost = $75 per minute of arrival delay; early arrivals (negative delay)
    # and missing delays contribute 0.
    full_df['Estimated_Cost'] = full_df['ArrDelay'].clip(lower=0).fillna(0) * 75

    # MRL = ($250/hr flight time) + ($180 per cycle)
    # NOTE(review): rows with a missing AirTime still accrue the flat 180
    # cycle charge -- confirm that is intended (e.g. for cancelled flights).
    full_df['MRL_Liability'] = ((full_df['AirTime'].fillna(0) / 60) * 250) + 180

    # D. Extract Dimensions (Star Schema)
    print("Extracting Dimensions...")

    # Dim: Airline
    dim_airlines = full_df[
        ['Reporting_Airline', 'DOT_ID_Reporting_Airline', 'IATA_CODE_Reporting_Airline']
    ].drop_duplicates()
    # index=False: after drop_duplicates the index is just leftover row
    # positions from the fact data and would be stored as an extra column.
    dim_airlines.to_parquet(os.path.join(OUTPUT_DIR, 'Aviation_Airline_Dim.parquet'), index=False)

    # Dim: Geography (every airport seen as an origin or a destination)
    geo_origin = full_df[['Origin', 'OriginCityName', 'OriginStateName']].rename(
        columns={'Origin': 'AirportCode', 'OriginCityName': 'City', 'OriginStateName': 'State'})
    geo_dest = full_df[['Dest', 'DestCityName', 'DestStateName']].rename(
        columns={'Dest': 'AirportCode', 'DestCityName': 'City', 'DestStateName': 'State'})
    dim_geo = pd.concat([geo_origin, geo_dest]).drop_duplicates()
    dim_geo.to_parquet(os.path.join(OUTPUT_DIR, 'Aviation_Geo_Dim.parquet'), index=False)

    # E. Export Optimized Fact Table
    fact_cols = [
        'FlightDate', 'Tail_Number', 'Origin', 'Dest', 'Reporting_Airline',
        'DepDelay', 'ArrDelay', 'AirTime', 'Distance', 'Cancelled', 'Diverted',
        'Estimated_Cost', 'MRL_Liability',
        'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
    ]
    full_df[fact_cols].to_parquet(os.path.join(OUTPUT_DIR, 'Aviation_Fact_Table.parquet'), compression='snappy')

    print("SUCCESS: Fact Table & Dimensions Exported.")
    print(f"Total Portfolio Liability: ${full_df['MRL_Liability'].sum():,.2f}")

# =============================================================================
# STEP 3: ASSET AUDIT (Master Dim)
# =============================================================================
def build_master_audit():
    """Check for Ghost Aircraft and build Master_Dim.

    Reads ``MASTER.txt`` and ``ACFTREF.txt`` from ``DATA_DIR`` and the
    ``Tail_Number`` column of ``Aviation_Fact_Table.parquet`` from
    ``OUTPUT_DIR``. Tail numbers and registry N-numbers are normalized to
    the same form (stripped, upper-cased, 'N'-prefixed), the count of flown
    tail numbers missing from the registry is printed, and the registry rows
    for the flown aircraft -- joined to the reference table on
    'MFR MDL CODE' = 'CODE' -- are written to
    ``OUTPUT_DIR/Master_Dim.parquet``.

    If any of the three input files is missing, a CRITICAL message is
    printed and the step is skipped.

    Returns:
        None. Progress and the audit result are printed to stdout.
    """
    print("\n--- [STEP 3] Asset Audit ---")
    master_path = os.path.join(DATA_DIR, 'MASTER.txt')
    ref_path = os.path.join(DATA_DIR, 'ACFTREF.txt')
    fact_path = os.path.join(OUTPUT_DIR, 'Aviation_Fact_Table.parquet')

    # All three inputs are required; bail out with the same message style
    # for whichever one is missing first.
    for required in (master_path, ref_path, fact_path):
        if not os.path.exists(required):
            print(f"CRITICAL: {os.path.basename(required)} not found. Skipping Step 3.")
            return

    # 1. Load Data (everything as text so codes keep their exact form)
    master = pd.read_csv(master_path, low_memory=False, dtype=str)
    ref = pd.read_csv(ref_path, low_memory=False, dtype=str)
    fact_tails = pd.read_parquet(fact_path, columns=['Tail_Number'])

    # 2. Standardize Logic
    def clean_n(val):
        """Return val stripped, upper-cased and prefixed with 'N' if needed."""
        s = str(val).strip().upper()
        return s if s.startswith('N') else f'N{s}'

    print("Standardizing Keys...")
    # Normalize only the distinct tail numbers: the fact table repeats each
    # aircraft once per flight. Blank values are not aircraft, so they are
    # left out instead of turning into a bogus 'N' key.
    distinct_tails = fact_tails['Tail_Number'].dropna().unique()
    active_fleet = {clean_n(tail) for tail in distinct_tails if str(tail).strip()}
    master['N-NUMBER_CLEAN'] = master['N-NUMBER'].apply(clean_n)

    # 3. Audit
    registered = set(master['N-NUMBER_CLEAN'])
    orphans = active_fleet - registered
    print(f"AUDIT RESULT: Found {len(orphans)} unregistered 'Ghost' aircraft.")

    # 4. Merge & Export (Preserving N-NUMBER name)
    # Filter to the flown aircraft first so the join only touches the rows
    # that are exported; a left join keeps exactly those rows either way.
    fleet = master[master['N-NUMBER_CLEAN'].isin(active_fleet)]
    fleet = pd.merge(fleet, ref, left_on='MFR MDL CODE', right_on='CODE', how='left')

    # Export the cleaned key under the original column name 'N-NUMBER' so it
    # matches the normalized fact-table tail numbers without renaming the
    # schema column.
    fleet['N-NUMBER'] = fleet['N-NUMBER_CLEAN']

    cols = ['N-NUMBER', 'MFR', 'MODEL', 'YEAR MFR', 'TYPE AIRCRAFT']

    output_path = os.path.join(OUTPUT_DIR, 'Master_Dim.parquet')
    fleet[cols].to_parquet(output_path, index=False)
    print(f"SUCCESS: Master Dimension exported to {output_path}")
   
# =============================================================================
# MAIN
# =============================================================================
if __name__ == "__main__":
    # Step 1 is opt-in because it downloads from the network. Uncomment to
    # fetch any monthly archives that are not already in DATA_DIR:
    # download_flight_data()

    # Order matters: Step 2 writes the fact table that Step 3 reads back.
    process_and_model_data()
    build_master_audit()
