## 1. Setup & Imports

In [2]:
# Data manipulation
import pandas as pd
import numpy as np
from pathlib import Path
import warnings

# Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

# Utilities
import pickle
from datetime import datetime

# Notebook-wide configuration.
# Seed NumPy's legacy global RNG so np.random calls made later are repeatable.
np.random.seed(42)
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# NOTE(review): this silences every warning category, including deprecation
# notices raised by pandas / scikit-learn - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Folder (relative to the working directory) that receives every artifact
# written by the cells below; created on first run, reused afterwards.
OUTPUT_DIR = Path('processed_data')
OUTPUT_DIR.mkdir(exist_ok=True)

print("‚úÖ Setup complete!")

‚úÖ Setup complete!


## 2. Load Raw Datasets

In [3]:
# Folder holding the raw CSV exports, relative to the working directory.
DATA_DIR = Path('dataset')

print("Loading datasets...\n")


def _read_raw(csv_name):
    """Read one CSV from DATA_DIR with pandas' default parsing options."""
    return pd.read_csv(DATA_DIR / csv_name)


# One DataFrame per source file; the names below are used by later cells.
df_delays = _read_raw('Airline_Delay_Cause.csv')
df_pricing = _read_raw('airlines_flights_data.csv')
df_passengers = _read_raw('monthly_passengers.csv')
df_airports = _read_raw('airports.csv')
df_airlines = _read_raw('airlines.csv')
df_holidays = _read_raw('global_holidays.csv')
df_weather = _read_raw('GlobalWeatherRepository.csv')

# Row counts per table (df_airlines is loaded above but not reported here).
print(f"‚úÖ Loaded {len(df_delays):,} delay records")
print(f"‚úÖ Loaded {len(df_pricing):,} pricing records")
print(f"‚úÖ Loaded {len(df_passengers):,} passenger records")
print(f"‚úÖ Loaded {len(df_airports):,} airports")
print(f"‚úÖ Loaded {len(df_holidays):,} holiday records")
print(f"‚úÖ Loaded {len(df_weather):,} weather records")

Loading datasets...

‚úÖ Loaded 171,666 delay records
‚úÖ Loaded 300,153 pricing records
‚úÖ Loaded 7,242 passenger records
‚úÖ Loaded 322 airports
‚úÖ Loaded 44,393 holiday records
‚úÖ Loaded 107,963 weather records
‚úÖ Loaded 171,666 delay records
‚úÖ Loaded 300,153 pricing records
‚úÖ Loaded 7,242 passenger records
‚úÖ Loaded 322 airports
‚úÖ Loaded 44,393 holiday records
‚úÖ Loaded 107,963 weather records


## 3. Project 1: Delay Prediction Dataset

### 3.1 Feature Engineering

In [4]:
print("=" * 80)
print("DELAY PREDICTION - FEATURE ENGINEERING")
print("=" * 80)


def _percent_of_flights(frame, count_col):
    """Return ``count_col`` as a percentage of ``arr_flights``.

    Rows where the ratio is undefined (missing inputs or zero flights, which
    yields NaN or +/-inf) are reported as 0.
    """
    ratio = (frame[count_col] / frame['arr_flights']) * 100
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


def _lagged_monthly_history(frame, key, value_col='delay_rate', window=3):
    """Mean of ``value_col`` for ``key`` over the preceding observed months.

    The frame holds one row per (year, month, carrier, airport), so rolling
    directly over rows would average neighbouring rows of the *same* month
    (and the row's own value) instead of past months. This helper therefore:

    1. averages ``value_col`` per (``key``, year, month) - an unweighted mean
       of the row-level values;
    2. shifts by one month inside each ``key`` so the current month is
       excluded, then takes a rolling mean over up to ``window`` months
       (consecutive *observed* months; calendar gaps are not re-inserted);
    3. maps the result back onto the original rows.

    Returns a Series aligned to ``frame.index``. Rows without any earlier
    month for their ``key`` get 0.
    """
    monthly = frame.groupby([key, 'year', 'month'], sort=True)[value_col].mean()
    lagged = monthly.groupby(level=key).transform(
        lambda s: s.shift(1).rolling(window=window, min_periods=1).mean()
    )
    joined = frame[[key, 'year', 'month']].join(
        lagged.rename('_history'), on=[key, 'year', 'month']
    )
    return joined['_history'].fillna(0)


# Work on a copy so the raw frame stays untouched
df_delay_model = df_delays.copy()

# 1. Binary delay target: 1 when the row reports at least one arrival delayed
#    15+ minutes (arr_del15 > 0); rows with a missing count become 0.
df_delay_model['is_delayed'] = (df_delay_model['arr_del15'] > 0).astype(int)

# 2. Delay rate (% of arriving flights delayed)
df_delay_model['delay_rate'] = _percent_of_flights(df_delay_model, 'arr_del15')

# 3. Cancellation rate (% of arriving flights cancelled)
df_delay_model['cancel_rate'] = _percent_of_flights(df_delay_model, 'arr_cancelled')

# 4. Temporal features (cyclic encoding of the month number)
df_delay_model['month_sin'] = np.sin(2 * np.pi * df_delay_model['month'] / 12)
df_delay_model['month_cos'] = np.cos(2 * np.pi * df_delay_model['month'] / 12)

# 5. Dominant delay cause (multi-class target)
delay_cause_cols = ['carrier_ct', 'weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct']
# Treat missing counts as 0 first: idxmax over an all-NaN row is deprecated in
# pandas 2.x and scheduled to raise. Such rows are relabelled 'none' below.
cause_counts = df_delay_model[delay_cause_cols].fillna(0)
df_delay_model['dominant_cause'] = (
    cause_counts.idxmax(axis=1).str.replace('_ct', '', regex=False)
)

# Rows with no recorded delay cause at all (all zeros / all missing)
no_delays = cause_counts.sum(axis=1) == 0
df_delay_model.loc[no_delays, 'dominant_cause'] = 'none'

# 6. Historical carrier performance: mean delay rate over the carrier's
#    previous 3 observed months (current month excluded).
#    The sort is kept so downstream cells see the same row order as before.
df_delay_model = df_delay_model.sort_values(['carrier', 'year', 'month'])
df_delay_model['carrier_delay_history'] = _lagged_monthly_history(df_delay_model, 'carrier')

# 7. Historical airport performance: same definition, grouped by airport
df_delay_model['airport_delay_history'] = _lagged_monthly_history(df_delay_model, 'airport')

print(f"\n‚úÖ Created {df_delay_model.shape[1]} features")
print(f"‚úÖ Binary delay distribution: {df_delay_model['is_delayed'].value_counts().to_dict()}")
print(f"‚úÖ Multi-class causes: {df_delay_model['dominant_cause'].value_counts().to_dict()}")

DELAY PREDICTION - FEATURE ENGINEERING

‚úÖ Created 29 features
‚úÖ Binary delay distribution: {1: 164638, 0: 7028}
‚úÖ Multi-class causes: {'carrier': 71816, 'late_aircraft': 52115, 'nas': 39106, 'none': 7028, 'weather': 1560, 'security': 41}

‚úÖ Created 29 features
‚úÖ Binary delay distribution: {1: 164638, 0: 7028}
‚úÖ Multi-class causes: {'carrier': 71816, 'late_aircraft': 52115, 'nas': 39106, 'none': 7028, 'weather': 1560, 'security': 41}


### 3.2 Handle Missing Values & Encoding

In [5]:
# Report which columns still contain nulls before anything is filled.
print("\nüîç Missing Values Before Processing:")
missing = df_delay_model.isnull().sum()
columns_with_nulls = missing[missing > 0]
if columns_with_nulls.empty:
    print("‚úÖ No missing values!")
else:
    print(columns_with_nulls)

# Replace remaining nulls in every numeric column with 0.
numeric_cols = df_delay_model.select_dtypes(include='number').columns
df_delay_model[numeric_cols] = df_delay_model[numeric_cols].fillna(0)

# Integer-encode the categorical columns; each encoder is fitted on the full
# column and kept so the mapping can be inverted later.
le_carrier, le_airport, le_cause = LabelEncoder(), LabelEncoder(), LabelEncoder()
for encoder, source_col, encoded_col in (
    (le_carrier, 'carrier', 'carrier_encoded'),
    (le_airport, 'airport', 'airport_encoded'),
    (le_cause, 'dominant_cause', 'cause_encoded'),
):
    df_delay_model[encoded_col] = encoder.fit_transform(df_delay_model[source_col])

# Persist the fitted encoders next to the processed data.
delay_encoders = {
    'carrier': le_carrier,
    'airport': le_airport,
    'cause': le_cause
}
with open(OUTPUT_DIR / 'delay_encoders.pkl', 'wb') as f:
    pickle.dump(delay_encoders, f)

print("\n‚úÖ Encoding complete!")
print(f"  - Carriers: {len(le_carrier.classes_)} unique")
print(f"  - Airports: {len(le_airport.classes_)} unique")
print(f"  - Delay Causes: {le_cause.classes_.tolist()}")


üîç Missing Values Before Processing:
arr_flights            240
arr_del15              443
carrier_ct             240
weather_ct             240
nas_ct                 240
security_ct            240
late_aircraft_ct       240
arr_cancelled          240
arr_diverted           240
arr_delay              240
carrier_delay          240
weather_delay          240
nas_delay              240
security_delay         240
late_aircraft_delay    240
dtype: int64

‚úÖ Encoding complete!
  - Carriers: 21 unique
  - Airports: 395 unique
  - Delay Causes: ['carrier', 'late_aircraft', 'nas', 'none', 'security', 'weather']


### 3.3 Train/Val/Test Split & Normalization

In [6]:
# Select features for modeling
# NOTE(review): several inputs are computed from the same columns as the
# targets - 'delay_rate' is arr_del15 / arr_flights while is_delayed is
# arr_del15 > 0, and the '*_ct' counts are exactly what dominant_cause /
# cause_encoded is the arg-max of. Kept here so the saved feature set is
# unchanged, but confirm they are legitimately available at prediction time.
feature_cols = [
    'year', 'month', 'month_sin', 'month_cos',
    'carrier_encoded', 'airport_encoded',
    'arr_flights', 'delay_rate', 'cancel_rate',
    'carrier_delay_history', 'airport_delay_history',
    'carrier_ct', 'weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct'
]

target_binary = 'is_delayed'
target_multiclass = 'cause_encoded'
target_regression = 'arr_delay'


def _three_way_split(data, first_cut, second_cut):
    """Slice ``data`` positionally into (train, val, test) at the two cut points."""
    return data.iloc[:first_cut], data.iloc[first_cut:second_cut], data.iloc[second_cut:]


# df_delay_model is ordered by carrier first, so slicing it positionally would
# separate carriers rather than time periods. Re-order chronologically so the
# split below really is time-based (ties within a month keep their order).
delay_chrono = df_delay_model.sort_values(['year', 'month'], kind='stable')

# Create feature matrix and the three targets from the chronological frame
X = delay_chrono[feature_cols].copy()
y_binary = delay_chrono[target_binary].copy()
y_multiclass = delay_chrono[target_multiclass].copy()
y_regression = delay_chrono[target_regression].fillna(0).copy()

# Time-based split: first 70% of rows train, next 15% validation, last 15% test.
# The cut is positional, so the month at each boundary may straddle two splits.
n = len(X)
train_end = int(0.7 * n)
val_end = int(0.85 * n)

X_train, X_val, X_test = _three_way_split(X, train_end, val_end)
y_binary_train, y_binary_val, y_binary_test = _three_way_split(y_binary, train_end, val_end)
y_multi_train, y_multi_val, y_multi_test = _three_way_split(y_multiclass, train_end, val_end)
y_reg_train, y_reg_val, y_reg_test = _three_way_split(y_regression, train_end, val_end)

print(f"\nüìä Split Sizes:")
print(f"  Train: {len(X_train):,} ({len(X_train)/n*100:.1f}%)")
print(f"  Val:   {len(X_val):,} ({len(X_val)/n*100:.1f}%)")
print(f"  Test:  {len(X_test):,} ({len(X_test)/n*100:.1f}%)")

# Normalize features: statistics come from the training split only and are
# then applied unchanged to validation and test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Save scaler
with open(OUTPUT_DIR / 'delay_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("\n‚úÖ Normalization complete!")


üìä Split Sizes:
  Train: 120,166 (70.0%)
  Val:   25,750 (15.0%)
  Test:  25,750 (15.0%)

‚úÖ Normalization complete!


### 3.4 Save Processed Delay Dataset

In [7]:
# Write every split to its own .npy file. Features are already NumPy arrays
# (scaler output); targets are pandas Series and are converted via .values.
delay_arrays = {
    'delay_X_train.npy': X_train_scaled,
    'delay_X_val.npy': X_val_scaled,
    'delay_X_test.npy': X_test_scaled,
    'delay_y_binary_train.npy': y_binary_train.values,
    'delay_y_binary_val.npy': y_binary_val.values,
    'delay_y_binary_test.npy': y_binary_test.values,
    'delay_y_multi_train.npy': y_multi_train.values,
    'delay_y_multi_val.npy': y_multi_val.values,
    'delay_y_multi_test.npy': y_multi_test.values,
    'delay_y_reg_train.npy': y_reg_train.values,
    'delay_y_reg_val.npy': y_reg_val.values,
    'delay_y_reg_test.npy': y_reg_test.values,
}
for file_name, array in delay_arrays.items():
    np.save(OUTPUT_DIR / file_name, array)

# Column order of the feature matrices, for whoever loads the arrays later.
with open(OUTPUT_DIR / 'delay_feature_names.pkl', 'wb') as f:
    pickle.dump(feature_cols, f)

print("\n‚úÖ Delay prediction dataset saved to processed_data/")
print(f"  - Features: {len(feature_cols)}")
print(f"  - Binary classes: {np.unique(y_binary_train).tolist()}")
print(f"  - Multi-classes: {len(le_cause.classes_)}")


‚úÖ Delay prediction dataset saved to processed_data/
  - Features: 16
  - Binary classes: [0, 1]
  - Multi-classes: 6


## 4. Project 2: Price Prediction Dataset

### 4.1 Feature Engineering

In [8]:
print("\n" + "=" * 80)
print("PRICE PREDICTION - FEATURE ENGINEERING")
print("=" * 80)

# Work on a copy so the raw pricing frame is left as loaded.
df_price_model = df_pricing.copy()

# 1. Numeric part of the flight code: first run of digits, stored as float.
df_price_model['flight_number'] = (
    df_price_model['flight'].str.extract(r'(\d+)', expand=False).astype(float)
)

# 2. Route label "<source>_<destination>".
origin_city = df_price_model['source_city']
df_price_model['route'] = origin_city + '_' + df_price_model['destination_city']

# 3. Ordinal code for the time-of-day buckets (labels missing from the
#    mapping become NaN).
time_mapping = {
    slot: code
    for code, slot in enumerate(
        ['Early_Morning', 'Morning', 'Afternoon', 'Evening', 'Night', 'Late_Night']
    )
}
df_price_model['departure_time_encoded'] = df_price_model['departure_time'].map(time_mapping)
df_price_model['arrival_time_encoded'] = df_price_model['arrival_time'].map(time_mapping)

# 4. Cyclic encoding of the departure bucket (period = 6 buckets).
departure_angle = 2 * np.pi * df_price_model['departure_time_encoded'] / 6
df_price_model['departure_sin'] = np.sin(departure_angle)
df_price_model['departure_cos'] = np.cos(departure_angle)

# 5. Price divided by flight duration.
df_price_model['price_per_hour'] = df_price_model['price'].div(df_price_model['duration'])

# 6. Booking urgency: 1 / (days_left + 1), so fewer days left -> larger value.
df_price_model['urgency'] = 1 / df_price_model['days_left'].add(1)

# 7. Number of stops as an integer code.
stops_mapping = {label: code for code, label in enumerate(['zero', 'one', 'two_or_more'])}
df_price_model['stops_encoded'] = df_price_model['stops'].map(stops_mapping)

# 8. Cabin class flag: 1 for 'Business', 0 for anything else.
df_price_model['class_encoded'] = df_price_model['class'].eq('Business').astype(int)

# 9. Mean price of each airline, broadcast back onto its rows.
airline_avg_price = df_price_model['price'].groupby(df_price_model['airline']).mean()
df_price_model['airline_avg_price'] = df_price_model['airline'].map(airline_avg_price)

# 10. Mean price of each route, broadcast back onto its rows.
route_avg_price = df_price_model['price'].groupby(df_price_model['route']).mean()
df_price_model['route_avg_price'] = df_price_model['route'].map(route_avg_price)

print(f"\n‚úÖ Created {df_price_model.shape[1]} features")
print(f"‚úÖ Price range: ‚Çπ{df_price_model['price'].min():,.0f} - ‚Çπ{df_price_model['price'].max():,.0f}")
print(f"‚úÖ Routes: {df_price_model['route'].nunique()} unique")


PRICE PREDICTION - FEATURE ENGINEERING

‚úÖ Created 24 features
‚úÖ Price range: ‚Çπ1,105 - ‚Çπ123,071
‚úÖ Routes: 30 unique

‚úÖ Created 24 features
‚úÖ Price range: ‚Çπ1,105 - ‚Çπ123,071
‚úÖ Routes: 30 unique


### 4.2 Encoding & Normalization

In [9]:
# One label encoder per categorical column; each is fitted on the full column
# and kept so the integer codes can be mapped back to labels later.
le_airline_price, le_source, le_dest, le_route = (
    LabelEncoder(), LabelEncoder(), LabelEncoder(), LabelEncoder()
)
for encoder, source_col, encoded_col in (
    (le_airline_price, 'airline', 'airline_encoded'),
    (le_source, 'source_city', 'source_encoded'),
    (le_dest, 'destination_city', 'dest_encoded'),
    (le_route, 'route', 'route_encoded'),
):
    df_price_model[encoded_col] = encoder.fit_transform(df_price_model[source_col])

# Persist the fitted encoders alongside the processed data.
price_encoders = {
    'airline': le_airline_price,
    'source': le_source,
    'destination': le_dest,
    'route': le_route
}
with open(OUTPUT_DIR / 'price_encoders.pkl', 'wb') as f:
    pickle.dump(price_encoders, f)

print("‚úÖ Encoding complete!")
print(f"  - Airlines: {len(le_airline_price.classes_)}")
print(f"  - Routes: {len(le_route.classes_)}")

‚úÖ Encoding complete!
  - Airlines: 6
  - Routes: 30


### 4.3 Train/Val/Test Split

In [10]:
# Select features
price_features = [
    'airline_encoded', 'source_encoded', 'dest_encoded', 'route_encoded',
    'departure_time_encoded', 'arrival_time_encoded',
    'departure_sin', 'departure_cos',
    'stops_encoded', 'class_encoded',
    'duration', 'days_left', 'urgency',
    'airline_avg_price', 'route_avg_price'
]

X_price = df_price_model[price_features].copy()
y_price = df_price_model['price'].copy()

# Random split (since no temporal component in this dataset): 70% train, then
# the remaining 30% is halved into validation and test.
X_price_train, X_price_temp, y_price_train, y_price_temp = train_test_split(
    X_price, y_price, test_size=0.3, random_state=42
)
X_price_val, X_price_test, y_price_val, y_price_test = train_test_split(
    X_price_temp, y_price_temp, test_size=0.5, random_state=42
)

# 'airline_avg_price' and 'route_avg_price' in df_price_model are means of the
# target computed over the whole dataset, so validation/test prices would leak
# into the features. Recompute both from the training rows only and overwrite
# the columns in every split. Categories absent from the training rows fall
# back to the overall training mean.
# (Row labels are used to look rows up again, so df_price_model's index is
# assumed to be unique.)
train_price_rows = df_price_model.loc[X_price_train.index, ['airline', 'route', 'price']]
airline_price_map = train_price_rows.groupby('airline')['price'].mean()
route_price_map = train_price_rows.groupby('route')['price'].mean()
fallback_price = train_price_rows['price'].mean()


def _with_train_price_averages(part):
    """Return ``part`` with both average-price columns rebuilt from the train-only maps."""
    airline_of_row = df_price_model.loc[part.index, 'airline']
    route_of_row = df_price_model.loc[part.index, 'route']
    return part.assign(
        airline_avg_price=airline_of_row.map(airline_price_map).fillna(fallback_price),
        route_avg_price=route_of_row.map(route_price_map).fillna(fallback_price),
    )


X_price_train = _with_train_price_averages(X_price_train)
X_price_val = _with_train_price_averages(X_price_val)
X_price_test = _with_train_price_averages(X_price_test)
# NOTE(review): anyone scoring new data needs airline_price_map /
# route_price_map as well as the scaler; they are not persisted here.

print(f"\nüìä Split Sizes:")
print(f"  Train: {len(X_price_train):,} (70%)")
print(f"  Val:   {len(X_price_val):,} (15%)")
print(f"  Test:  {len(X_price_test):,} (15%)")

# Normalize features: fit on train only, apply to val/test
scaler_price = StandardScaler()
X_price_train_scaled = scaler_price.fit_transform(X_price_train)
X_price_val_scaled = scaler_price.transform(X_price_val)
X_price_test_scaled = scaler_price.transform(X_price_test)

# Log-transform target (price is right-skewed); invert with np.expm1
y_price_train_log = np.log1p(y_price_train)
y_price_val_log = np.log1p(y_price_val)
y_price_test_log = np.log1p(y_price_test)

# Save scaler
with open(OUTPUT_DIR / 'price_scaler.pkl', 'wb') as f:
    pickle.dump(scaler_price, f)

print("\n‚úÖ Normalization complete!")


üìä Split Sizes:
  Train: 210,107 (70%)
  Val:   45,023 (15%)
  Test:  45,023 (15%)

‚úÖ Normalization complete!


### 4.4 Save Processed Price Dataset

In [11]:
# Write each split to its own .npy file: scaled features, raw-price targets,
# and the log1p-transformed targets.
price_arrays = {
    'price_X_train.npy': X_price_train_scaled,
    'price_X_val.npy': X_price_val_scaled,
    'price_X_test.npy': X_price_test_scaled,
    'price_y_train.npy': y_price_train.values,
    'price_y_val.npy': y_price_val.values,
    'price_y_test.npy': y_price_test.values,
    'price_y_train_log.npy': y_price_train_log.values,
    'price_y_val_log.npy': y_price_val_log.values,
    'price_y_test_log.npy': y_price_test_log.values,
}
for file_name, array in price_arrays.items():
    np.save(OUTPUT_DIR / file_name, array)

# Column order of the feature matrices, for whoever loads the arrays later.
with open(OUTPUT_DIR / 'price_feature_names.pkl', 'wb') as f:
    pickle.dump(price_features, f)

print("\n‚úÖ Price prediction dataset saved!")
print(f"  - Features: {len(price_features)}")
print(f"  - Price statistics (train):")
print(f"    Mean: ‚Çπ{y_price_train.mean():,.0f}")
print(f"    Std:  ‚Çπ{y_price_train.std():,.0f}")


‚úÖ Price prediction dataset saved!
  - Features: 15
  - Price statistics (train):
    Mean: ‚Çπ20,896
    Std:  ‚Çπ22,703


## 5. Project 3: Passenger Forecasting Dataset

### 5.1 Time Series Preparation

In [12]:
print("\n" + "=" * 80)
print("PASSENGER FORECASTING - TIME SERIES PREPARATION")
print("=" * 80)

df_passenger_model = df_passengers.copy()

# Use Total_OS column (most complete) and drop rows where it is missing
df_passenger_model = df_passenger_model[['ISO3', 'Year', 'Month', 'Total_OS']].copy()
df_passenger_model = df_passenger_model.dropna(subset=['Total_OS'])

# Create datetime (first day of each month)
df_passenger_model['date'] = pd.to_datetime(
    df_passenger_model[['Year', 'Month']].assign(day=1)
)

# Sort by country and date so shift/rolling below walk each series in time order
df_passenger_model = df_passenger_model.sort_values(['ISO3', 'date'])

# Create temporal features
df_passenger_model['month_sin'] = np.sin(2 * np.pi * df_passenger_model['Month'] / 12)
df_passenger_model['month_cos'] = np.cos(2 * np.pi * df_passenger_model['Month'] / 12)
# Min-max scale the year; guard the single-year case, which would divide by zero.
first_year = df_passenger_model['Year'].min()
year_span = df_passenger_model['Year'].max() - first_year
if year_span:
    df_passenger_model['year_normalized'] = (df_passenger_model['Year'] - first_year) / year_span
else:
    df_passenger_model['year_normalized'] = 0.0

# Create lag features (previous 1, 3, 6, 12 rows of the same country).
# NOTE(review): shift() counts rows, not calendar months - if a country has
# gaps (e.g. rows removed by the dropna above) lag_12 is not exactly 12 months
# back. Confirm the series are contiguous.
for lag in [1, 3, 6, 12]:
    df_passenger_model[f'lag_{lag}'] = df_passenger_model.groupby('ISO3')['Total_OS'].shift(lag)

# Rolling statistics (3- and 6-row windows) over *past* values only: shift by
# one first so the window never contains the current row's Total_OS, which is
# the forecasting target.
for window in [3, 6]:
    df_passenger_model[f'rolling_mean_{window}'] = df_passenger_model.groupby('ISO3')['Total_OS'].transform(
        lambda x: x.shift(1).rolling(window=window, min_periods=1).mean()
    )
    df_passenger_model[f'rolling_std_{window}'] = df_passenger_model.groupby('ISO3')['Total_OS'].transform(
        lambda x: x.shift(1).rolling(window=window, min_periods=1).std()
    )

# Warm-up rows at the start of each country have no history. Fill only the
# engineered columns with 0: a frame-wide forward fill would copy the previous
# country's values into the next country's first rows (and fillna(method=...)
# is deprecated in recent pandas).
# NOTE(review): 0 is an artificial value for a passenger count; dropping the
# warm-up rows instead may suit some models better.
history_cols = [
    col for col in df_passenger_model.columns
    if col.startswith(('lag_', 'rolling_'))
]
df_passenger_model[history_cols] = df_passenger_model[history_cols].fillna(0)

print(f"\n‚úÖ Time series features created")
print(f"  - Countries: {df_passenger_model['ISO3'].nunique()}")
print(f"  - Date range: {df_passenger_model['date'].min()} to {df_passenger_model['date'].max()}")
print(f"  - Total records: {len(df_passenger_model):,}")


PASSENGER FORECASTING - TIME SERIES PREPARATION

‚úÖ Time series features created
  - Countries: 89
  - Date range: 2010-01-01 00:00:00 to 2017-12-01 00:00:00
  - Total records: 6,594


### 5.2 Encode & Split

In [13]:
# Encode country (ISO3 code -> integer label)
le_country = LabelEncoder()
df_passenger_model['country_encoded'] = le_country.fit_transform(df_passenger_model['ISO3'])

# Save encoder
with open(OUTPUT_DIR / 'passenger_encoders.pkl', 'wb') as f:
    pickle.dump({'country': le_country}, f)

# Select features
passenger_features = [
    'country_encoded', 'Year', 'Month',
    'month_sin', 'month_cos', 'year_normalized',
    'lag_1', 'lag_3', 'lag_6', 'lag_12',
    'rolling_mean_3', 'rolling_std_3',
    'rolling_mean_6', 'rolling_std_6'
]

# df_passenger_model is ordered by country first, then date, so slicing it
# positionally would hold out the alphabetically-last countries rather than the
# most recent months. Re-order by date (country as tie-breaker) so the split
# below is genuinely time-based.
passenger_chrono = df_passenger_model.sort_values(['date', 'ISO3'])

X_passenger = passenger_chrono[passenger_features].copy()
y_passenger = passenger_chrono['Total_OS'].copy()

# Time-based split (70/15/15): earliest rows train, latest rows test.
# The cut is positional, so the month at each boundary may straddle two splits.
n = len(X_passenger)
train_end = int(0.7 * n)
val_end = int(0.85 * n)

X_pass_train = X_passenger.iloc[:train_end]
X_pass_val = X_passenger.iloc[train_end:val_end]
X_pass_test = X_passenger.iloc[val_end:]

y_pass_train = y_passenger.iloc[:train_end]
y_pass_val = y_passenger.iloc[train_end:val_end]
y_pass_test = y_passenger.iloc[val_end:]

print(f"\nüìä Split Sizes:")
print(f"  Train: {len(X_pass_train):,}")
print(f"  Val:   {len(X_pass_val):,}")
print(f"  Test:  {len(X_pass_test):,}")

# Normalize: statistics come from the training split only
scaler_passenger = StandardScaler()
X_pass_train_scaled = scaler_passenger.fit_transform(X_pass_train)
X_pass_val_scaled = scaler_passenger.transform(X_pass_val)
X_pass_test_scaled = scaler_passenger.transform(X_pass_test)

# Save scaler
with open(OUTPUT_DIR / 'passenger_scaler.pkl', 'wb') as f:
    pickle.dump(scaler_passenger, f)

print("\n‚úÖ Normalization complete!")


üìä Split Sizes:
  Train: 4,615
  Val:   989
  Test:  990

‚úÖ Normalization complete!


### 5.3 Save Passenger Dataset

In [14]:
# Write each split to its own .npy file: scaled features and raw targets.
passenger_arrays = {
    'passenger_X_train.npy': X_pass_train_scaled,
    'passenger_X_val.npy': X_pass_val_scaled,
    'passenger_X_test.npy': X_pass_test_scaled,
    'passenger_y_train.npy': y_pass_train.values,
    'passenger_y_val.npy': y_pass_val.values,
    'passenger_y_test.npy': y_pass_test.values,
}
for file_name, array in passenger_arrays.items():
    np.save(OUTPUT_DIR / file_name, array)

# Column order of the feature matrices, for whoever loads the arrays later.
with open(OUTPUT_DIR / 'passenger_feature_names.pkl', 'wb') as f:
    pickle.dump(passenger_features, f)

print("\n‚úÖ Passenger forecasting dataset saved!")
print(f"  - Features: {len(passenger_features)}")
print(f"  - Countries: {len(le_country.classes_)}")


‚úÖ Passenger forecasting dataset saved!
  - Features: 14
  - Countries: 89


## 6. Summary & Next Steps

In [15]:
# Final recap of everything the notebook produced. All figures are read from
# objects created in earlier cells; nothing is recomputed here.
banner_rule = "=" * 80
print("\n" + banner_rule)
print("‚úÖ PREPROCESSING COMPLETE!")
print(banner_rule)

summary_lines = (
    "\nüì¶ Saved Datasets:",
    # Project 1
    "\n1. DELAY PREDICTION",
    f"   - Binary classification (delayed/on-time)",
    f"   - Multi-class classification (delay cause: {len(le_cause.classes_)} classes)",
    f"   - Regression (delay minutes)",
    f"   - Features: {len(feature_cols)}",
    f"   - Train samples: {len(X_train):,}",
    # Project 2
    "\n2. PRICE PREDICTION",
    f"   - Regression task (predict ticket price)",
    f"   - Features: {len(price_features)}",
    f"   - Train samples: {len(X_price_train):,}",
    # Project 3
    "\n3. PASSENGER FORECASTING",
    f"   - Time series regression",
    f"   - Features: {len(passenger_features)} (includes lags & rolling stats)",
    f"   - Train samples: {len(X_pass_train):,}",
    # Pointers to the follow-up notebooks
    "\nüöÄ Next Steps:",
    "   ‚Üí Notebook 03: Build delay prediction models (feedforward NN)",
    "   ‚Üí Notebook 04: Build price prediction models (DNN with embeddings)",
    "   ‚Üí Notebook 05: Advanced models (TabNet, LSTM, Transformers)",
    "\nüìÅ All processed data saved to: processed_data/",
)
for summary_line in summary_lines:
    print(summary_line)

# Count every .npy and .pkl file currently present in the output directory.
saved_files = [*OUTPUT_DIR.glob('*.npy'), *OUTPUT_DIR.glob('*.pkl')]
print(f"   Total files: {len(saved_files)}")


‚úÖ PREPROCESSING COMPLETE!

üì¶ Saved Datasets:

1. DELAY PREDICTION
   - Binary classification (delayed/on-time)
   - Multi-class classification (delay cause: 6 classes)
   - Regression (delay minutes)
   - Features: 16
   - Train samples: 120,166

2. PRICE PREDICTION
   - Regression task (predict ticket price)
   - Features: 15
   - Train samples: 210,107

3. PASSENGER FORECASTING
   - Time series regression
   - Features: 14 (includes lags & rolling stats)
   - Train samples: 4,615

üöÄ Next Steps:
   ‚Üí Notebook 03: Build delay prediction models (feedforward NN)
   ‚Üí Notebook 04: Build price prediction models (DNN with embeddings)
   ‚Üí Notebook 05: Advanced models (TabNet, LSTM, Transformers)

üìÅ All processed data saved to: processed_data/
   Total files: 36
