In [None]:
from pathlib import Path

import joblib
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# Reaching this line means none of the imports in this cell raised.
print("Libraries imported successfully.")

In [None]:
# Parquet files to load, one per year.
data_files = [
    'dataset/Combined_Flights_2018.parquet',
    'dataset/Combined_Flights_2019.parquet',
    'dataset/Combined_Flights_2020.parquet',
    'dataset/Combined_Flights_2021.parquet',
    'dataset/Combined_Flights_2022.parquet'
]

# --- IMPORTANT WARNING ---
# This is a LOT of data. Loading may take a few minutes and use a large
# amount of RAM. If the kernel crashes, try a single file first:
# data_files = ['dataset/Combined_Flights_2022.parquet']

# Read each file into its own frame. The per-file frame has its own name
# (`yearly_df`) so it is never confused with the combined `df` built below.
dataframes_list = []
for file in data_files:
    print(f"Loading {file}...")
    yearly_df = pd.read_parquet(file)
    dataframes_list.append(yearly_df)

# Stack the per-file frames into a single DataFrame with a fresh 0..n-1 index.
print("Combining all datasets...")
df = pd.concat(dataframes_list, ignore_index=True)

# The per-file frames are not used again after concatenation. Drop the
# remaining references so their memory can be reclaimed instead of staying
# pinned alongside the combined frame for the rest of the session.
del dataframes_list, yearly_df

print(f"Data loaded. Total shape: {df.shape}")

# Last expression of the cell: rendered with the notebook's rich table display.
df.head()

In [None]:
# 1. Target column (what we want to predict).
#    From the record layout: "DepDel15" is
#    "Departure Delay Indicator, 15 Minutes or More (1=Yes)".
target = 'DepDel15'

# 2. Input features for the model, using names from the RECORD LAYOUT file.
features = [
    'Month',
    'DayOfWeek',
    'CRSDepTime',        # Scheduled Departure Time
    'Operating_Airline', # Unique airline code
    'Origin',            # Origin Airport Code
    'Dest'               # Destination Airport Code
]

# 3. The subset of features that are categorical (text-based) and must be
#    encoded to numbers before training.
categorical_features = ['Operating_Airline', 'Origin', 'Dest']

# 4. Clean the data: drop every row that is missing the target or any feature.
print(f"Original size: {df.shape}")
df = df.dropna(subset=features + [target])
print(f"Size after dropping missing values: {df.shape}")

# 5. Build X (features) and y (target).
#    The explicit .copy() makes X an independent frame. Without it, the
#    column assignment in step 6 writes into a selection derived from `df`,
#    which triggers pandas' SettingWithCopyWarning.
X = df[features].copy()
y = df[target]

# 6. Convert the categorical columns to numeric codes.
#    handle_unknown='use_encoded_value' with unknown_value=-1 means a category
#    not seen during fit is encoded as -1 at transform time instead of raising.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X[categorical_features] = encoder.fit_transform(X[categorical_features])

print("Data prepared and encoded.")
X.head()

In [None]:
# Hold out 20% of the rows for evaluation. The split is shuffled, and the
# fixed random_state makes it repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# LightGBM settings, collected in one place so they are easy to scan and tune.
lgbm_params = {
    'objective': 'binary',  # the target is 0 or 1
    'n_estimators': 100,
    'n_jobs': -1,           # use all available CPU cores
}
model = lgb.LGBMClassifier(**lgbm_params)

# Fit on the training portion, naming the columns LightGBM should treat as
# categorical.
print("Starting model training... (this may take a few minutes)")
model.fit(X_train, y_train, categorical_feature=categorical_features)

print("Model training complete.")

In [None]:
# Predict class labels for the held-out test rows.
print("Making predictions...")
y_pred = model.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Reference point for the number above: the accuracy obtained by always
# predicting the most frequent class in the test set. If the target is
# imbalanced, this baseline is already high, so the model is only useful to
# the extent that it beats it.
baseline_accuracy = y_test.value_counts(normalize=True).max()
print(f"Majority-class baseline accuracy: {baseline_accuracy * 100:.2f}%")

In [None]:
# Create the output folder up front. joblib.dump does not create missing
# parent directories, so without this a checkout that lacks api/ fails with
# FileNotFoundError after the whole training run has already completed.
output_dir = Path('api')
output_dir.mkdir(parents=True, exist_ok=True)

# Save the trained model to a file.
joblib.dump(model, output_dir / 'flight_delay_model.joblib')

# Save the fitted encoder too: the same category-to-code mapping is needed to
# encode any new input before it is passed to the model.
joblib.dump(encoder, output_dir / 'flight_data_encoder.joblib')

print("Model and encoder saved to api/ folder.")