In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score
import joblib

print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
# List all the parquet files you want to load
data_files = [
    'dataset/Combined_Flights_2018.parquet',
    'dataset/Combined_Flights_2019.parquet',
    'dataset/Combined_Flights_2020.parquet',
    'dataset/Combined_Flights_2021.parquet',
    'dataset/Combined_Flights_2022.parquet'
]

# --- IMPORTANT WARNING ---
# This is a LOT of data. This step might take a few minutes
# and use a lot of your computer's RAM.
# If your computer crashes, try it with just ONE file first:
# data_files = ['dataset/Combined_Flights_2022.parquet']

# Loop through each file, load it, and add it to a list
dataframes_list = []
for file in data_files:
    print(f"Loading {file}...")
    df = pd.read_parquet(file)
    dataframes_list.append(df)

# Combine all DataFrames into one single, massive DataFrame
print("Combining all datasets...")
df = pd.concat(dataframes_list, ignore_index=True)

print(f"Data loaded. Total shape: {df.shape}")
print(df.head())

Loading dataset/Combined_Flights_2018.parquet...
Loading dataset/Combined_Flights_2019.parquet...
Loading dataset/Combined_Flights_2020.parquet...
Loading dataset/Combined_Flights_2021.parquet...
Loading dataset/Combined_Flights_2022.parquet...
Combining all datasets...
Data loaded. Total shape: (29193782, 61)
  FlightDate            Airline Origin Dest  Cancelled  Diverted  CRSDepTime  \
0 2018-01-23  Endeavor Air Inc.    ABY  ATL      False     False        1202   
1 2018-01-24  Endeavor Air Inc.    ABY  ATL      False     False        1202   
2 2018-01-25  Endeavor Air Inc.    ABY  ATL      False     False        1202   
3 2018-01-26  Endeavor Air Inc.    ABY  ATL      False     False        1202   
4 2018-01-27  Endeavor Air Inc.    ABY  ATL      False     False        1400   

   DepTime  DepDelayMinutes  DepDelay  ...  WheelsOff  WheelsOn  TaxiIn  \
0   1157.0              0.0      -5.0  ...     1211.0    1249.0     7.0   
1   1157.0              0.0      -5.0  ...     1210.0    

In [3]:
# --- New Cell 3 (with fixes) ---

from IPython.display import display # Add this to make nice tables

# 1. Define your target (what we want to predict)
target = 'DepDel15'

# 2. Define your features (the inputs for the model)
features = [
    'Month',
    'DayOfWeek',
    'CRSDepTime',        # Scheduled Departure Time
    'Operating_Airline', # Your data dictionary calls this "Unique Carrier Code"
    'Origin',            # Origin Airport Code
    'Dest',              # Destination Airport Code
    'Distance'           # Let's add distance as a feature
]

# 3. Define which of those features are categorical
categorical_features = ['Operating_Airline', 'Origin', 'Dest']

# 4. Clean the data
print(f"Original size: {df.shape[0]} rows")
# Drop any rows where our key features or target are missing
df = df.dropna(subset=features + [target])
print(f"Size after dropping missing values: {df.shape[0]} rows")

# 5. Create our X (features) and y (target)
# --- FIX 1: Add .copy() to make the warning go away ---
X = df[features].copy() 
y = df[target]
# ---------------------------------------------------

# 6. Create and fit the encoder
encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value', 
    unknown_value=-1,
    dtype=int
)

# Now we modify our new copy, which is perfectly safe.
X[categorical_features] = encoder.fit_transform(X[categorical_features])

print("Data prepared and encoded.")

# --- FIX 2: Use display() for a clean HTML table output ---
display(X.head())

Original size: 29193782 rows
Size after dropping missing values: 28430698 rows
Data prepared and encoded.


Unnamed: 0,Month,DayOfWeek,CRSDepTime,Operating_Airline,Origin,Dest,Distance
0,1,2,1202,0,4,23,145.0
1,1,3,1202,0,4,23,145.0
2,1,4,1202,0,4,23,145.0
3,1,5,1202,0,4,23,145.0
4,1,6,1400,0,4,23,145.0


In [4]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

# Initialize the LightGBM Classifier
model = lgb.LGBMClassifier(
    objective='binary',  # We are predicting 0 or 1 (binary)
    n_estimators=100,
    n_jobs=-1            # Use all available CPU cores
)

# Train the model!
print("Starting model training... (this may take a few minutes)")
model.fit(X_train, y_train, 
          categorical_feature=categorical_features  # Pass the correct feature names
         )

print("Model training complete.")

Starting model training... (this may take a few minutes)
[LightGBM] [Info] Number of positive: 3930370, number of negative: 18814188
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.322069 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 22744558, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.172805 -> initscore=-1.565878
[LightGBM] [Info] Start training from score -1.565878
Model training complete.


In [5]:
# Make predictions on the test set
print("Making predictions...")
y_pred = model.predict(X_test)

# Check the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Making predictions...
Model Accuracy: 82.82%


In [6]:
# Save the trained model to a file
joblib.dump(model, 'api/flight_delay_model.joblib')

# Save the encoder to a file
joblib.dump(encoder, 'api/flight_data_encoder.joblib')

print("Model and encoder saved to api/ folder.")

Model and encoder saved to api/ folder.
