In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
import os

print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
# Load the new dataset
data_file = 'dataset/Combined_Flights_2024.csv'
print(f"Loading {data_file}...")

try:
    df = pd.read_csv(data_file)
    print(f"Data loaded. Total shape: {df.shape}")
    print(df.head())
except FileNotFoundError:
    print(f"Error: {data_file} not found. Please generate the dataset first.")

Loading dataset/Combined_Flights_2024.csv...


Data loaded. Total shape: (15000, 61)
   FlightDate            Airline Origin Dest  Cancelled  Diverted  CRSDepTime  \
0  2024-01-12     Japan Airlines    HKG  KUL          0         0        1557   
1  2024-08-15     Cathay Pacific    TGG  KUL          0         0         514   
2  2024-04-29     Cathay Pacific    BKK  KUL          0         0         930   
3  2024-10-18  Malaysia Airlines    JHB  PEN          0         0         723   
4  2024-05-13              Scoot    MEL  PEN          0         0        1139   

   DepTime  DepDelayMinutes  DepDelay  ...  WheelsOff  WheelsOn  TaxiIn  \
0     1557                0         0  ...          0         0      10   
1      514                0         0  ...          0         0      10   
2      949               19        19  ...          0         0      10   
3      723                0         0  ...          0         0      10   
4     1139                0         0  ...          0         0      10   

   CRSArrTime  ArrDelay 

In [3]:
from IPython.display import display

# 1. Define target and features
target = 'DepDel15'
features = [
    'Month',
    'DayOfWeek',
    'CRSDepTime',        # Scheduled Departure Time
    'Operating_Airline', # Unique Carrier Code
    'Origin',            # Origin Airport Code
    'Dest',              # Destination Airport Code
    'Distance'           # Distance
]

categorical_features = ['Operating_Airline', 'Origin', 'Dest']

# 2. Clean the data
print(f"Original size: {df.shape[0]} rows")
df = df.dropna(subset=features + [target])
print(f"Size after dropping missing values: {df.shape[0]} rows")

# 3. Prepare X and y
X = df[features].copy()
y = df[target]

# 4. Encode categorical features
encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value', 
    unknown_value=-1,
    dtype=int
)

X[categorical_features] = encoder.fit_transform(X[categorical_features])

print("Data prepared and encoded.")
display(X.head())

Original size: 15000 rows
Size after dropping missing values: 15000 rows
Data prepared and encoded.


Unnamed: 0,Month,DayOfWeek,CRSDepTime,Operating_Airline,Origin,Dest,Distance
0,1,5,1557,4,5,10,4048
1,8,4,514,1,20,10,1152
2,4,1,930,1,2,10,3504
3,10,5,723,5,7,16,1024
4,5,1,1139,9,13,16,2280


In [4]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

# Initialize the LightGBM Classifier
model = lgb.LGBMClassifier(
    objective='binary',
    n_estimators=100,
    n_jobs=-1
)

# Train the model
print("Starting model training...")
model.fit(X_train, y_train, 
          categorical_feature=categorical_features
         )

print("Model training complete.")

Starting model training...
[LightGBM] [Info] Number of positive: 1405, number of negative: 10595
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000471 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 588
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.117083 -> initscore=-2.020345
[LightGBM] [Info] Start training from score -2.020345


Model training complete.


In [5]:
# Make predictions on the test set
print("Making predictions...")
y_pred = model.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print("-" * 30)
print(f"Model Performance Metrics:")
print(f"Accuracy:  {accuracy * 100:.2f}%")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")
print("-" * 30)

Making predictions...
------------------------------
Model Performance Metrics:
Accuracy:  87.70%
Precision: 0.0000
Recall:    0.0000
F1-Score:  0.0000
------------------------------


In [6]:
# Save the trained model and encoder
os.makedirs('api', exist_ok=True)
joblib.dump(model, 'api/flight_delay_model.joblib')
joblib.dump(encoder, 'api/flight_data_encoder.joblib')

print("Model and encoder saved to api/ folder.")

Model and encoder saved to api/ folder.
