# Phase III: First ML Model Proof of Concept w/ Discussion of Ethical Considerations

In [None]:
import pandas as pd

In [None]:
# Load the flights table from the local Parquet file into a DataFrame.
# Later cells read its DATE, ARRIVAL_DELAY, MONTH, DAY_OF_WEEK, DISTANCE,
# AIRLINE and ORIGIN_AIRPORT columns.
flights_df = pd.read_parquet("cleaned_flights.parquet")

In [None]:
# Extract the hour from the 'DATE' column and create a new column 'DEPARTURE_HOUR'
flights_df['DEPARTURE_HOUR'] = flights_df['DATE'].dt.hour

# Keep only delayed flights (ARRIVAL_DELAY strictly greater than 0; rows with a
# missing delay fail the comparison and are dropped as well).
# Take an explicit copy: a later cell assigns standardized columns into
# delayed_flights, and writing into a frame derived from a boolean filter of
# flights_df raises SettingWithCopyWarning on classic pandas.
delayed_flights = flights_df[flights_df['ARRIVAL_DELAY'] > 0].copy()
delayed_flights.head()

## NumPy ML Model

In [None]:
import numpy as np

In [None]:
# Standardize numeric columns including DISTANCE (z-score: subtract the column
# mean, divide by the column standard deviation).
# Rebind to an explicit copy first so the column assignments below write to an
# independent frame instead of a filtered view of flights_df (avoids
# SettingWithCopyWarning on classic pandas).
delayed_flights = delayed_flights.copy()
for col in ['MONTH', 'DEPARTURE_HOUR', 'DAY_OF_WEEK', 'DISTANCE']:
    col_mean = delayed_flights[col].mean()
    col_std  = delayed_flights[col].std()
    # A constant column has std == 0 (and a single-row frame has std == NaN);
    # dividing by that would fill the column with NaN/inf and poison the
    # regression, so such a column is only centered.
    col_scale = col_std if col_std > 0 else 1.0
    delayed_flights[col] = (delayed_flights[col] - col_mean) / col_scale

In [None]:
# Assemble the model inputs: the four numeric columns come first, followed by
# the one-hot encoding of AIRLINE and ORIGIN_AIRPORT; the target is ARRIVAL_DELAY.
numeric_frame = delayed_flights[['MONTH', 'DEPARTURE_HOUR', 'DAY_OF_WEEK', 'DISTANCE']]
one_hot_frame = pd.get_dummies(delayed_flights[['AIRLINE', 'ORIGIN_AIRPORT']])

numeric_feats = numeric_frame.values
categorical_feats = one_hot_frame.values

# Column-wise join: numeric block on the left, one-hot block on the right
X = np.concatenate((numeric_feats, categorical_feats), axis=1)
y = delayed_flights['ARRIVAL_DELAY'].values

In [None]:
# Positional hold-out split with no shuffling: the leading 80% of rows are used
# for training and the trailing 20% for testing.
split_idx = int(0.8 * len(X))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

In [None]:
# Prepend a constant column of 1s to each feature matrix so that the first
# fitted weight acts as the intercept.
ones_train = np.ones((len(X_train), 1))
ones_test = np.ones((len(X_test), 1))

X_train_design = np.concatenate((ones_train, X_train), axis=1)
X_test_design = np.concatenate((ones_test, X_test), axis=1)

In [None]:
# Fit ordinary least squares with the normal equation, w = (XᵀX)⁺ Xᵀy.
# The pseudo-inverse is required because XᵀX is singular here: each categorical
# is one-hot encoded without dropping a level, so its dummy columns sum to the
# intercept column of 1s.
gram = X_train_design.T @ X_train_design
w = np.linalg.pinv(gram) @ (X_train_design.T @ y_train)

# Predictions on the held-out rows
y_pred = X_test_design @ w

# Evaluate with MSE
residuals = y_test - y_pred
mse = np.mean(residuals ** 2)
print("MSE:", mse)
print("Intercept:", w[0])
# w[1:5] are the four numeric features in the order they were stacked into X;
# every weight after that belongs to a one-hot AIRLINE / ORIGIN_AIRPORT column,
# so printing w[1:] under a three-name label was misleading.
print("Coefficients (MONTH, DEPARTURE_HOUR, DAY_OF_WEEK, DISTANCE):", w[1:5])
print("Number of one-hot (AIRLINE / ORIGIN_AIRPORT) coefficients:", len(w) - 5)

# Compute R² = 1 - SS_res / SS_tot on the test rows
ss_res = np.sum(residuals ** 2)
ss_tot = np.sum((y_test - np.mean(y_test)) ** 2)
r2 = 1 - (ss_res / ss_tot)
print("R²:", r2)

### Find optimal and least optimal flight features

In [None]:
# Add intercept
ones = np.ones((X.shape[0], 1))
X_design = np.hstack((ones, X))

# Generate predictions for every delayed flight (training and test rows alike).
# NOTE: this rebinds y_pred, which previously held the test-set predictions;
# the name is kept because the following print cells index into it.
y_pred = X_design @ w

# Find indices for min/max predicted delays
best_idx = np.argmin(y_pred)
worst_idx = np.argmax(y_pred)

# Retrieve flights from the un-standardized data. delayed_flights had its
# MONTH / DEPARTURE_HOUR / DAY_OF_WEEK / DISTANCE columns replaced by z-scores,
# so reading the rows from it would report values like -1.3 instead of the real
# month, hour and weekday. Re-applying the same filter to flights_df yields the
# same rows in the same order, so positional indices from y_pred still line up.
raw_delayed_flights = flights_df[flights_df['ARRIVAL_DELAY'] > 0]
assert len(raw_delayed_flights) == len(y_pred), "row count no longer matches the feature matrix"
best_flight = raw_delayed_flights.iloc[best_idx]
worst_flight = raw_delayed_flights.iloc[worst_idx]

In [None]:
# Report the selected columns of best_flight and the model's prediction for it
print(
    "Best flight (lowest predicted delay):",
    best_flight[['MONTH', 'DEPARTURE_HOUR', 'DAY_OF_WEEK']],
    sep="\n",
)
print("Predicted delay (minutes):", y_pred[best_idx])

In [None]:
# Report the selected columns of worst_flight and the model's prediction for it
print(
    "Worst flight (highest predicted delay):",
    worst_flight[['MONTH', 'DEPARTURE_HOUR', 'DAY_OF_WEEK']],
    sep="\n",
)
print("Predicted delay (minutes):", y_pred[worst_idx])