# Phase III: First ML Model Proof of Concept w/ Discussion of Ethical Considerations

In [1]:
import pandas as pd

In [2]:
flights_df = pd.read_parquet("cleaned_flights.parquet")

In [3]:
# Extract the hour from the 'DATE' column and create a new column 'DEPARTURE_HOUR'
flights_df['DEPARTURE_HOUR'] = flights_df['DATE'].dt.hour

# Filter the DataFrame to include only delayed flights
delayed_flights = flights_df[flights_df['ARRIVAL_DELAY'] > 0]
delayed_flights.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,origin_airport/LONGITUDE,destination_airport/AIRPORT,destination_airport/CITY,destination_airport/STATE,destination_airport/COUNTRY,destination_airport/LATITUDE,destination_airport/LONGITUDE,AIRLINE NAME,DATE,DEPARTURE_HOUR
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,-122.37484,Charlotte Douglas International Airport,Charlotte,NC,USA,35.21401,-80.94313,US Airways Inc.,2015-01-01 00:20:00,0
5,2015,1,1,4,DL,806,N3730B,SFO,MSP,25,...,-122.37484,Minneapolis-Saint Paul International Airport,Minneapolis,MN,USA,44.88055,-93.21692,Delta Air Lines Inc.,2015-01-01 00:25:00,0
14,2015,1,1,4,DL,2440,N651DL,SEA,MSP,40,...,-122.30931,Minneapolis-Saint Paul International Airport,Minneapolis,MN,USA,44.88055,-93.21692,Delta Air Lines Inc.,2015-01-01 00:40:00,0
20,2015,1,1,4,NK,520,N525NK,LAS,MCI,55,...,-115.15233,Kansas City International Airport,Kansas City,MO,USA,39.29761,-94.71391,Spirit Air Lines,2015-01-01 00:55:00,0
21,2015,1,1,4,AA,371,N3GXAA,SEA,MIA,100,...,-122.30931,Miami International Airport,Miami,FL,USA,25.79325,-80.29056,American Airlines Inc.,2015-01-01 01:00:00,1


## Numpy ML Model

In [4]:
import numpy as np

In [5]:
# Standardize the three columns in-place
for col in ['MONTH', 'DEPARTURE_HOUR', 'DAY_OF_WEEK']:
    col_mean = delayed_flights[col].mean()
    col_std  = delayed_flights[col].std()
    delayed_flights[col] = (delayed_flights[col] - col_mean) / col_std

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  delayed_flights[col] = (delayed_flights[col] - col_mean) / col_std


In [6]:
# Extract features (3 columns) and target
X = delayed_flights[['MONTH', 'DEPARTURE_HOUR', 'DAY_OF_WEEK']].values
y = delayed_flights['ARRIVAL_DELAY'].values

In [7]:
# Simple train/test split (e.g. 80% train, 20% test)
split_idx = int(0.8 * len(X))
X_train, y_train = X[:split_idx], y[:split_idx]
X_test,  y_test  = X[split_idx:], y[split_idx:]

In [8]:
# Add a column of 1s for the intercept
ones_train = np.ones((X_train.shape[0], 1))
X_train_design = np.hstack((ones_train, X_train))

ones_test = np.ones((X_test.shape[0], 1))
X_test_design = np.hstack((ones_test, X_test))

In [None]:
# Normal equation: w = (XᵀX)⁻¹ Xᵀy
w = np.linalg.inv(X_train_design.T @ X_train_design) @ (X_train_design.T @ y_train)

# Predictions
y_pred = X_test_design @ w

# Evaluate with MSE
mse = np.mean((y_test - y_pred)**2)
print("MSE:", mse)
print("Intercept:", w[0])
print("Coefficients (MONTH, DEPARTURE_HOUR, DAY_OF_WEEK):", w[1:])

# Compute R²
ss_res = np.sum((y_test - y_pred) ** 2)
ss_tot = np.sum((y_test - np.mean(y_test)) ** 2)
r2 = 1 - (ss_res / ss_tot)
print("R²:", r2)

MSE: 3090.256435323303
Intercept: 33.012842161111024
Coefficients (MONTH, DEPARTURE_HOUR, DAY_OF_WEEK): [-0.70182906  3.61499703 -1.06713248]


### Find optimal and least optimal flight features

In [10]:
# Add intercept
ones = np.ones((X.shape[0], 1))
X_design = np.hstack((ones, X))

# Generate predictions
y_pred = X_design @ w

# Find indices for min/max predicted delays
best_idx = np.argmin(y_pred)
worst_idx = np.argmax(y_pred)

# Retrieve flights
best_flight = delayed_flights.iloc[best_idx]
worst_flight = delayed_flights.iloc[worst_idx]

In [11]:
print("Best flight (lowest predicted delay):")
print(best_flight[['MONTH', 'DEPARTURE_HOUR', 'DAY_OF_WEEK']])
print("Predicted delay (minutes):", y_pred[best_idx])

Best flight (lowest predicted delay):
MONTH              1.66641
DEPARTURE_HOUR    -2.95381
DAY_OF_WEEK       1.573269
Name: 5416225, dtype: object
Predicted delay (minutes): 19.4864073638952


In [12]:
print("Worst flight (highest predicted delay):")
print(worst_flight[['MONTH', 'DEPARTURE_HOUR', 'DAY_OF_WEEK']])
print("Predicted delay (minutes):", y_pred[worst_idx])

Worst flight (highest predicted delay):
MONTH            -1.554436
DEPARTURE_HOUR    1.962947
DAY_OF_WEEK      -1.472853
Name: 78917, dtype: object
Predicted delay (minutes): 42.771567579892384
