# Flight Delay Prediction

## Data Loading and Preprocessing

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import cudf
import cupy as cp

In [3]:
# Load the cleaned flight data
flights_df = cudf.read_parquet("/content/drive/MyDrive/cleaned_flights.parquet")

In [4]:
# Extract the hour from the 'DATE' column and create a new column 'DEPARTURE_HOUR'
flights_df['DEPARTURE_HOUR'] = flights_df['DATE'].dt.hour

In [5]:
# Replace missing values in 'DAILY_SNOWFALL' with 0
flights_df['DAILY_SNOWFALL'] = flights_df['DAILY_SNOWFALL'].fillna(0)

In [6]:
# Select only delayed flights from flights_df
delayed_flights = flights_df[flights_df['ARRIVAL_DELAY'] > 0].copy()

## XGBoost Models

In [7]:
import xgboost as xgb
from cuml.metrics import mean_squared_error, r2_score

In [8]:
# Standardize numeric columns for sklearn models
for col in ['MONTH', 'DEPARTURE_HOUR', 'DAY_OF_WEEK', 'DISTANCE']:
    col_mean = delayed_flights[col].mean()
    col_std  = delayed_flights[col].std()
    delayed_flights[col] = (delayed_flights[col] - col_mean) / col_std

In [9]:
# Extract features for sklearn models
numeric_feats = delayed_flights[['MONTH', 'DEPARTURE_HOUR', 'DAY_OF_WEEK', 'DISTANCE', 'DAILY_SNOWFALL']].astype(cp.float32).values
categorical_feats = cudf.get_dummies(delayed_flights[['AIRLINE', 'origin_airport/AIRPORT', 'destination_airport/AIRPORT']]).values
X = cp.hstack([numeric_feats, categorical_feats])
y = delayed_flights['ARRIVAL_DELAY'].values

In [10]:
split = int(0.8 * X.shape[0])
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

In [11]:
# Cell to replace RandomForestRegressor instantiation and training
# XGBoost with GPU acceleration
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',   # Specify the learning task
    tree_method='hist',             # Use GPU acceleration
    device= 'cuda',
    n_estimators=100,               # Number of boosting rounds
    max_depth=10,                   # Maximum tree depth (similar to RF)
    random_state=42,
    verbosity=3                     # Set verbosity level
)
xgb_model.fit(X_train, y_train)

In [12]:
# Cell to replace RandomForestRegressor prediction and evaluation
# Get predictions with XGBoost
xgb_predictions = xgb_model.predict(X_test)

# XGBoost predict returns a numpy array by default when input is cupy,
# convert it back to cupy array if needed for further cuML/cuPy operations
xgb_predictions_cp = cp.asarray(xgb_predictions)

# Print the MSE, R2, and RMSE (using the same cuML metrics)
# Note: Ensure y_test is a CuPy array if it isn't already.
# If y_test is a cuDF Series, use y_test.values
xgb_r2 = r2_score(y_test, xgb_predictions_cp)
xgb_mse = mean_squared_error(y_test, xgb_predictions_cp)
xgb_rmse = cp.sqrt(xgb_mse)
print(f"XGBoost test R2: {xgb_r2}, MSE: {xgb_mse}, RMSE: {xgb_rmse}")

XGBoost test R2: -0.016823429665511425, MSE: 3141.1812595849146, RMSE: 56.046242154000964
