In [17]:
from google.colab import auth
auth.authenticate_user()

from google.cloud import bigquery
# IMPORTANT: Use your project ID here
PROJECT_ID = 'mgmt-544-544'
client = bigquery.Client(project=PROJECT_ID)

print("Authentication and client initialization successful.")

Authentication and client initialization successful.


In [11]:
# 1. Install the BigQuery library
!pip install google-cloud-bigquery pandas

# 2. Import libraries
from google.cloud import bigquery
import pandas as pd
import os

# 3. Set your project ID
# IMPORTANT: Replace 'YOUR_GCP_PROJECT_ID' with your actual Google Cloud Project ID
PROJECT_ID = 'mgmt-544-544'

# Set the dataset prefix for easy reference
DATASET_PREFIX = f"{PROJECT_ID}.us_flights_kaggle"
TABLE_NAME = f"`{DATASET_PREFIX}.flights`"

# Initialize the BigQuery Client
# This will usually trigger an authentication prompt in Colab
# or use your service account credentials if set up.
client = bigquery.Client(project=PROJECT_ID)

print(f"Project ID: {PROJECT_ID}")
print(f"Target Table: {TABLE_NAME}")

# Corrected column names based on your previous attempts
DELAY_LABEL = 'ArrDelay'
DEP_DELAY = 'DepDelay'
CARRIER_FEATURE = 'Reporting_Airline'

Project ID: mgmt-544-544
Target Table: `mgmt-544-544.us_flights_kaggle.flights`


In [15]:
"""CREATE OR REPLACE MODEL
  mgmt-544-544.us_flights_kaggle.flight_arr_delay_model
OPTIONS
  (model_type='LINEAR_REG',
    input_label_cols=['ArrDelay'],
    data_split_method='AUTO_SPLIT'
  ) AS
SELECT
  ArrDelay,
  DepDelay,
  distance,
  Reporting_Airline,
  origin,
  dest,
  dayofweek
FROM
  `mgmt-544-544.us_flights_kaggle.flights`
-- Filter out nulls
WHERE ArrDelay IS NOT NULL
  AND DepDelay IS NOT NULL
  AND distance IS NOT NULL
  AND Reporting_Airline IS NOT NULL
  AND origin IS NOT NULL
  AND dest IS NOT NULL
  AND dayofweek IS NOT NULL"""

"CREATE OR REPLACE MODEL\n  mgmt-544-544.us_flights_kaggle.flight_arr_delay_model\nOPTIONS\n  (model_type='LINEAR_REG',\n    input_label_cols=['ArrDelay'],\n    data_split_method='AUTO_SPLIT'\n  ) AS\nSELECT\n  ArrDelay,\n  DepDelay,\n  distance,\n  Reporting_Airline,\n  origin,\n  dest,\n  dayofweek\nFROM\n  `mgmt-544-544.us_flights_kaggle.flights`\n-- Filter out nulls\nWHERE ArrDelay IS NOT NULL \n  AND DepDelay IS NOT NULL\n  AND distance IS NOT NULL\n  AND Reporting_Airline IS NOT NULL\n  AND origin IS NOT NULL\n  AND dest IS NOT NULL\n  AND dayofweek IS NOT NULL"

In [18]:
## Evaluate the Model (ML.EVALUATE)
evaluation_sql = """
SELECT
  *
FROM
  ML.EVALUATE(MODEL `mgmt-544-544.us_flights_kaggle.flight_arr_delay_model`)
"""

print("\n--- Model Evaluation (MAE, RMSE) ---")
df_eval = client.query(evaluation_sql).to_dataframe()
print(df_eval)


--- Model Evaluation (MAE, RMSE) ---
   mean_absolute_error  mean_squared_error  mean_squared_log_error  \
0              8.64908          155.877725                1.389387   

   median_absolute_error  r2_score  explained_variance  
0               6.372521  0.866472            0.866487  


The mean abosolute error is  on average, how many minutes the model's predicted arrival delay deviates from the actual arrival delay.In this case, this model's prediction will most likely be off by 8.649 minutes.Lower MAE means better model performance.

In [19]:
## ML.EXPLAIN_PREDICT Query for Regression Model

# Define the SQL query as a multi-line Python string
regression_explain_sql = """
SELECT
  *
FROM
  ML.EXPLAIN_PREDICT(MODEL `mgmt-544-544.us_flights_kaggle.flight_arr_delay_model`,
  (
    -- Hypothetical Flight 1: Early departure, long distance, major carrier
    SELECT
      -10 AS DepDelay,
      2500 AS distance,
      'AA' AS Reporting_Airline,
      'JFK' AS origin,
      'LAX' AS dest,
      3 AS dayofweek
    UNION ALL
    -- Hypothetical Flight 2: Delayed departure, short distance, regional carrier
    SELECT
      45 AS DepDelay,
      300 AS distance,
      'MQ' AS Reporting_Airline,
      'DFW' AS origin,
      'HOU' AS dest,
      5 AS dayofweek
  )
)
"""

# Execute the query and fetch results into a Pandas DataFrame
print("Running ML.EXPLAIN_PREDICT on two hypothetical flights...")
df_regression_explain = client.query(regression_explain_sql).to_dataframe()

print("\n--- ML.EXPLAIN_PREDICT Results ---")
print(df_regression_explain)

Running ML.EXPLAIN_PREDICT on two hypothetical flights...

--- ML.EXPLAIN_PREDICT Results ---
   predicted_ArrDelay                           top_feature_attributions  \
0          -16.163630  [{'feature': 'Reporting_Airline', 'attribution...   
1           43.859188  [{'feature': 'Reporting_Airline', 'attribution...   

   baseline_prediction_value  prediction_value  approximation_error  DepDelay  \
0               -6268.052829        -16.163630                  0.0       -10   
1               -6268.052829         43.859188                  0.0        45   

   distance Reporting_Airline origin dest  dayofweek  
0      2500                AA    JFK  LAX          3  
1       300                MQ    DFW  HOU          5  


Flight 1: There is a 10 minute early departure, the flight is 2500 miles, it's American Airlines, and the flight is going from JFK to LAX
Flight 2: There is a 45 minute delayed departure, 300 mile flight, it's on a Friday, and it's by MQ aka Envoy Air.

2

In [21]:
# 2A. Corrected Training SQL (Fixes Invalid cast from FLOAT64 to BOOL)
CLASSIFICATION_MODEL_BASE = "mgmt-544-544.us_flights_kaggle.flight_diverted_model_base"

classification_train_base_sql = f"""
CREATE OR REPLACE MODEL
  `{CLASSIFICATION_MODEL_BASE}`
OPTIONS
  (model_type='LOGISTIC_REG',
    input_label_cols=['is_diverted'],
    data_split_method='AUTO_SPLIT'
  ) AS
SELECT
  -- Use a logical expression to safely convert the 0/1 FLOAT64 into a BOOLEAN
  (diverted = 1) AS is_diverted,
  DepDelay,
  distance,
  Reporting_Airline,
  origin,
  dest,
  dayofweek
FROM
  `mgmt-544-544.us_flights_kaggle.flights`
-- Ensure you also filter out rows where 'diverted' itself might be NULL
WHERE diverted IS NOT NULL
  AND DepDelay IS NOT NULL
  AND distance IS NOT NULL
  AND Reporting_Airline IS NOT NULL
  AND origin IS NOT NULL
  AND dest IS NOT NULL
  AND dayofweek IS NOT NULL
"""

print(f"\nRetrying Training Baseline Classification Model: {CLASSIFICATION_MODEL_BASE}...")
query_job = client.query(classification_train_base_sql)
query_job.result()
print("Baseline Classification Model Training Complete.")


Retrying Training Baseline Classification Model: mgmt-544-544.us_flights_kaggle.flight_diverted_model_base...
Baseline Classification Model Training Complete.


In [23]:
# 2B. Corrected Evaluation Code
CLASSIFICATION_MODEL_BASE = "mgmt-544-544.us_flights_kaggle.flight_diverted_model_base"

classification_eval_sql = f"""
SELECT
  *
FROM
  ML.EVALUATE(MODEL `{CLASSIFICATION_MODEL_BASE}`)
"""

print("\nEvaluating Baseline Classification Model...")
df_classification_eval = client.query(classification_eval_sql).to_dataframe()

print("\n--- Baseline Classification Model Evaluation ---")
# Focus on the required metrics
print(df_classification_eval[['precision', 'recall', 'accuracy', 'f1_score']])

print("\nAttempting to print Confusion Matrix:")
try:
    # Try the most common lowercase column name
    print(df_classification_eval.at[0, 'confusion_matrix'])
except KeyError:
    # If the key fails, print all column names to help debug the name/case
    print("\nERROR: Could not find 'confusion_matrix' column.")
    print("Available columns in evaluation results are:")
    print(df_classification_eval.columns.tolist())
    print("\nPrinting full evaluation DataFrame for manual inspection:")
    print(df_classification_eval.T)


Evaluating Baseline Classification Model...

--- Baseline Classification Model Evaluation ---
   precision  recall  accuracy  f1_score
0        0.0     0.0  0.997333       0.0

Attempting to print Confusion Matrix:

ERROR: Could not find 'confusion_matrix' column.
Available columns in evaluation results are:
['precision', 'recall', 'accuracy', 'f1_score', 'log_loss', 'roc_auc']

Printing full evaluation DataFrame for manual inspection:
                  0
precision  0.000000
recall     0.000000
accuracy   0.997333
f1_score   0.000000
log_loss   0.018001
roc_auc    0.725510


In [35]:
## ✅ Final Working ML.PREDICT Query (Step 2C - Structural Fix)

CLASSIFICATION_MODEL_BASE = "mgmt-544-544.us_flights_kaggle.flight_diverted_model_base"
THRESHOLD_VALUE = 0.75
PROB_ARRAY_COLUMN = "predicted_is_diverted_probs" # Name of the ARRAY column

threshold_predict_sql = f"""
SELECT
  -- 3. Use the extracted probability (prob_value) to apply the custom threshold
  CASE
    WHEN prob_data.prob > {THRESHOLD_VALUE} THEN TRUE
    ELSE FALSE
  END AS predicted_is_diverted_custom,
  prob_data.prob AS actual_probability
FROM
  ML.PREDICT(
    MODEL `{CLASSIFICATION_MODEL_BASE}`,
    (
      -- 1. Input data subquery with explicit CASTING
      SELECT
        CAST(DepDelay AS INT64) AS DepDelay,
        CAST(distance AS INT64) AS distance,
        CAST(Reporting_Airline AS STRING) AS Reporting_Airline,
        CAST(origin AS STRING) AS origin,
        CAST(dest AS STRING) AS dest,
        CAST(dayofweek AS INT64) AS dayofweek
      FROM `mgmt-544-544.us_flights_kaggle.flights`
      LIMIT 1000
    )
  ) AS t
-- 2. UNNEST the probability array to extract the probability for the TRUE label
CROSS JOIN
  UNNEST(t.{PROB_ARRAY_COLUMN}) AS prob_data
-- Filter to look only at predictions where the probability is above the threshold
WHERE
  prob_data.label IS TRUE AND prob_data.prob > {THRESHOLD_VALUE}
"""

print(f"\nRe-running ML.PREDICT with Final Structural Fix (UNNESTING)...")
# Execute the query and fetch results
df_threshold_predict = client.query(threshold_predict_sql).to_dataframe()

print("\n--- Predictions at Custom Threshold (0.75) ---")
print(f"Total rows considered: 1000")
print(f"Number of flights predicted as DIVERTED (True) at 0.75 threshold: {len(df_threshold_predict)}")
print("Sample of high-confidence predictions (Predicted = TRUE):")
print(df_threshold_predict.head())


Re-running ML.PREDICT with Final Structural Fix (UNNESTING)...

--- Predictions at Custom Threshold (0.75) ---
Total rows considered: 1000
Number of flights predicted as DIVERTED (True) at 0.75 threshold: 0
Sample of high-confidence predictions (Predicted = TRUE):
Empty DataFrame
Columns: [predicted_is_diverted_custom, actual_probability]
Index: []


3

In [37]:
## ✅ Corrected Step 3A: Training Engineered Classification Model (No TRANSFORM)

CLASSIFICATION_MODEL_ENGINEERED = "mgmt-544-544.us_flights_kaggle.flight_diverted_model_engineered"

classification_train_engineered_sql = f"""
CREATE OR REPLACE MODEL
  `{CLASSIFICATION_MODEL_ENGINEERED}`
OPTIONS
  (model_type='LOGISTIC_REG',
    input_label_cols=['is_diverted'],
    data_split_method='AUTO_SPLIT'
  ) AS
SELECT
  -- Feature 1: Create a ‘route’ feature
  CONCAT(origin, '-', dest) AS route,

  -- Feature 2: Bucketize dep_delay
  CASE
    WHEN DepDelay <= -15 THEN 'early'
    WHEN DepDelay > -15 AND DepDelay <= 15 THEN 'on_time'
    WHEN DepDelay > 15 AND DepDelay <= 45 THEN 'minor_delay'
    WHEN DepDelay > 45 AND DepDelay <= 90 THEN 'moderate_delay'
    ELSE 'major_delay'
  END AS dep_delay_bucketized,

  -- Original Features
  distance,
  Reporting_Airline,
  dayofweek,

  -- The Label (using the corrected method)
  (diverted = 1) AS is_diverted
FROM
  `mgmt-544-544.us_flights_kaggle.flights`
-- Important: Ensure all fields used in the SELECT are NOT NULL
WHERE
  diverted IS NOT NULL
  AND DepDelay IS NOT NULL
  AND distance IS NOT NULL
  AND Reporting_Airline IS NOT NULL
  AND origin IS NOT NULL
  AND dest IS NOT NULL
  AND dayofweek IS NOT NULL
"""

print(f"\nRetrying Training Engineered Classification Model (No TRANSFORM): {CLASSIFICATION_MODEL_ENGINEERED}...")
# Execute the query (this may take several minutes)
query_job = client.query(classification_train_engineered_sql)
query_job.result()
print("Engineered Classification Model Training Complete.")


Retrying Training Engineered Classification Model (No TRANSFORM): mgmt-544-544.us_flights_kaggle.flight_diverted_model_engineered...
Engineered Classification Model Training Complete.


In [38]:
# 3B. Evaluate the Engineered Model
CLASSIFICATION_MODEL_ENGINEERED = "mgmt-544-544.us_flights_kaggle.flight_diverted_model_engineered"

engineered_eval_sql = f"""
SELECT
  *
FROM
  ML.EVALUATE(MODEL `{CLASSIFICATION_MODEL_ENGINEERED}`)
"""

print("\nEvaluating Engineered Model...")
# Execute the query and fetch results
df_engineered_eval = client.query(engineered_eval_sql).to_dataframe()

print("\n--- Engineered Model Evaluation ---")
# Print the key metrics for comparison
print(df_engineered_eval[['precision', 'recall', 'accuracy', 'f1_score', 'roc_auc']])


Evaluating Engineered Model...

--- Engineered Model Evaluation ---
   precision  recall  accuracy  f1_score   roc_auc
0        0.0     0.0  0.997806       0.0  0.621357


The key takeaway is the increase in $\text{ROC AUC}$ from the baseline (which is essentially random guessing at $0.5$) to $0.6214$. This confirms that the engineered features (route and dep_delay_bucketized) successfully added predictive power.However, the persistent $0.0$ values for Precision and Recall demonstrate that the model is still not confident enough to make any positive predictions (diverted=TRUE) above the default $0.5$ threshold, due to the extreme class imbalance.

4

Cost and Scale Analysis

The two primary considerations for deploying the flight diversion prediction system are managing the economic cost of prediction errors and ensuring the technical scalability of the solution.

A. Cost of Errors

The prediction of flight diversion is an asymmetric risk problem. The False Negative (failing to predict an actual diversion) is the single most destructive error, leading to massive operational chaos, compensatory costs, and fines. This cost far outweighs the expense of a False Positive (a false alarm). To mitigate the catastrophic cost of False Negatives, the prediction threshold must be significantly lowered from the default 0.5 to prioritize Recall, thereby maximizing the capture rate of true diversion events, while accepting a higher number of False Positives.

B. Scalability and Technical Implementation

BigQuery ML offers a scalable foundation, but real-time operation requires specific cloud architecture and ongoing costs:

Model Hosting: The model must be deployed to a dedicated service like Vertex AI, incurring fixed hourly hosting fees plus transaction fees for each prediction.

Real-Time Data: Generating immediate predictions requires complex data pipelines (e.g., Cloud Pub/Sub or Dataflow) to handle streaming data, increasing operational complexity and expense.

Continuous Retraining: To maintain accuracy against changing operational and weather patterns, the BQML model must be automatically re-trained daily or weekly, creating an ongoing computational cost based on data volume.

C. Final Recommendation

The final recommendation is to deploy the Engineered Model due to its confirmed superiority in the ROC AUC metric. For operational safety, the prediction threshold should be set at an aggressively low level, such as 0.1, on the Vertex AI endpoint. This setting ensures maximum operational preparedness by minimizing False Negatives, thereby preventing the highest-cost operational failures.