# Lucas Gerbsch BQML Dive Notebook

In [1]:
# --- Minimal setup (edit 3 vars) ---
from google.colab import auth
auth.authenticate_user()

import os
from google.cloud import bigquery

PROJECT_ID = "mgmt-467-project-1"      # e.g., mgmt-467-47888
REGION     = "us-central1"
TABLE_PATH = "mgmt-467-project-1.flights.kaggle_flight_data"   # or your `bigquery-public-data.flights` table/view

os.environ["PROJECT_ID"] = PROJECT_ID
os.environ["REGION"]     = REGION
bq = bigquery.Client(project=PROJECT_ID)

print("BQ Project:", PROJECT_ID)
print("Source table:", TABLE_PATH)


BQ Project: mgmt-467-project-1
Source table: mgmt-467-project-1.flights.kaggle_flight_data


In [2]:
preview_sql = f"SELECT * FROM `{TABLE_PATH}` LIMIT 5"
bq.query(preview_sql).result().to_dataframe()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,...,Div4WheelsOff,Div4TailNum,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum
0,1998,2,4,21,2,1998-04-21,DL,19790,DL,N334DL,...,,,,,,,,,,
1,1992,4,10,19,1,1992-10-19,DL,19790,DL,,...,,,,,,,,,,
2,2000,4,10,13,5,2000-10-13,DL,19790,DL,N225DL,...,,,,,,,,,,
3,1996,3,9,29,7,1996-09-29,DL,19790,DL,N331DL,...,,,,,,,,,,
4,1996,4,10,30,3,1996-10-30,DL,19790,DL,N236WA,...,,,,,,,,,,


# **Linear Regression Model**

# Task
Create a BigQuery `LINEAR_REG` model using `BQ.ML.CREATE_MODEL` to predict `ArrDelay` based on `DepDelay`, `Distance`, `Reporting_Airline`, `Origin`, `Dest`, and `DayOfWeek` from the `mgmt-467-project-1.flights.kaggle_flight_data` table.

## Prepare Training Data

### Subtask:
Construct a SQL query to select the necessary features and target variable ('ArrDelay') from the `kaggle_flight_data` table, handling any missing values or data types as needed for model training.


In [3]:
query = f"""SELECT ArrDelay, DepDelay, Distance, Reporting_Airline, Origin, Dest, DayOfWeek FROM `{TABLE_PATH}` WHERE ArrDelay IS NOT NULL AND DepDelay IS NOT NULL"""
df = bq.query(query).result().to_dataframe()
print(f"Shape of the prepared data: {df.shape}")
df.head()

Shape of the prepared data: (1958862, 7)


Unnamed: 0,ArrDelay,DepDelay,Distance,Reporting_Airline,Origin,Dest,DayOfWeek
0,101.0,103.0,692.0,EV,ATL,ABE,3
1,13.0,27.0,692.0,EV,ATL,ABE,6
2,-12.0,0.0,692.0,OH,ATL,ABE,4
3,-22.0,-6.0,692.0,EV,ATL,ABE,1
4,6.0,1.0,50.0,US,AVP,ABE,3


## Create LINEAR_REG Model

### Subtask:
Generate and execute a BigQuery SQL query to create a `LINEAR_REG` model using `BQ.ML.CREATE_MODEL`. The model will predict 'ArrDelay' based on the features: 'DepDelay', 'Distance', 'Reporting_Airline', 'Origin', 'Dest', and 'DayOfWeek'.


In [4]:
model_name = "bqml_arrdelay_model"
create_model_query = f"""
CREATE OR REPLACE MODEL `{PROJECT_ID}.flights.{model_name}`
OPTIONS(
    model_type='LINEAR_REG',
    input_label_cols=['ArrDelay']
)
AS
SELECT
    ArrDelay,
    DepDelay,
    Distance,
    Reporting_Airline,
    Origin,
    Dest,
    DayOfWeek
FROM
    `{TABLE_PATH}`
WHERE
    ArrDelay IS NOT NULL AND DepDelay IS NOT NULL
"""
bq.query(create_model_query).result()
print(f"BigQuery LINEAR_REG model '{model_name}' created successfully!")

BigQuery LINEAR_REG model 'bqml_arrdelay_model' created successfully!


### Evaluate the Model with ML.EVALUATE

In [5]:
evaluate_model_query = f"""SELECT * FROM ML.EVALUATE(MODEL `{PROJECT_ID}.flights.{model_name}`) """
evaluation_results = bq.query(evaluate_model_query).result().to_dataframe()
print("Model evaluation results:")
display(evaluation_results)

Model evaluation results:


Unnamed: 0,mean_absolute_error,mean_squared_error,mean_squared_log_error,median_absolute_error,r2_score,explained_variance
0,9.17431,568.936212,1.41345,6.467743,0.497171,0.497209


## **Mean Absolute Error in Business Terms**

MAE is the absolute average size of the mistakes the model makes in its' predictions. In the case of this specific model, a MAE means that the predicted arrival delay is, on average, 9.17 minutes off. This can go either way, so sometimes the predicted delay is 9.17 minutes over what the actual delay was and sometimes it's 9.17 minutes short of the true delay. This metric can be used to plan better between gate staff, logistics, and bag transport to create a more efficient airport workflow.   

# Task
Create a DataFrame with two hypothetical flight scenarios, including the features: `DepDelay`, `Distance`, `Reporting_Airline`, `Origin`, `Dest`, and `DayOfWeek`.

In [6]:
import pandas as pd

hypothetical_flights = [
    {
        'DepDelay': 30,
        'Distance': 500,
        'Reporting_Airline': 'AA',
        'Origin': 'LAX',
        'Dest': 'SFO',
        'DayOfWeek': 1  # Monday
    },
    {
        'DepDelay': -5,
        'Distance': 1200,
        'Reporting_Airline': 'DL',
        'Origin': 'JFK',
        'Dest': 'ORD',
        'DayOfWeek': 5  # Friday
    }
]

hypothetical_flights_df = pd.DataFrame(hypothetical_flights)
print("Hypothetical Flight Scenarios:")
display(hypothetical_flights_df)

Hypothetical Flight Scenarios:


Unnamed: 0,DepDelay,Distance,Reporting_Airline,Origin,Dest,DayOfWeek
0,30,500,AA,LAX,SFO,1
1,-5,1200,DL,JFK,ORD,5


## Generate and Execute ML.EXPLAIN_PREDICT

### Subtask:
Construct a BigQuery SQL query to use `ML.EXPLAIN_PREDICT` with the `bqml_arrdelay_model` and the hypothetical flight data. Execute this query and retrieve the results.


In [7]:
temp_table_id = f"{PROJECT_ID}.flights.temp_hypothetical_flights_for_explain_predict"
job = bq.load_table_from_dataframe(hypothetical_flights_df, temp_table_id)
job.result()  # Wait for the job to complete
print(f"DataFrame uploaded to BigQuery table: {temp_table_id}")

DataFrame uploaded to BigQuery table: mgmt-467-project-1.flights.temp_hypothetical_flights_for_explain_predict


In [8]:
explain_predict_query = f"""
SELECT *
FROM ML.EXPLAIN_PREDICT(MODEL `{PROJECT_ID}.flights.{model_name}`,
  (SELECT * FROM `{temp_table_id}`))
"""

explanation_results = bq.query(explain_predict_query).result().to_dataframe()
print("ML.EXPLAIN_PREDICT results:")
display(explanation_results)

ML.EXPLAIN_PREDICT results:


Unnamed: 0,predicted_ArrDelay,top_feature_attributions,baseline_prediction_value,prediction_value,approximation_error,DepDelay,Distance,Reporting_Airline,Origin,Dest,DayOfWeek
0,28.966968,"[{'feature': 'Dest', 'attribution': -1073.9499...",1610.177017,28.966968,0.0,30,500,AA,LAX,SFO,1
1,-8.192872,"[{'feature': 'Dest', 'attribution': -1073.3393...",1610.177017,-8.192872,0.0,-5,1200,DL,JFK,ORD,5


## Interpret Top Features

### Subtask:
Analyze the results from `ML.EXPLAIN_PREDICT` to identify and interpret the top features contributing to the predicted arrival delays for each hypothetical flight.


In [10]:
import json

print("Interpretation of ML.EXPLAIN_PREDICT results:\n")

for index, row in explanation_results.iterrows():
    predicted_delay = row['predicted_ArrDelay']
    original_dep_delay = hypothetical_flights_df.loc[index, 'DepDelay']
    original_distance = hypothetical_flights_df.loc[index, 'Distance']
    original_airline = hypothetical_flights_df.loc[index, 'Reporting_Airline']
    original_origin = hypothetical_flights_df.loc[index, 'Origin']
    original_dest = hypothetical_flights_df.loc[index, 'Dest']
    original_dayofweek = hypothetical_flights_df.loc[index, 'DayOfWeek']

    print(f"--- Hypothetical Flight Scenario {index + 1} ---")
    print(f"Original Input:")
    print(f"  DepDelay: {original_dep_delay} minutes")
    print(f"  Distance: {original_distance} miles")
    print(f"  Reporting_Airline: {original_airline}")
    print(f"  Origin: {original_origin}")
    print(f"  Dest: {original_dest}")
    print(f"  DayOfWeek: {original_dayofweek}")
    print(f"Predicted Arrival Delay: {predicted_delay:.2f} minutes\n")

    # The top_feature_attributions column is already parsed into a list of dictionaries
    attributions = row['top_feature_attributions']

    # Sort attributions by absolute value to find the most influential features
    sorted_attributions = sorted(attributions, key=lambda x: abs(x['attribution']), reverse=True)

    print("Top Feature Attributions:")
    for i, attr in enumerate(sorted_attributions):
        feature = attr['feature']
        attribution_score = attr['attribution']
        print(f"  {i+1}. {feature}: {attribution_score:.2f}")
    print("\n")

Interpretation of ML.EXPLAIN_PREDICT results:

--- Hypothetical Flight Scenario 1 ---
Original Input:
  DepDelay: 30 minutes
  Distance: 500 miles
  Reporting_Airline: AA
  Origin: LAX
  Dest: SFO
  DayOfWeek: 1
Predicted Arrival Delay: 28.97 minutes

Top Feature Attributions:
  1. Dest: -1073.95
  2. Origin: -879.00
  3. Reporting_Airline: 349.74
  4. DepDelay: 21.00
  5. Distance: 0.52


--- Hypothetical Flight Scenario 2 ---
Original Input:
  DepDelay: -5 minutes
  Distance: 1200 miles
  Reporting_Airline: DL
  Origin: JFK
  Dest: ORD
  DayOfWeek: 5
Predicted Arrival Delay: -8.19 minutes

Top Feature Attributions:
  1. Dest: -1073.34
  2. Origin: -880.35
  3. Reporting_Airline: 349.73
  4. DepDelay: -13.21
  5. Distance: -1.03




# **Classification**

# Task
Create a BigQuery `LOGISTIC_REG` model using `BQ.ML.CREATE_MODEL` to predict `Diverted` (a boolean indicating if a flight was diverted) based on `DepDelay`, `Distance`, `Reporting_Airline`, `Origin`, `Dest`, and `DayOfWeek` from the `mgmt-467-project-1.flights.kaggle_flight_data` table.

## Prepare Training Data


In [11]:
query_diverted = f"""SELECT CAST(Diverted AS INT64) AS Diverted, DepDelay, Distance, Reporting_Airline, Origin, Dest, DayOfWeek FROM `{TABLE_PATH}` WHERE Diverted IS NOT NULL AND DepDelay IS NOT NULL"""
df_diverted = bq.query(query_diverted).result().to_dataframe()
print(f"Shape of the prepared data for Diverted: {df_diverted.shape}")
df_diverted.head()

Shape of the prepared data for Diverted: (1963932, 7)


Unnamed: 0,Diverted,DepDelay,Distance,Reporting_Airline,Origin,Dest,DayOfWeek
0,0,-8.0,692.0,DL,ABE,ATL,7
1,0,0.0,692.0,EA,ABE,ATL,3
2,0,-6.0,692.0,EV,ABE,ATL,7
3,0,0.0,692.0,EV,ABE,ATL,7
4,0,-4.0,339.0,XE,ABE,CLE,7


## Create LOGISTIC_REG Model


In [12]:
model_name_diverted = "bqml_diverted_model"
create_model_query_diverted = f"""
CREATE OR REPLACE MODEL `{PROJECT_ID}.flights.{model_name_diverted}`
OPTIONS(
    model_type='LOGISTIC_REG',
    input_label_cols=['Diverted']
)
AS
SELECT
    CAST(Diverted AS INT64) AS Diverted,
    DepDelay,
    Distance,
    Reporting_Airline,
    Origin,
    Dest,
    DayOfWeek
FROM
    `{TABLE_PATH}`
WHERE
    Diverted IS NOT NULL AND DepDelay IS NOT NULL
"""
bq.query(create_model_query_diverted).result()
print(f"BigQuery LOGISTIC_REG model '{model_name_diverted}' created successfully!")

BigQuery LOGISTIC_REG model 'bqml_diverted_model' created successfully!


### Evaluate the LOGISTIC_REG Model with ML.EVALUATE

In [13]:
evaluate_diverted_model_query = f"""SELECT * FROM ML.EVALUATE(MODEL `{PROJECT_ID}.flights.{model_name_diverted}`) """
evaluation_results_diverted = bq.query(evaluate_diverted_model_query).result().to_dataframe()
print("LOGISTIC_REG Model Evaluation Results:")
display(evaluation_results_diverted)

LOGISTIC_REG Model Evaluation Results:


Unnamed: 0,precision,recall,accuracy,f1_score,log_loss,roc_auc
0,0.0,0.0,0.998115,0.0,0.013002,0.820762


# **Adding Class Weights**

After looking at the confusion matrix and seeing the precision and recall values, I realized that something was off with my model. I realized that I never properly weighted my classes and that flight diversions must be a rare enough class that it's being under-represented and almost always missed. As a result, I decided to retrain the model with class weights.

# Task
Re-create the `LOGISTIC_REG` model with `AUTO_CLASS_WEIGHTS=TRUE` using `BQ.ML.CREATE_MODEL` to account for class imbalance, and then evaluate this new model to display its classification metrics.

## Re-create LOGISTIC_REG Model with Class Weights



In [14]:
model_name_diverted_weighted = "bqml_diverted_model_weighted"
create_model_query_diverted_weighted = f"""
CREATE OR REPLACE MODEL `{PROJECT_ID}.flights.{model_name_diverted_weighted}`
OPTIONS(
    model_type='LOGISTIC_REG',
    input_label_cols=['Diverted'],
    AUTO_CLASS_WEIGHTS=TRUE
)
AS
SELECT
    CAST(Diverted AS INT64) AS Diverted,
    DepDelay,
    Distance,
    Reporting_Airline,
    Origin,
    Dest,
    DayOfWeek
FROM
    `{TABLE_PATH}`
WHERE
    Diverted IS NOT NULL AND DepDelay IS NOT NULL
"""
bq.query(create_model_query_diverted_weighted).result()
print(f"BigQuery LOGISTIC_REG model '{model_name_diverted_weighted}' created successfully with class weights!")

BigQuery LOGISTIC_REG model 'bqml_diverted_model_weighted' created successfully with class weights!


In [15]:
evaluate_diverted_model_weighted_query = f"""SELECT * FROM ML.EVALUATE(MODEL `{PROJECT_ID}.flights.{model_name_diverted_weighted}`) """
evaluation_results_diverted_weighted = bq.query(evaluate_diverted_model_weighted_query).result().to_dataframe()
print("LOGISTIC_REG Model (with class weights) Evaluation Results:")
display(evaluation_results_diverted_weighted)

LOGISTIC_REG Model (with class weights) Evaluation Results:


Unnamed: 0,precision,recall,accuracy,f1_score,log_loss,roc_auc
0,0.004432,0.947368,0.59869,0.008824,0.693004,0.837579


## Interpret Tuned LOGISTIC_REG Model

### Subtask:
Analyze the evaluation results of the tuned model, comparing the new precision, recall, and other metrics to the previous model's performance, specifically noting any improvements for the 'Diverted' class due to `AUTO_CLASS_WEIGHTS`.


### Comparison of LOGISTIC_REG Model Performance (with and without `AUTO_CLASS_WEIGHTS`)

**Original Model (`bqml_diverted_model`):**
```
precision:  0.0
recall:     0.0
accuracy:   0.998115
f1_score:   0.0
log_loss:   0.013002
roc_auc:    0.820762
```

**Weighted Model (`bqml_diverted_model_weighted` with `AUTO_CLASS_WEIGHTS=TRUE`):**
```
precision:  0.004432
recall:     0.947368
accuracy:   0.59869
f1_score:   0.008824
log_loss:   0.693004
roc_auc:    0.837579
```

### Interpretation:
Since flight diversions are the extremely rare class the original model had an extremely high accuracy, but misses basically all diverted flights. The high accuracy is misleading because it's failing to capture our intended target at all. On the other hand, the weighted model catches almost all diverted flights but causes lots of false alarms.


### **Custom Threshold Rescoring (0.75)**

The assignment specifies to try a different threshold, so I used 0.75 as instructed here.

In [22]:
predict_with_threshold_original_query = f"""SELECT
    Diverted, -- Actual Diverted status
    DepDelay,
    Distance,
    Reporting_Airline,
    Origin,
    Dest,
    DayOfWeek,
    predicted_Diverted_probs,
    (SELECT prob FROM UNNEST(predicted_Diverted_probs) WHERE label = 1) AS probability_of_diverted,
    CASE
        WHEN (SELECT prob FROM UNNEST(predicted_Diverted_probs) WHERE label = 1) >= 0.75 THEN 1
        ELSE 0
    END AS custom_predicted_diverted
FROM ML.PREDICT(MODEL `{PROJECT_ID}.flights.{model_name_diverted}`,
  (SELECT
      CAST(Diverted AS INT64) AS Diverted,
      DepDelay,
      Distance,
      Reporting_Airline,
      Origin,
      Dest,
      DayOfWeek
  FROM
      `{TABLE_PATH}`
  WHERE
      Diverted IS NOT NULL AND DepDelay IS NOT NULL
  ))
"""

predictions_with_threshold_original_df = bq.query(predict_with_threshold_original_query).result().to_dataframe()
print("Predictions with custom threshold (0.75) for original model generated:")
display(predictions_with_threshold_original_df.head())

Predictions with custom threshold (0.75) for original model generated:


Unnamed: 0,Diverted,DepDelay,Distance,Reporting_Airline,Origin,Dest,DayOfWeek,predicted_Diverted_probs,probability_of_diverted,custom_predicted_diverted
0,0,61.0,692.0,EV,ATL,ABE,5,"[{'label': 1, 'prob': 0.002503823752033253}, {...",0.002504,0
1,0,-4.0,50.0,US,AVP,ABE,7,"[{'label': 1, 'prob': 0.001846150253096114}, {...",0.001846,0
2,0,32.0,481.0,US,CLT,ABE,7,"[{'label': 1, 'prob': 0.001993907255912106}, {...",0.001994,0
3,0,4.0,481.0,US,CLT,ABE,5,"[{'label': 1, 'prob': 0.0017626235731957444}, ...",0.001763,0
4,0,2.0,481.0,OH,CLT,ABE,1,"[{'label': 1, 'prob': 0.0018602431995923317}, ...",0.00186,0


## Evaluate Model with Custom Threshold (Original Model)

In [23]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Extract actual and custom predicted labels for the original model
actual_labels_original = predictions_with_threshold_original_df['Diverted']
predicted_labels_custom_threshold_original = predictions_with_threshold_original_df['custom_predicted_diverted']

# Calculate metrics
accuracy_original = accuracy_score(actual_labels_original, predicted_labels_custom_threshold_original)
conf_matrix_original = confusion_matrix(actual_labels_original, predicted_labels_custom_threshold_original)
class_report_original = classification_report(actual_labels_original, predicted_labels_custom_threshold_original, output_dict=True, zero_division=0)

print(f"Accuracy for Original Model with custom threshold (0.75): {accuracy_original:.4f}")
print("\nConfusion Matrix for Original Model (Actual vs. Predicted with custom threshold):\n", conf_matrix_original)
print("\nClassification Report for Original Model (custom threshold 0.75):\n")

# Print formatted classification report
print(f"{'':<10}{'Precision':<10}{'Recall':<10}{'F1-Score':<10}{'Support':<10}")
for label, metrics in class_report_original.items():
    if label.isdigit(): # For class 0 and 1
        print(f"{label:<10}{metrics['precision']:<10.4f}{metrics['recall']:<10.4f}{metrics['f1-score']:<10.4f}{metrics['support']:<10}")
    elif label in ['accuracy', 'macro avg', 'weighted avg']:
        if label == 'accuracy': # Accuracy is already printed, skip here to avoid redundancy
            continue
        print(f"{label:<10}{metrics['precision']:<10.4f}{metrics['recall']:<10.4f}{metrics['f1-score']:<10.4f}{metrics['support']:<10}")

Accuracy for Original Model with custom threshold (0.75): 0.9977

Confusion Matrix for Original Model (Actual vs. Predicted with custom threshold):
 [[1959338       4]
 [   4590       0]]

Classification Report for Original Model (custom threshold 0.75):

          Precision Recall    F1-Score  Support   
macro avg 0.4988    0.5000    0.4994    1963932.0 
weighted avg0.9953    0.9977    0.9965    1963932.0 


### Interpretation and Justification of Custom Threshold (0.75) for the Original Model

**Evaluation Results (Custom Threshold 0.75, Original Model):**

```
Accuracy: 0.9977

Confusion Matrix:
 [[1959338       4]
 [   4590       0]]

Classification Report:
          Precision Recall    F1-Score  Support   
macro avg 0.4988    0.5000    0.4994    1963932.0
weighted avg0.9953    0.9977    0.9965    1963932.0
```

**Components of the Confusion Matrix:**
*   **True Negatives (TN):** 1,959,338 (Correctly predicted as NOT diverted)
*   **False Positives (FP):** 4 (Incorrectly predicted as diverted)
*   **False Negatives (FN):** 4,590 (Actual diversions that were NOT predicted)
*   **True Positives (TP):** 0 (Correctly predicted as diverted)

**Comparison with Default Threshold (from `ML.EVALUATE` of Original Model):**

**Original Model (`bqml_diverted_model`, default threshold):**
```
precision:  0.0
recall:     0.0
accuracy:   0.998115
f1_score:   0.0
log_loss:   0.013002
roc_auc:    0.820762
```

# **Analysis of Changes with Custom Threshold (0.75) for Original Model:**

Our original model still fails to capture diverted flights even with the new threshold. This means our model is making very confident wrong predictions when a diversion is present, leading to potentially large business implications like high rescheduling costs and flight delays. This stands in contrast to my weighted model which catches the diversions, but at the cost of lots of false alarms. In this situation, I believe using the weighted model is a far superior decision. Having lots of false alarms will cost some money and efficiency, but being overprepared isn't a bad thing. The alternative is not being prepared for a diversion and having to spend exponentially more rushing to accomodate diverted passengers and paying fees to expedite the processes that being overprepared would've prevented.   

# **Feature Engineering**

# Task
Create a BigQuery `LOGISTIC_REG` model using `BQ.ML.CREATE_MODEL` that includes feature transformations within the `AS SELECT` statement, predicting `Diverted` based on a new `route` feature (origin-dest), the existing `DayOfWeek` feature, and a bucketized `DepDelay` feature ('early', 'on-time', 'minor', 'moderate', 'major'). Ensure `AUTO_CLASS_WEIGHTS` is not enabled, and then evaluate the model's performance using `ML.EVALUATE`.

## Define Feature Transformation Logic

### Subtask:
Define the SQL logic for creating the `route` feature, extracting `day_of_week` from `FlightDate`, and bucketizing `DepDelay` into 'early', 'on-time', 'minor', 'moderate', and 'major' categories.


In [24]:
feature_engineering_sql_logic = """
    CONCAT(Origin, '-', Dest) AS route,
    DayOfWeek,
    CASE
        WHEN DepDelay < 0 THEN 'early'
        WHEN DepDelay = 0 THEN 'on-time'
        WHEN DepDelay > 0 AND DepDelay <= 15 THEN 'minor'
        WHEN DepDelay > 15 AND DepDelay <= 60 THEN 'moderate'
        WHEN DepDelay > 60 THEN 'major'
        ELSE 'unknown' -- Or handle NULLs explicitly in WHERE clause
    END AS bucketized_depdelay
"""

print("Feature engineering SQL logic defined.")

Feature engineering SQL logic defined.


In [26]:
model_name_diverted_transformed = "bqml_diverted_model_transformed"
create_model_query_diverted_transformed = f"""
CREATE OR REPLACE MODEL `{PROJECT_ID}.flights.{model_name_diverted_transformed}`
OPTIONS(
    model_type='LOGISTIC_REG',
    input_label_cols=['Diverted']
)
AS
SELECT
    CAST(Diverted AS INT64) AS Diverted,
    {feature_engineering_sql_logic}
FROM
    `{TABLE_PATH}`
WHERE
    Diverted IS NOT NULL AND DepDelay IS NOT NULL
"""
bq.query(create_model_query_diverted_transformed).result()
print(f"BigQuery LOGISTIC_REG model '{model_name_diverted_transformed}' created successfully with feature engineering!")

BigQuery LOGISTIC_REG model 'bqml_diverted_model_transformed' created successfully with feature engineering!


**Reasoning**:
Now that the model with feature engineering has been successfully created, the next step is to evaluate its performance using `ML.EVALUATE` to display its classification metrics.



In [27]:
evaluate_diverted_model_transformed_query = f"""SELECT * FROM ML.EVALUATE(MODEL `{PROJECT_ID}.flights.{model_name_diverted_transformed}`) """
evaluation_results_diverted_transformed = bq.query(evaluate_diverted_model_transformed_query).result().to_dataframe()
print("LOGISTIC_REG Model (with feature engineering) Evaluation Results:")
display(evaluation_results_diverted_transformed)

LOGISTIC_REG Model (with feature engineering) Evaluation Results:


Unnamed: 0,precision,recall,accuracy,f1_score,log_loss,roc_auc
0,0.004213,0.714286,0.64635,0.008378,0.692323,0.722672


## Interpret Transformed Model Results

### Subtask:
Analyze the evaluation results of the transformed model, comparing its precision, recall, and other metrics to the initial original model and the weighted model, to understand the impact of transformations alone on the model's ability to predict diverted flights.


# **Comparison of LOGISTIC_REG Model Performance with Feature Engineering**

**Original Model (`bqml_diverted_model`):**
```
precision:  0.0
recall:     0.0
accuracy:   0.998115
f1_score:   0.0
log_loss:   0.013002
roc_auc:    0.820762
```

**Weighted Model (`bqml_diverted_model_weighted` with `AUTO_CLASS_WEIGHTS=TRUE`):**
```
precision:  0.004432
recall:     0.947368
accuracy:   0.59869
f1_score:   0.008824
log_loss:   0.693004
roc_auc:    0.837579
```

**Transformed Model (`bqml_diverted_model_transformed` with Feature Engineering):**
```
precision:  0.004213
recall:     0.714286
accuracy:   0.64635
f1_score:   0.008378
log_loss:   0.692323
roc_auc:    0.722672
```

# **Interpretation:**

The transformed model does a significantly better job of identifying diverted flights compared to the base model, but still falls short of my individually created weighted model. The transformed model has a lower accuracy compared to the original model as well, but this is to be expected when we're trading it for increased recall. False positives are also very prevelant in the transformed model. Overall, the model does a far better job of capturing the rare class and is an improvement, but I believe my own weighted model serves the business better in terms of reducing risks and costs associated with diversions.

# **Cost & Scale**

# Task
Explain the benefits of using a `LIMIT` clause for faster model development (sampling) compared to training on the full dataset for a final model.

## Explain LIMIT vs. Full Data Approach

### Subtask:
Provide a markdown explanation of why and when to use a `LIMIT` clause for faster development iterations (sampling) versus training on the full dataset for a final model.


## Create Sampled Model (LIMIT)

### Subtask:
Construct and execute a BigQuery SQL query to create a `LOGISTIC_REG` model with the previously defined feature engineering (route, DayOfWeek, bucketized_depdelay). This model will use a `LIMIT` clause (e.g., LIMIT 100000) on the input data to simulate a development iteration and will NOT use `AUTO_CLASS_WEIGHTS`.


In [28]:
model_name_diverted_sampled = "bqml_diverted_model_sampled"
create_model_query_diverted_sampled = f"""
CREATE OR REPLACE MODEL `{PROJECT_ID}.flights.{model_name_diverted_sampled}`
OPTIONS(
    model_type='LOGISTIC_REG',
    input_label_cols=['Diverted']
)
AS
SELECT
    CAST(Diverted AS INT64) AS Diverted,
    {feature_engineering_sql_logic}
FROM
    `{TABLE_PATH}`
WHERE
    Diverted IS NOT NULL AND DepDelay IS NOT NULL
LIMIT 100000
"""
bq.query(create_model_query_diverted_sampled).result()
print(f"BigQuery LOGISTIC_REG model '{model_name_diverted_sampled}' created successfully with feature engineering and sampling!")

BigQuery LOGISTIC_REG model 'bqml_diverted_model_sampled' created successfully with feature engineering and sampling!


## Evaluate Sampled Model

### Subtask:
Generate and execute a BigQuery SQL query to evaluate the performance of the sampled `LOGISTIC_REG` model using `ML.EVALUATE`. Retrieve and display its classification metrics.


In [29]:
evaluate_diverted_model_sampled_query = f"SELECT * FROM ML.EVALUATE(MODEL `{PROJECT_ID}.flights.{model_name_diverted_sampled}`) "
evaluation_results_diverted_sampled = bq.query(evaluate_diverted_model_sampled_query).result().to_dataframe()
print("LOGISTIC_REG Model (with feature engineering and sampling) Evaluation Results:")
display(evaluation_results_diverted_sampled)

LOGISTIC_REG Model (with feature engineering and sampling) Evaluation Results:


Unnamed: 0,precision,recall,accuracy,f1_score,log_loss,roc_auc
0,0.0,0.0,0.997507,0.0,0.018218,0.585222


## Compare Sampled vs. Full Data Model

### Subtask:
Compare the evaluation results of the newly created sampled model with the previously trained 'Transformed Model' (which was trained on the full dataset without class weights). Discuss the differences in performance metrics, training time, and resource usage, explaining the trade-offs.


### Comparison of Sampled Model vs. Full Data Transformed Model

**Sampled Model (`bqml_diverted_model_sampled` with `LIMIT 100000` and Feature Engineering):**
```
precision:  0.0
recall:     0.0
accuracy:   0.997507
f1_score:   0.0
log_loss:   0.018218
roc_auc:    0.585222
```

**Full Data Transformed Model (`bqml_diverted_model_transformed` with Feature Engineering, NO `AUTO_CLASS_WEIGHTS`):**
```
precision:  0.004213
recall:     0.714286
accuracy:   0.64635
f1_score:   0.008378
log_loss:   0.692323
roc_auc:    0.722672
```

# **Discussion of Differences and Trade-offs:**

The main difference between the sample and the full run is that the sample fails to capture the flight diversions. Due to the sample only containing a fraction of the data and the diversion class being extremely rare, it displays that there is not enough of the rare class present in the sample to properly train the model to catch it. It displays how sampling is a viable tactic when there is there is a solid amount of the target class present within the dataset that the sample can contain, but very poor when it's extremely rare. Doing full runs is far better for these types of problems and will yield better results for the business.