In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mexwell/carrier-on-time-performance-dataset")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'carrier-on-time-performance-dataset' dataset.
Path to dataset files: /kaggle/input/carrier-on-time-performance-dataset


In [None]:
import os
from google.cloud import bigquery
from google.colab import auth

auth.authenticate_user()

PROJECT_ID = "sapient-office-471119-g4"      # e.g., mgmt-467-47888
REGION     = "us-central1" # Updated to match the location of unit2_flights_kaggle
TABLE_PATH = "sapient-office-471119-g4.unit2_flights_kaggle.flights_from_kaggle"   # Updated to use the Kaggle data in BQ

os.environ["PROJECT_ID"] = PROJECT_ID
os.environ["REGION"]     = REGION
bq = bigquery.Client(project=PROJECT_ID, location=REGION)

# Ensure the dataset for models exists, explicitly setting location via client API
MODEL_DATASET_ID = "unit2_flights"
model_dataset_ref = bq.dataset(MODEL_DATASET_ID)
model_dataset = bigquery.Dataset(model_dataset_ref)
model_dataset.location = REGION

try:
    bq.get_dataset(model_dataset)  # Check if dataset exists
    print(f"Schema '{PROJECT_ID}.{MODEL_DATASET_ID}' already exists in {REGION}.")
except Exception:
    bq.create_dataset(model_dataset, timeout=30)  # Create dataset
    print(f"Schema '{PROJECT_ID}.{MODEL_DATASET_ID}' created in {REGION}.")

print("BQ Project:", PROJECT_ID)
print("Source table:", TABLE_PATH)

Schema 'sapient-office-471119-g4.unit2_flights' already exists in us-central1.
BQ Project: sapient-office-471119-g4
Source table: sapient-office-471119-g4.unit2_flights_kaggle.flights_from_kaggle


In [None]:
import os

# List all files and directories in the Kaggle download path
print(f"Contents of {path}:")
for root, dirs, files in os.walk(path):
    for name in files:
        print(os.path.join(root, name))
    for name in dirs:
        print(os.path.join(root, name) + '/')


Contents of /kaggle/input/carrier-on-time-performance-dataset:
/kaggle/input/carrier-on-time-performance-dataset/airline_2m.csv


### Step 1: Load Kaggle Dataset into Pandas DataFrame
First, we'll load the flight data from the downloaded Kaggle dataset into a Pandas DataFrame. We'll assume the main data file is named `flights.csv` within the Kaggle download directory.

In [None]:
import pandas as pd
import os
from google.cloud import bigquery

# 'path' variable is from the kagglehub.dataset_download cell
kaggle_csv_file = os.path.join(path, 'airline_2m.csv') # Corrected filename

try:
    # Read the CSV into a Pandas DataFrame, trying 'latin1' encoding
    df_kaggle = pd.read_csv(kaggle_csv_file, encoding='latin1')
    print(f"Successfully loaded {kaggle_csv_file} into a Pandas DataFrame.")
    display(df_kaggle.head())

    # Define the target BigQuery dataset and table
    NEW_BQ_DATASET_ID = f"{PROJECT_ID}.unit2_flights_kaggle"
    NEW_BQ_TABLE_ID = "flights_from_kaggle"
    NEW_FULL_TABLE_PATH = f"{NEW_BQ_DATASET_ID}.{NEW_BQ_TABLE_ID}"

    # Initialize BigQuery client
    client = bigquery.Client(project=PROJECT_ID)

    # Create dataset if it doesn't exist
    try:
        client.get_dataset(NEW_BQ_DATASET_ID.split('.')[-1])  # API request to check if dataset exists
        print(f"BigQuery Dataset {NEW_BQ_DATASET_ID} already exists.")
    except Exception:
        dataset = bigquery.Dataset(NEW_BQ_DATASET_ID)  # Construct BigQuery dataset object
        dataset.location = REGION  # Set the location (from notebook variable)
        client.create_dataset(dataset, timeout=30)  # Make API request to create dataset
        print(f"Created BigQuery dataset {client.project}.{dataset.dataset_id}")

    # Upload DataFrame to BigQuery table
    print(f"Uploading data to {NEW_FULL_TABLE_PATH}...")
    # Use if_exists='replace' for development, consider 'append' or 'fail' for production
    df_kaggle.to_gbq(NEW_FULL_TABLE_PATH, project_id=PROJECT_ID, if_exists='replace')
    print(f"Successfully uploaded data to {NEW_FULL_TABLE_PATH}.")

    # Verify by querying a few rows from the newly created BigQuery table
    print("Previewing data from new BigQuery table:")
    preview_new_table_sql = f"SELECT * FROM `{NEW_FULL_TABLE_PATH}` LIMIT 5"
    display(client.query(preview_new_table_sql).result().to_dataframe())

    print(f"\n--- IMPORTANT: New BigQuery Table Path ---\n{NEW_FULL_TABLE_PATH}\n-----------------------------------------")
    print(f"Please manually update the `TABLE_PATH` variable in cell `3c294930` to: '{NEW_FULL_TABLE_PATH}' and re-run that cell, then continue with the notebook.")

except FileNotFoundError:
    print(f"Error: The file '{kaggle_csv_file}' was not found. Please verify the file name and path within the Kaggle download. You might need to list files in `path` to find the correct CSV name.")
except Exception as e:
    print(f"An error occurred during the process: {e}")

  df_kaggle = pd.read_csv(kaggle_csv_file, encoding='latin1')


Successfully loaded /kaggle/input/carrier-on-time-performance-dataset/airline_2m.csv into a Pandas DataFrame.


Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,...,Div4WheelsOff,Div4TailNum,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum
0,1998,1,1,2,5,1998-01-02,NW,19386,NW,N297US,...,,,,,,,,,,
1,2009,2,5,28,4,2009-05-28,FL,20437,FL,N946AT,...,,,,,,,,,,
2,2013,2,6,29,6,2013-06-29,MQ,20398,MQ,N665MQ,...,,,,,,,,,,
3,2010,3,8,31,2,2010-08-31,DL,19790,DL,N6705Y,...,,,,,,,,,,
4,2006,1,1,15,7,2006-01-15,US,20355,US,N504AU,...,,,,,,,,,,


BigQuery Dataset sapient-office-471119-g4.unit2_flights_kaggle already exists.
Uploading data to sapient-office-471119-g4.unit2_flights_kaggle.flights_from_kaggle...


  df_kaggle.to_gbq(NEW_FULL_TABLE_PATH, project_id=PROJECT_ID, if_exists='replace')
100%|██████████| 1/1 [00:00<00:00, 6775.94it/s]


Successfully uploaded data to sapient-office-471119-g4.unit2_flights_kaggle.flights_from_kaggle.
Previewing data from new BigQuery table:


Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,...,Div4WheelsOff,Div4TailNum,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum
0,1999,3,7,7,3,1999-07-07,DL,19790,DL,N402DA,...,,,,,,,,,,
1,1996,1,2,7,3,1996-02-07,AS,19930,AS,N744AS,...,,,,,,,,,,
2,1990,2,6,18,1,1990-06-18,DL,19790,DL,,...,,,,,,,,,,
3,2019,1,1,21,1,2019-01-21,G4,20368,G4,301NV,...,,,,,,,,,,
4,2018,1,2,22,4,2018-02-22,HA,19690,HA,N492HA,...,,,,,,,,,,



--- IMPORTANT: New BigQuery Table Path ---
sapient-office-471119-g4.unit2_flights_kaggle.flights_from_kaggle
-----------------------------------------
Please manually update the `TABLE_PATH` variable in cell `3c294930` to: 'sapient-office-471119-g4.unit2_flights_kaggle.flights_from_kaggle' and re-run that cell, then continue with the notebook.


### Quick sanity check

In [None]:
preview_sql = f"SELECT * FROM `{TABLE_PATH}` LIMIT 5"
bq.query(preview_sql).result().to_dataframe()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,...,Div4WheelsOff,Div4TailNum,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum
0,1999,3,7,7,3,1999-07-07,DL,19790,DL,N402DA,...,,,,,,,,,,
1,1996,1,2,7,3,1996-02-07,AS,19930,AS,N744AS,...,,,,,,,,,,
2,1990,2,6,18,1,1990-06-18,DL,19790,DL,,...,,,,,,,,,,
3,2019,1,1,21,1,2019-01-21,G4,20368,G4,301NV,...,,,,,,,,,,
4,2018,1,2,22,4,2018-02-22,HA,19690,HA,N492HA,...,,,,,,,,,,



## 1) Canonical mapping (adjust as needed)
Map to a minimal schema used in the rest of the notebook:
- `flight_date` (DATE), `dep_delay` (NUM), `distance` (NUM), `carrier` (STRING), `origin` (STRING), `dest` (STRING), `diverted` (BOOL)


In [None]:

# Adjust ONLY if your table uses different column names.
CANONICAL_BASE_SQL = f'''
WITH canonical_flights AS (
  SELECT
    CAST(FlightDate AS DATE) AS flight_date, # Corrected: removed 'date' from COALESCE
    CAST(DepDelay AS FLOAT64) AS dep_delay,
    CAST(distance  AS FLOAT64) AS distance,
    CAST(Reporting_Airline   AS STRING)  AS carrier, # Corrected: mapped to Reporting_Airline
    CAST(origin    AS STRING)  AS origin,
    CAST(dest AS STRING) AS dest, # Corrected: removed 'destination' from COALESCE
    CAST((CASE WHEN SAFE_CAST(diverted AS INT64)=1 OR LOWER(CAST(diverted AS STRING))='true' THEN TRUE ELSE FALSE END) AS BOOL) AS diverted
  FROM `{TABLE_PATH}`
  WHERE DepDelay IS NOT NULL
)
'''
print(CANONICAL_BASE_SQL[:600] + "\n...")


WITH canonical_flights AS (
  SELECT
    CAST(FlightDate AS DATE) AS flight_date, # Corrected: removed 'date' from COALESCE
    CAST(DepDelay AS FLOAT64) AS dep_delay,
    CAST(distance  AS FLOAT64) AS distance,
    CAST(Reporting_Airline   AS STRING)  AS carrier, # Corrected: mapped to Reporting_Airline
    CAST(origin    AS STRING)  AS origin,
    CAST(dest AS STRING) AS dest, # Corrected: removed 'destination' from COALESCE
    CAST((CASE WHEN SAFE_CAST(diverted AS INT64)=1 OR LOWER(CAST(diverted AS STRING))='true' THEN TRUE ELSE FALSE END) AS BOOL) AS diverted
  FROM `sapient-office-47111
...


### 2) Split (80/20)

In [None]:

SPLIT_CLAUSE = r'''
, split AS (
  SELECT cf.*,
         CASE WHEN RAND() < 0.8 THEN 'TRAIN' ELSE 'EVAL' END AS data_split
  FROM canonical_flights cf
)
'''
print(SPLIT_CLAUSE)



, split AS (
  SELECT cf.*,
         CASE WHEN RAND() < 0.8 THEN 'TRAIN' ELSE 'EVAL' END AS data_split
  FROM canonical_flights cf
)




## 3) Baseline model — LOGISTIC_REG (`diverted`)
Use **only** a small set of signals for the baseline (keep it honest).


In [None]:
client = bq.Client(project=PROJECT_ID)

print(f"Listing datasets in project: {PROJECT_ID}")
datasets = list(client.list_datasets())

if datasets:
    for dataset in datasets:
        # Fetch dataset properties to get the location
        full_dataset = client.get_dataset(dataset.reference)
        print(f"Dataset ID: {full_dataset.dataset_id}, Location: {full_dataset.location}")
else:
    print(f"No datasets found in project {PROJECT_ID}.")


Listing datasets in project: sapient-office-471119-g4
Dataset ID: churn_dataset, Location: US
Dataset ID: churn_modeling, Location: US
Dataset ID: netflix, Location: US
Dataset ID: superstore_data, Location: US
Dataset ID: unit2_flights, Location: US
Dataset ID: unit2_flights_kaggle, Location: us-central1


In [None]:

MODEL_BASE = f"{PROJECT_ID}.unit2_flights_kaggle.clf_diverted_base"

# 1. Create Schema (if not exists) - REMOVED from here, now in 3c294930
# schema_sql = f"CREATE SCHEMA IF NOT EXISTS `{PROJECT_ID}.unit2_flights`;"
# job = bq.query(schema_sql); _ = job.result()
# print(f"Schema '{PROJECT_ID}.unit2_flights' ensured.")

# Re-initialize bq client to ensure it recognizes the new dataset - Removed this line to rely on the global bq client from 3c294930
# bq = bigquery.Client(project=PROJECT_ID, location=REGION)

# 2. Create or Replace Model
create_model_sql = f'''
CREATE OR REPLACE MODEL `{MODEL_BASE}`
OPTIONS (MODEL_TYPE='LOGISTIC_REG', INPUT_LABEL_COLS=['diverted']) AS
{CANONICAL_BASE_SQL}
{SPLIT_CLAUSE}
SELECT
  diverted,
  dep_delay, distance, carrier, origin, dest,
  EXTRACT(DAYOFWEEK FROM flight_date) AS day_of_week
FROM split
WHERE data_split='TRAIN'
;'''
job = bq.query(create_model_sql, location=REGION); _ = job.result()
print("Baseline model trained:", MODEL_BASE)

# 3. Evaluate Model
evaluate_model_sql = f'''
SELECT * FROM ML.EVALUATE(
  MODEL `{MODEL_BASE}`,
  ({CANONICAL_BASE_SQL}
   {SPLIT_CLAUSE}
   SELECT
     diverted,
     dep_delay, distance, carrier, origin, dest,
     EXTRACT(DAYOFWEEK FROM flight_date) AS day_of_week
   FROM split WHERE data_split='EVAL')
);
'''
df_baseline_eval = bq.query(evaluate_model_sql, location=REGION).result().to_dataframe()
display(df_baseline_eval)


Baseline model trained: sapient-office-471119-g4.unit2_flights_kaggle.clf_diverted_base


Unnamed: 0,precision,recall,accuracy,f1_score,log_loss,roc_auc
0,0.0,0.0,0.997712,0.0,0.015425,0.717939


### Confusion matrix — your custom threshold

In [None]:

CUSTOM_THRESHOLD = 0.75  # TODO: justify in ops terms

cm_thresh_sql = f'''
{CANONICAL_BASE_SQL}
{SPLIT_CLAUSE}

WITH scored AS (
  SELECT
    cf.diverted AS label,
    CAST(score >= {CUSTOM_THRESHOLD} AS BOOL) AS pred_label
  FROM (
    SELECT cf.*, p.predicted_diverted_probs[OFFSET(0)].prob AS score
    FROM split cf
    JOIN ML.PREDICT(MODEL `{MODEL_BASE}`,
          (SELECT dep_delay, distance, carrier, origin, dest, EXTRACT(DAYOFWEEK FROM flight_date) AS day_of_week
           FROM split WHERE data_split='EVAL')) AS p # Updated WHERE clause
    ON TRUE
    WHERE cf.data_split='EVAL' # Updated WHERE clause
  )
)
SELECT
  SUM(CASE WHEN label=TRUE  AND pred_label=TRUE  THEN 1 ELSE 0 END) AS TP,
  SUM(CASE WHEN label=FALSE AND pred_label=TRUE  THEN 1 ELSE 0 END) AS FP,
  SUM(CASE WHEN label=TRUE  AND pred_label=FALSE THEN 1 ELSE 0 END) AS FN,
  SUM(CASE WHEN label=FALSE AND pred_label=FALSE THEN 1 ELSE 0 END) AS TN
FROM scored;
'''
bq.query(cm_thresh_sql, location=REGION).result().to_dataframe() # Added location



## 4) Engineered model — `TRANSFORM` (same label, stricter bar)
Create **route**, extract **day_of_week**, and **bucketize dep_delay**. Compare metrics to baseline.


In [None]:

MODEL_XFORM = f"{PROJECT_ID}.unit2_flights.clf_diverted_xform"

sql_xform = f'''
{CANONICAL_BASE_SQL}
{SPLIT_CLAUSE}

CREATE OR REPLACE MODEL `{MODEL_XFORM}`
TRANSFORM (
  CONCAT(origin, '-', dest) AS route,
  EXTRACT(DAYOFWEEK FROM flight_date) AS day_of_week,
  CASE
    WHEN dep_delay < -5  THEN 'early'
    WHEN dep_delay <=  5 THEN 'on_time'
    WHEN dep_delay <= 15 THEN 'minor'
    WHEN dep_delay <= 45 THEN 'moderate'
    ELSE 'major'
  END AS dep_delay_bucket,
  dep_delay, distance, carrier, origin, dest
)
OPTIONS (MODEL_TYPE='LOGISTIC_REG', INPUT_LABEL_COLS=['diverted']) AS
SELECT * FROM split WHERE data_split='TRAIN' # Updated WHERE clause
;

SELECT 'baseline' AS model_version, * FROM ML.EVALUATE(
  MODEL `{MODEL_BASE}`,
  ({CANONICAL_BASE_SQL}
   {SPLIT_CLAUSE}
   SELECT
     diverted,
     dep_delay, distance, carrier, origin, dest,
     EXTRACT(DAYOFWEEK FROM flight_date) AS day_of_week
   FROM split WHERE data_split='EVAL') # Updated WHERE clause
)
UNION ALL
SELECT 'engineered' AS model_version, * FROM ML.EVALUATE(
  MODEL `{MODEL_XFORM}`,
  ({CANONICAL_BASE_SQL}
   {SPLIT_CLAUSE}
   SELECT * FROM split WHERE data_split='EVAL') # Updated WHERE clause
);
'''
job = bq.query(sql_xform, location=REGION); _ = job.result()
print("Engineered model trained:", MODEL_XFORM)



### Write-up (concise)
- **Threshold chosen & ops rationale:** …  
- **Baseline vs engineered — observed changes in AUC/precision/recall:** …  
- **Risk framing:** cost of FP vs FN for diversion planning; what is your acceptable FN-rate? …


Accuracy: The model shows a very high accuracy (0.997712). However, this is likely misleading given the highly imbalanced nature of 'diverted' flights (very few flights are actually diverted). If only a tiny fraction of flights are diverted, a model that always predicts 'not diverted' would achieve high accuracy.
Precision (0.0) and Recall (0.0): These metrics are both 0.0, which indicates that the model is performing very poorly for the positive class (diverted). It means the model is either not identifying any diverted flights (recall) or when it does, those predictions are incorrect (precision), or both. This suggests that the model, in its current form, is not effectively predicting diverted flights.
ROC AUC (0.717939): An AUC of ~0.72 suggests the model has some ability to distinguish between positive and negative classes, which is better than random (0.5). However, the precision and recall indicate that this discriminative power isn't translating into good predictions at the default classification threshold.
Log Loss (0.015425): This is relatively low, which again points to the model being confident about its predictions, but combined with low precision/recall, reinforces the idea that it's confidently wrong about the positive class or simply not predicting it.