In [1]:
# --- Minimal setup (edit 3 vars) ---
from google.colab import auth
auth.authenticate_user()

import os
from google.cloud import bigquery

PROJECT_ID = "mgmt-467-project-1"      # e.g., mgmt-467-47888
REGION     = "us-central1"
TABLE_PATH = "mgmt-467-project-1.flights.kaggle_flight_data"   # or your `bigquery-public-data.flights` table/view

os.environ["PROJECT_ID"] = PROJECT_ID
os.environ["REGION"]     = REGION
bq = bigquery.Client(project=PROJECT_ID)

print("BQ Project:", PROJECT_ID)
print("Source table:", TABLE_PATH)


BQ Project: mgmt-467-project-1
Source table: mgmt-467-project-1.flights.kaggle_flight_data


In [2]:
preview_sql = f"SELECT * FROM `{TABLE_PATH}` LIMIT 5"
bq.query(preview_sql).result().to_dataframe()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,...,Div4WheelsOff,Div4TailNum,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum
0,1998,2,4,21,2,1998-04-21,DL,19790,DL,N334DL,...,,,,,,,,,,
1,1992,4,10,19,1,1992-10-19,DL,19790,DL,,...,,,,,,,,,,
2,2000,4,10,13,5,2000-10-13,DL,19790,DL,N225DL,...,,,,,,,,,,
3,1996,3,9,29,7,1996-09-29,DL,19790,DL,N331DL,...,,,,,,,,,,
4,1996,4,10,30,3,1996-10-30,DL,19790,DL,N236WA,...,,,,,,,,,,


# **Linear Regression Model**

# Task
Create a BigQuery `LINEAR_REG` model using `BQ.ML.CREATE_MODEL` to predict `ArrDelay` based on `DepDelay`, `Distance`, `Reporting_Airline`, `Origin`, `Dest`, and `DayOfWeek` from the `mgmt-467-project-1.flights.kaggle_flight_data` table.

## Prepare Training Data

### Subtask:
Construct a SQL query to select the necessary features and target variable ('ArrDelay') from the `kaggle_flight_data` table, handling any missing values or data types as needed for model training.


In [3]:
query = f"""SELECT ArrDelay, DepDelay, Distance, Reporting_Airline, Origin, Dest, DayOfWeek FROM `{TABLE_PATH}` WHERE ArrDelay IS NOT NULL AND DepDelay IS NOT NULL"""
df = bq.query(query).result().to_dataframe()
print(f"Shape of the prepared data: {df.shape}")
df.head()

Shape of the prepared data: (1958862, 7)


Unnamed: 0,ArrDelay,DepDelay,Distance,Reporting_Airline,Origin,Dest,DayOfWeek
0,19.0,0.0,692.0,DL,ATL,ABE,2
1,0.0,4.0,692.0,DL,ATL,ABE,7
2,7.0,-2.0,692.0,DL,ATL,ABE,7
3,17.0,-3.0,692.0,EV,ATL,ABE,6
4,-15.0,0.0,692.0,OH,ATL,ABE,3


## Create LINEAR_REG Model

### Subtask:
Generate and execute a BigQuery SQL query to create a `LINEAR_REG` model using `BQ.ML.CREATE_MODEL`. The model will predict 'ArrDelay' based on the features: 'DepDelay', 'Distance', 'Reporting_Airline', 'Origin', 'Dest', and 'DayOfWeek'.


In [4]:
model_name = "bqml_arrdelay_model"
create_model_query = f"""
CREATE OR REPLACE MODEL `{PROJECT_ID}.flights.{model_name}`
OPTIONS(
    model_type='LINEAR_REG',
    input_label_cols=['ArrDelay']
)
AS
SELECT
    ArrDelay,
    DepDelay,
    Distance,
    Reporting_Airline,
    Origin,
    Dest,
    DayOfWeek
FROM
    `{TABLE_PATH}`
WHERE
    ArrDelay IS NOT NULL AND DepDelay IS NOT NULL
"""
bq.query(create_model_query).result()
print(f"BigQuery LINEAR_REG model '{model_name}' created successfully!")

BigQuery LINEAR_REG model 'bqml_arrdelay_model' created successfully!


### Evaluate the Model with ML.EVALUATE

In [5]:
evaluate_model_query = f"""SELECT * FROM ML.EVALUATE(MODEL `{PROJECT_ID}.flights.{model_name}`) """
evaluation_results = bq.query(evaluate_model_query).result().to_dataframe()
print("Model evaluation results:")
display(evaluation_results)

Model evaluation results:


Unnamed: 0,mean_absolute_error,mean_squared_error,mean_squared_log_error,median_absolute_error,r2_score,explained_variance
0,9.17431,568.936212,1.41345,6.467743,0.497171,0.497209


Our model's predictions, on average, are off by roughly 9 minutes. This is a good indicator for what *can* be possible when predicting the delay, but the MAE is sensitive to a couple really bad predictions and outliers. The median absolute error shows the normal delay for a 'typical' flight because it's less sensitive to those same bad predictions and outliers. This serves to inform airport staff that they should be ready at least 9 minutes before the flight is scheduled to arrive, but that they should be absolutely be ready for 'typical' flights without abundant external factors 6.5 minutes early.