# Step 1: Read Model Training Data

In [21]:
import pandas as pd
import boto3
import s3fs

# Set your bucket and file path
bucket_name = 'college-of-ai'
file_key = 'MORTGAGE_LENDING_DEMO_DATA.csv'

# Read CSV directly from S3
df = pd.read_csv(f's3://{bucket_name}/{file_key}')
df.head()

Unnamed: 0,LOAN_ID,TS,LOAN_TYPE_NAME,LOAN_PURPOSE_NAME,APPLICANT_INCOME_000S,LOAN_AMOUNT_000S,COUNTY_NAME,MORTGAGERESPONSE
0,225846,2024-08-09 23:51:21.600,VA-guaranteed,Refinancing,,160,Erie County,1
1,298793,2024-02-15 10:42:48.960,VA-guaranteed,Refinancing,109.0,255,Erie County,1
2,456295,2024-05-17 06:29:48.480,Conventional,Home purchase,283.0,392,Westchester County,1
3,376334,2024-06-21 11:55:14.880,FHA-insured,Refinancing,43.0,173,Albany County,0
4,216409,2024-10-03 17:14:38.400,Conventional,Refinancing,209.0,255,Kings County,1


# Step 2: Import Libraries & Prepare Data

In [22]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Drop rows with missing target or required numeric/categorical values
df = df.dropna(subset=[
    "MORTGAGERESPONSE", # <-- This is your target variable?
    "APPLICANT_INCOME_000S",
    "LOAN_AMOUNT_000S",
    "LOAN_TYPE_NAME", # <-- This is a categorical column
    "LOAN_PURPOSE_NAME", # <-- This is a categorical column
    "COUNTY_NAME" # <-- This is a categorical column
])

# One-hot encode selected categorical columns
categorical_cols = ["LOAN_TYPE_NAME", "LOAN_PURPOSE_NAME", "COUNTY_NAME"] # <-- Add the remaining categorical columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Filter to numeric columns only (drop anything suspicious like strings/dates)
X = df_encoded.drop(columns=["MORTGAGERESPONSE"]) # <-- Which column is the target variable?
X = X.select_dtypes(include=["number"]).copy()

# Drop rows with NaNs if any remain
X = X.dropna()
y = df_encoded.loc[X.index, "MORTGAGERESPONSE"] # <-- Match this to your target variable

# Step 3: Split into Train, Validation, and Holdout Sets

In [23]:
# First split: 80% temp (train + val), 20% holdout (0.2 is your first test_size)
X_temp, X_holdout, y_temp, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Second split: 75% train, 25% val (0.25 is your second test_size)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp)

print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Holdout: {len(X_holdout)}")

Train: 189447, Validation: 63149, Holdout: 63150


# Step 4: Train XGBoost Model using 'logloss' as the evaluation metric

In [24]:
# 'logloss' penalizes incorrect predictions with high confidence—ideal for binary classification.
model = XGBClassifier(eval_metric='logloss')
model.fit(X_train, y_train)

# Step 5: Evaluate Model Performance on the Validation Set

In [25]:
# Use the trained XGBoost model to generate predictions on the validation set.
# These predictions will help us assess how well the model generalizes to unseen data.
y_pred = model.predict(X_val)

# Print a detailed classification report including precision, recall, F1-score, and support.
# This gives us insight into the model’s ability to correctly classify each class.
from sklearn.metrics import classification_report, confusion_matrix

print("Classification Report:\n", classification_report(y_val, y_pred))

# Print the confusion matrix to visualize the number of true positives, false positives,
# true negatives, and false negatives. This helps understand model performance at a glance.
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))

# NOTE: We will also precompute and store these metrics in Step 9 when registering the model.

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98     14590
           1       0.99      1.00      0.99     48559

    accuracy                           0.99     63149
   macro avg       0.99      0.99      0.99     63149
weighted avg       0.99      0.99      0.99     63149

Confusion Matrix:
 [[14219   371]
 [  211 48348]]


# Step 6: Install Required Snowflake Python Libraries

In [26]:
# Install the Snowflake ML Python library
# NOTE: You may see some warnings about other packages (e.g., dask, jupyter-ai). 
# These are safe to ignore for this lab and won't impact functionality.
!pip install snowflake-ml-python --quiet

In [27]:
# Install the Snowpark Python library.
# This is used to connect to Snowflake and work with data using Python DataFrame operations.
!pip install snowflake-snowpark-python --quiet

In [28]:
# Install required libraries:
# - `toml` is used to read the Snowflake connection info from a `connections.toml` file.
# - `snowflake-ml-python` provides tools for model training, registration, and scoring in Snowflake.
!pip install toml snowflake-ml-python --quiet

# Step 7: Allow SageMaker to Access Snowflake
In order for your SageMaker instance to connect to Snowflake, you must **add the current public IP address** of this notebook to your Snowflake network policy.

### Step 7A: Get the Public IP of this SageMaker Instance

In [29]:
# Run the following cell in your notebook:
!curl ifconfig.me

52.27.224.182

### Step 7B: Add the IP to Your Snowflake Network Policy

Copy the Step 7A output (ex: `52.183.42.53`) and update your Snowflake network policy by running the following SQL in Snowsight (you must have `ACCOUNTADMIN` privileges):

#### Option 1: Create a new network policy (recommended if not already created)
```sql
-- Create it once
CREATE OR REPLACE NETWORK POLICY ALLOW_SAGEMAKER
  ALLOWED_IP_LIST = ('52.27.224.182')
  COMMENT = 'Restrict access to SageMaker IPs for MLOps HOL';
-- Assign to service user only
ALTER USER mlops_user SET NETWORK_POLICY = ALLOW_SAGEMAKER;
-- Verify assignment
DESC USER mlops_user;
```
#### Option 2: Append to an existing IP allowlist (preserve existing IPs)
```sql
-- If you already have other IPs in the list, you can append your SageMaker IP like this:
ALTER NETWORK POLICY ALLOW_SAGEMAKER SET ALLOWED_IP_LIST = (
  'existing.ip.1',
  'existing.ip.2',
  '52.183.42.53'
);
```

# Step 8: Connect to Snowflake from SageMaker using a .toml File

This notebook uses a local .toml file to securely store your Snowflake connection parameters.

### Step 8A: Create a connections.toml file
Create a connections.toml file locally with the following structure:
```
[connections.snowflake_conn]
account = "your_account"
user = "mlops_user"
role = "aicollege"
warehouse = "aicollege"
database = "aicollege"
schema = "public"
authenticator = "snowflake_jwt"
```
💡 Replace the values with your actual Snowflake connection details.

📝 Reminder: After updating the connections.toml file with your Snowflake account details, make sure to save the file before running the next step.
Otherwise, your notebook won’t be able to read the correct connection settings.

### Step 8B: Upload the `.toml` and `.pem` Files to SageMaker

In the left sidebar of this JupyterLab environment, click the 📁 **File Browser** icon.

Right-click → **Upload Files** and choose both of the following files:
- `connections.toml` (your Snowflake connection config)
- `rsa_private_key.pem` (your private key for key-pair authentication)

✅ Confirm both files appear in the file tree (e.g., `/connections.toml` and `/rsa_private_key.pem`).

🛠️ If your `.toml` references a custom path (like a `keys/` folder), ensure the `.pem` file is uploaded to that same path or adjust the `private_key_path` value accordingly.

### Step 8C: Validate your Snowpark with connections.toml

In [30]:
import toml
from cryptography.hazmat.primitives import serialization
from snowflake.snowpark import Session

# Load the Snowflake connection config
config = toml.load("connections.toml") # <-- Provide path to your connections.toml file
params = config["connections"]["Snowpark_MLOps_HOL"] # <-- Specify the connection key

# Manually load the private key
with open("rsa_private_key.pem", "rb") as key_file:
    private_key = serialization.load_pem_private_key(key_file.read(), password=None)

# Inject the private key into params
params["private_key"] = private_key

# Create a Snowpark session
session = Session.builder.configs(params).create()

# Test the connection
session.sql("SELECT current_user(), current_warehouse(), current_database(), current_schema()").show()

----------------------------------------------------------------------------------------
|"CURRENT_USER()"  |"CURRENT_WAREHOUSE()"  |"CURRENT_DATABASE()"  |"CURRENT_SCHEMA()"  |
----------------------------------------------------------------------------------------
|MLOPS_USER        |AICOLLEGE              |AICOLLEGE             |PUBLIC              |
----------------------------------------------------------------------------------------



# Step 9: Register SageMaker Model in Snowflake Model Registry
After training a model in SageMaker, you can register it directly in Snowflake using the `log_model()` API — **no need to upload a `.pkl` file to a stage**.

When you register a model in the Snowflake Model Registry:
- You pass the **Python model object directly** (e.g., XGBoost, scikit-learn) — no manual artifact staging required.
- The model is **automatically serialized, versioned, and stored as a first-class Snowflake object**.
- **Model metadata** — including name, version, dependencies, metrics, tags, and sample input — is **captured at registration**.
- You can **run inference at scale using Python or SQL** via `.run()` on any Snowflake table with compatible features.
- You can **track model versions, update tags and metrics, and monitor performance or drift** using **Snowflake ML Observability**.

The Snowflake Model Registry provides:
- **Built-in support for common ML frameworks** (e.g., XGBoost, scikit-learn, LightGBM, TensorFlow, PyTorch, Hugging Face, MLflow)
- **Role-based access control** (RBAC) and **schema-level organization**
- **Lifecycle management** from dev to prod
- **Secure sharing and governance** for ML teams

💡 Tip: Every Snowflake schema can act as a model registry. It's recommended to use a dedicated schema to organize your models.

### Step 9A: Create Small Input Data for Snowflake Model Registry
It helps Snowflake infer the input schema so it knows how to call the model later (during inference, for example). 

Using a small sample is a best practice to keep things fast and lightweight.

In [32]:
# Take a small sample from your training data (cast to float32) for model signature inference
sample_input_data = X_train.astype("float32").sample(5, random_state=42) # <-- Replace with training DataFrame

### Step 9B: Precompute metrics to log to the Snowflake Model Registry

In [33]:
# Capture model training metrics to include in Snowflake Model Registry log
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

acc = accuracy_score(y_val, y_pred)                           # <-- scalar metric
f1 = f1_score(y_val, y_pred)                                  # <-- scalar metric
report = classification_report(y_val, y_pred, output_dict=True)  # <-- JSON-serializable
cmatrix = confusion_matrix(y_val, y_pred).tolist()            # <-- list format

### Step 9C: Initialize the Snowflake Model Registry object

In [34]:
# This connects to the schema where the model will be stored
from snowflake.ml.registry import Registry

registry = Registry(                               
    session=session,                                # <-- your Snowpark session
    database_name='AICOLLEGE',                      # <-- your model database
    schema_name='PUBLIC'                            # <-- your model schema
)

### Step 9D: Log the trained model to Snowflake Model Registry

In [35]:
# This serializes the model, stores it in Snowflake, and adds initial metadata
from snowflake.ml.model import type_hints

model_version = registry.log_model(
    model=model,                                      # <-- Provide your trained XGBClassifier
    model_name='COLLEGE_AI_HOL_XGB_MORTGAGE_MODEL',   # <-- Provide required MLOPs HOL model name
    version_name='V1',                                # <-- Provide model version name
    sample_input_data=sample_input_data,              # <-- sample input for schema inference
    conda_dependencies=['xgboost'],                   # <-- Provide the predictive model dependencies
    comment="""XGBoost classifier trained in SageMaker to predict mortgage response.
This version uses 'logloss' as the evaluation metric and includes precomputed model metrics.""",
    metrics={
        "accuracy": acc,
        "f1_score": f1,
        "classification_report": report,
        "confusion_matrix": cmatrix
    },
    task=type_hints.Task.TABULAR_BINARY_CLASSIFICATION  # <-- Snowflake ML Observability
)

2025-07-07 16:14:33.711184: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  self.manifest.save(


### Step 9E: Create and Apply Tags for Model Versioning

In [36]:
# Create optional tags for better governance and tracking
# Run once per schema (add missing ones if needed)
session.sql("CREATE OR REPLACE TAG MODEL_STAGE_TAG").collect()
session.sql("CREATE OR REPLACE TAG MODEL_PURPOSE_TAG").collect()
session.sql("CREATE OR REPLACE TAG SOURCE").collect()
session.sql("CREATE OR REPLACE TAG PROJECT").collect()

[Row(status='Tag PROJECT successfully created.')]

In [37]:
# Apply tags and model-level metadata
m = registry.get_model('COLLEGE_AI_HOL_XGB_MORTGAGE_MODEL')   # <-- Provide required MLOPs HOL model name

# Add model-level description
m.comment = "XGBoost model to predict mortgage approval. Trained in SageMaker and registered in Snowflake."

# Add model-level tags
m.set_tag("MODEL_STAGE_TAG", "PROD")                                 # <-- deployment_stage
m.set_tag("MODEL_PURPOSE_TAG", "Mortgage Response Classification")   # <-- business_context
m.set_tag("SOURCE", "SageMaker")                                     # <-- origin
m.set_tag("PROJECT", "College of AI - MLOps HOL")                    # <-- HOL traceability

# View tags
m.show_tags()

{'AICOLLEGE.PUBLIC.MODEL_PURPOSE_TAG': 'Mortgage Response Classification',
 'AICOLLEGE.PUBLIC.MODEL_STAGE_TAG': 'PROD',
 'AICOLLEGE.PUBLIC.PROJECT': 'College of AI - MLOps HOL',
 'AICOLLEGE.PUBLIC.SOURCE': 'SageMaker'}

### 🎓 DORA Grading Setup Reminder
If you haven't configured the `util_db.public.se_grader` function yet, please ensure you've completed the DORA setup steps outlined in the **[College of AI HOL Setup instructions](https://docs.google.com/document/d/1z-CG06Kt2dzV2bLxTDsP55qxQY6lzwtSDE5Azz4IwJs/edit?tab=t.0#heading=h.vgy1lc5t2roq)**.

If you're unsure whether it's already configured:
- Run `SHOW INTEGRATIONS;` to check for `dora_api_integration`.
- If missing, follow the provided setup script.

Once confirmed, proceed with the DORA evaluation steps below.

# Step 10: Evaluation Test # 50 Sagemaker Model Registration in Snowflake (SEAI50)

In [38]:
# SEAI50: Validate model was registered in Snowflake Model Registry
query = """
WITH models AS (
  SELECT "name" AS model_name
  FROM TABLE(RESULT_SCAN(LAST_QUERY_ID()))
)
SELECT util_db.public.se_grader(
    'SEAI50',
    (actual >= 1),
    actual,
    1,
    'Your model has been registered successfully!'
) AS graded_results
FROM (
    SELECT COUNT(*) AS actual
    FROM models
    WHERE model_name ILIKE '%COLLEGE_AI_HOL_XGB_MORTGAGE_MODEL%'
)
"""
# First run SHOW MODELS with a separate call
session.sql("SHOW MODELS IN DATABASE AICOLLEGE").collect()

# Then run the grading query
session.sql(query).show()

------------------------------------------------------
|"GRADED_RESULTS"                                    |
------------------------------------------------------
|[                                                   |
|  {                                                 |
|    "_": "✅",                                       |
|    "actual": 1,                                    |
|    "description": "Your model has been registe...  |
|    "expected": 1,                                  |
|    "passed": true,                                 |
|    "step": "SEAI50"                                |
|  }                                                 |
|]                                                   |
------------------------------------------------------



# Step 11: Preprocess Inference Data for Batch Scoring

Before we run batch inference, we need to make sure the incoming Snowflake data matches the **feature format the model was trained on**.

Our SageMaker model expects:
- All numeric features
- One-hot encoded columns for `LOAN_TYPE_NAME`, `LOAN_PURPOSE_NAME`, and `COUNTY_NAME`
- The same column names and order as the training data

In this step, we:
- Load `InferenceMortgageData` from Snowflake
- Apply one-hot encoding using Pandas
- Fill in any missing columns with 0s
- Reorder columns to match the model input

In [40]:
# Load Week 1 data from Snowflake into a Pandas DataFrame
raw_input = session.table("InferenceMortgageData").filter("WEEK = 1").to_pandas() # <-- Provide inference data table and filter for WEEK = 1

# Apply one-hot encoding on the same columns used during training
categorical_cols = ["LOAN_TYPE_NAME", "LOAN_PURPOSE_NAME", "COUNTY_NAME"] # <-- Provide required categorical column names for the required preprocessing step
encoded_input = pd.get_dummies(raw_input, columns=categorical_cols)

# Align the input with model training features (X_train.columns)
for col in X_train.columns:
    if col not in encoded_input.columns:
        encoded_input[col] = 0  # Add missing columns with default 0

# Reorder the columns to match training
encoded_input = encoded_input[X_train.columns]

# Step 12: Week 1 Mortgage Application Scoring

Mortgage application data is continuously collected and stored in Snowflake.
Rather than scoring in real-time, we use **batch inference** to evaluate applications in bulk — a common approach when scoring latency is not critical.

In this step, you will:
- **Run inference** on Week 1 application data using the registered XGBoost model
- **Format and rename the prediction output** for clarity
- **Join predictions with the true outcome** (MORTGAGERESPONSE) for monitoring
- **Save the results in a unified table** (PREDICTIONS_WITH_GROUND_TRUTH) for downstream analysis and model monitoring

This unified table enables better model observability and drift detection by tracking how model predictions align with true outcomes over time.

🧠 **Reminder:** Your SageMaker model expects the input features to be preprocessed exactly as they were during training. We’ve already one-hot encoded and aligned the inference data in Step 13 to match that format.

In [42]:
# Run Batch Inference on Week 1 Data and Save Unified Output Table

from snowflake.ml.registry import Registry
import numpy as np
import pandas as pd

# Connect to the Model Registry
registry = Registry(session=session, database_name="AICOLLEGE", schema_name="PUBLIC")
model = registry.get_model("COLLEGE_AI_HOL_XGB_MORTGAGE_MODEL")  # <-- Match your registered model name
model_version = model.version("V1")  # <-- Match your version model name

# Run inference on encoded data
predictions = model_version.run(encoded_input, function_name="predict")
proba_predictions = model_version.run(encoded_input, function_name="predict_proba")

# Create prediction columns
pred_series = pd.Series(np.squeeze(predictions), name="PREDICTION_RESPONSE")  # <-- Provide model prediction response name
score_series = pd.Series(np.array(proba_predictions)[:, 1], name="PREDICTION_SCORE")  # <-- Provide model prediction score name

# Combine with original raw input for observability
results_df = raw_input.copy()
results_df["WEEK"] = 1
results_df["PREDICTED_RESPONSE"] = pred_series
results_df["PREDICTED_SCORE"] = score_series

# Convert to Snowpark DataFrame and save to Snowflake
results = session.create_dataframe(results_df)
results.write.mode("overwrite").save_as_table("XXX")  # <-- Provide name of saved table with model prediction responses

# Preview results
results.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"WEEK_START_DATE"    |"WEEK"  |"LOAN_ID"  |"TS"     |"LOAN_TYPE_NAME"  |"LOAN_PURPOSE_NAME"  |"APPLICANT_INCOME_000S"  |"LOAN_AMOUNT_O80S"  |"COUNTY_NAME"  |"MORTGAGERESPONSE"  |"PREDICTED_RESPONSE"  |"PREDICTED_SCORE"      |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|2024-12-20 00:00:00  |1       |361354     |20:15.4  |Conventional      |Refinancing          |59.8                     |293                 |Ulster County  |0                   |0                     |3.433541860431433e-05  |
|2024-12-20 00:00:00  |1       |361354     |20:15.4  |Conventional      |Refinancing        

# Step 13: Weekly Batch Scoring (Weeks 2–5)

To simulate ongoing model usage, we now run **batch inference** on new mortgage application data for **weeks 2 through 5** using the same model and preprocessing logic applied to week 1.

In this step, you will:
- **Load raw application data** for each week from Snowflake
- **Apply the same one-hot encoding logic** used during training
- **Ensure feature alignment** with the model input (add missing columns, reorder)
- **Run inference** using the registered SageMaker model
- **Join predictions with actual outcomes** (`MORTGAGERESPONSE`)
- **Append results** to the unified output table: `PREDICTIONS_WITH_GROUND_TRUTH`

🧠 **Note:** For model scoring to be accurate, your inference data must use the same preprocessing steps (like one-hot encoding) and column structure used during training.

This mirrors a common MLOps pattern, where **new incoming data** is scored in batches on a regular schedule and **stored in Snowflake** for:
- Dashboards & business reporting
- Model monitoring and drift detection
- Performance evaluation over time

In [43]:
# Run Batch Inference on Weeks 2–5 and Update Unified Output Table

import pandas as pd
import numpy as np

for week in range(2, 6):  # Weeks 2 through 5
    print(f"🔄 Processing week {week}...")

    # Load raw inference data from Snowflake
    raw_week_df = session.table("InferenceMortgageData").filter(f"WEEK = {week}").to_pandas()

    # Apply one-hot encoding to categorical columns
    encoded_week_df = pd.get_dummies(raw_week_df, columns=categorical_cols)

    # Add any missing columns expected by the model
    for col in X_train.columns:
        if col not in encoded_week_df.columns:
            encoded_week_df[col] = 0  # Add missing columns with default value 0

    # Reorder columns to match the training feature order
    encoded_week_df = encoded_week_df[X_train.columns]

    # Run batch inference using registered model
    week_preds = model_version.run(encoded_week_df, function_name="predict")

    # Get predicted scores (class 1 probabilities)
    week_scores = model_version.run(encoded_week_df, function_name="predict_proba")
    predicted_scores = pd.Series(np.array(week_scores)[:, 1], name="PREDICTED_SCORE")

    # Create prediction column
    pred_series = pd.Series(np.squeeze(week_preds), name="PREDICTED_RESPONSE")

    # Add prediction columns to the original raw input data
    raw_week_df["WEEK"] = week
    raw_week_df["PREDICTED_RESPONSE"] = pred_series
    raw_week_df["PREDICTED_SCORE"] = predicted_scores

    # Convert to Snowpark and append to the unified results table
    result_sp_df = session.create_dataframe(raw_week_df)
    result_sp_df.write.mode("append").save_as_table("PREDICTIONS_WITH_GROUND_TRUTH")

    print(f"✅ Week {week} scoring complete.")

🔄 Processing week 2...
✅ Week 2 scoring complete.
🔄 Processing week 3...
✅ Week 3 scoring complete.
🔄 Processing week 4...
✅ Week 4 scoring complete.
🔄 Processing week 5...
✅ Week 5 scoring complete.


# Step 14: Evaluation Test # 51 Sagemaker Model Batch Scoring Completed (SEAI51)

In [44]:
# DORA Validation: Week 5 predictions were stored
query = """
SELECT util_db.public.se_grader(
    'SEAI51',
    (actual >= 1),
    actual,
    5,
    '✅ Inference for Week 5 was completed and stored in Snowflake!'
) AS graded_results
FROM (
    SELECT COUNT(*) AS actual
    FROM AICOLLEGE.PUBLIC.PREDICTIONS_WITH_GROUND_TRUTH
    WHERE WEEK = 5
)
"""

# Run the query
session.sql(query).show()

------------------------------------------------------
|"GRADED_RESULTS"                                    |
------------------------------------------------------
|[                                                   |
|  {                                                 |
|    "_": "✅",                                       |
|    "actual": 200,                                  |
|    "description": "✅ Inference for Week 5 was ...  |
|    "expected": 5,                                  |
|    "passed": true,                                 |
|    "step": "SEAI51"                                |
|  }                                                 |
|]                                                   |
------------------------------------------------------



# Step 15: Full Table Scoring (All Weeks)
To support end-to-end workflows like **ML Observability** and **model retraining**, we now run batch inference over the **entire mortgage application dataset** in one go — scoring every row across all weeks.

This mirrors common MLOps use cases like:

- Rebuilding a model after new data arrives
- Comparing historical model performance across time
- Feeding labeled prediction data into Snowflake’s `MODEL_MONITOR` or `MODEL_REGISTRY`
- Creating performance dashboards using a single predictions table

**Tip**: This is a common pattern in production pipelines, where you may have a nightly or weekly job that scores all open cases or the entire dataset.

In [None]:
# Step 15: Full Table Scoring (All Weeks)
import pandas as pd
import numpy as np
from snowflake.ml.registry import Registry

# Load entire inference dataset from Snowflake
raw_all_df = session.table("AICOLLEGE.PUBLIC.INFERENCEMORTGAGEDATA").to_pandas()

# One-hot encode categorical columns using same logic as training
encoded_all_df = pd.get_dummies(raw_all_df, columns=categorical_cols)

# Ensure feature alignment (add missing columns and reorder)
for col in X_train.columns:
    if col not in encoded_all_df.columns:
        encoded_all_df[col] = 0  # Fill missing with 0
encoded_all_df = encoded_all_df[X_train.columns]  # Reorder to match training input

# Load registered model
registry = Registry(session=session, database_name="AICOLLEGE", schema_name="PUBLIC")
model = registry.get_model("COLLEGE_AI_HOL_XGB_MORTGAGE_MODEL")   # <-- Match your registered model name
model_version = model.version("V1")  # <-- Match your version model name

# Run prediction and probability
predictions = model_version.run(encoded_all_df, function_name="predict")
proba_predictions = model_version.run(encoded_all_df, function_name="predict_proba")

# Attach predictions to original data
raw_all_df["PREDICTED_RESPONSE"] = pd.Series(np.squeeze(predictions))  # <-- Provide model prediction response name
raw_all_df["PREDICTED_SCORE"] = pd.Series(np.array(proba_predictions)[:, 1])  # <-- Provide model prediction score name

# Save the fully scored dataset to Snowflake
full_results = session.create_dataframe(raw_all_df)
full_results.write.mode("overwrite").save_as_table("ALL_PREDICTIONS_WITH_GROUND_TRUTH")

# Preview
full_results.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"WEEK_START_DATE"    |"WEEK"  |"LOAN_ID"  |"TS"     |"LOAN_TYPE_NAME"  |"LOAN_PURPOSE_NAME"  |"APPLICANT_INCOME_000S"  |"LOAN_AMOUNT_O80S"  |"COUNTY_NAME"  |"MORTGAGERESPONSE"  |"PREDICTION_RESPONSE"  |"PREDICTION_SCORE"     |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|2024-12-20 00:00:00  |1       |361354     |20:15.4  |Conventional      |Refinancing          |59.8                     |293                 |Ulster County  |0                   |0                      |3.433541860431433e-05  |
|2024-12-20 00:00:00  |1       |361354     |20:15.4  |Conventional      |Refinancing    