# Imports 

In [5]:
import json
import pickle
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

import numpy as np
import pandas as pd
import logging
import os
import sys, importlib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

from connections import SnowflakeConnection
from snowflake.snowpark.context import get_active_session

from snowflake.ml.registry import Registry

from snowflake.ml.model import target_platform as snow_target_platform
TargetPlatform = snow_target_platform.TargetPlatform


In [6]:
# Set to True by setup_logging() once the root logger has been configured,
# so repeated calls become no-ops unless force=True is passed.
_LOGGER_CONFIGURED = False

# Message and timestamp formats applied to the stdout handler that
# setup_logging() installs.
DEFAULT_LOG_FORMAT = "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
DEFAULT_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
# Root level used when the environment variable below is unset or unrecognised.
DEFAULT_LOG_LEVEL = logging.INFO
# Name of the environment variable that may override the root log level.
ENV_VAR_LOG_LEVEL = "MODEL_REGISTRY_LOG_LEVEL"


def _normalize_level(value: str, fallback: int) -> int:
    """Resolve a logging level string or numeric value to an int."""
    if value is None:
        return fallback

    if isinstance(value, str):
        level = logging.getLevelName(value.upper())
        if isinstance(level, int):
            return level
    elif isinstance(value, int):
        return value

    return fallback


def setup_logging(
    default_level: int = DEFAULT_LOG_LEVEL,
    env_var: str = ENV_VAR_LOG_LEVEL,
    force: bool = False,
) -> None:
    """Configure the root logger a single time for scripts and notebooks.

    Args:
        default_level: Level applied when ``env_var`` is unset or unparsable.
        env_var: Name of the environment variable consulted for a level override.
        force: When True, drop every existing root handler and configure again
            even if a previous call already ran.
    """
    global _LOGGER_CONFIGURED

    already_configured = _LOGGER_CONFIGURED and not force
    if already_configured:
        return

    resolved_level = _normalize_level(os.getenv(env_var), default_level)
    root = logging.getLogger()

    if force:
        # Iterate over a copy because removeHandler() mutates the list.
        for existing in list(root.handlers):
            root.removeHandler(existing)

    # Only install our stdout handler when nothing else is attached.
    if not root.handlers:
        stream = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(DEFAULT_LOG_FORMAT, DEFAULT_DATE_FORMAT)
        stream.setFormatter(formatter)
        root.addHandler(stream)

    root.setLevel(resolved_level)
    _LOGGER_CONFIGURED = True


def get_logger(name: Optional[str] = None, level: Optional[int] = None) -> logging.Logger:
    """Fetch a logger by name, making sure root logging is configured first.

    Args:
        name: Logger name; ``None`` yields the root logger.
        level: Optional level to set on the returned logger.

    Returns:
        The requested ``logging.Logger``.
    """
    setup_logging()
    named = logging.getLogger(name)
    if level is None:
        return named
    named.setLevel(level)
    return named


def log_section(
    logger: logging.Logger,
    title: str,
    level: int = logging.INFO,
    width: int = 80,
    pad_char: str = "=",
) -> None:
    """Emit ``title`` as a banner: a rule line, the title, then a rule line.

    The rule is ``pad_char`` repeated ``width`` times. When that yields an
    empty string (empty ``pad_char`` or non-positive ``width``) only the
    title is logged.
    """
    rule = pad_char * width if pad_char else ""
    banner = [rule, title, rule] if rule else [title]
    for line in banner:
        logger.log(level, line)


def set_global_level(level: int) -> None:
    """Change the root logger's level at runtime.

    Root logging is configured first (a no-op if it already was), then the
    requested level is applied to the root logger.
    """
    setup_logging()
    root = logging.getLogger()
    root.setLevel(level)

# Module-level logger used by the helper functions and cells below.
logger = get_logger(__name__)


# Pipeline Configuration (data, training, registry, steps, serving)

In [7]:
@dataclass
class DataConfig:
    """Settings for synthetic dataset generation and the optional Snowflake upload."""

    # Number of rows requested from make_regression.
    n_samples: int = 10000
    # Number of FEATURE_xx columns to generate.
    n_features: int = 20
    # Seed passed to make_regression.
    random_state: int = 42
    # Destination of the CSV written by save_to_csv().
    csv_path: Path = Path("synthetic_data.csv")
    # When False, upload_to_snowflake() logs a message and returns None.
    upload_to_snowflake: bool = True
    # Connection name handed to SnowflakeConnection.from_snow_cli().
    connection_name: str = "legalzoom"
    # Database / schema / table that receive the uploaded dataframe.
    database: str = "ML_SHOWCASE"
    data_schema: str = "DATA"
    table_name: str = "SYNTHETIC_DATA"


@dataclass
class TrainConfig:
    """Settings for the train/test split and the artifact file locations."""

    # Fraction of rows held out by train_test_split.
    test_size: float = 0.2
    # Seed passed to train_test_split.
    random_state: int = 42
    # Pickle locations handed to the Snowflake ModelContext.
    scaler_path: Path = Path("scaler.pkl")
    model_path: Path = Path("model.pkl")
    # CSV that the registry section reads sample rows from.
    test_data_path: Path = Path("test_data.csv")
    # Not read by any code visible in this notebook; presumably where
    # evaluation metrics are persisted (TODO confirm).
    metrics_path: Path = Path("model_metrics.json")


@dataclass
class RegistryConfig:
    """Settings used when logging the model to the Snowflake Model Registry."""

    # Not read by any code visible in this notebook (the registry reuses the
    # existing session); presumably a Snow CLI connection name (TODO confirm).
    connection_name: str = "legalzoom"
    # Database and schema the Registry is opened against.
    database: str = "ML_SHOWCASE"
    schema: str = "MODELS"
    # Name under which the model is logged.
    model_name: str = "LINEAR_REGRESSION_CUSTOM"
    # Mapping of sub-directory name -> file names; each file is checked for
    # existence and size before the model is logged.
    user_files: Dict[str, list[str]] = field(default_factory=lambda: {"preprocessing": ["scaler.pkl"]})
    # Forwarded to log_model as conda_dependencies when non-empty.
    conda_dependencies: list[str] = field(
        default_factory=lambda: [
            "numpy==1.26.4",
            "pandas==2.1.4",
            "scikit-learn==1.5.2",
        ]
    )
    # Forwarded to log_model as pip_requirements when non-empty.
    pip_requirements: list[str] = field(default_factory=list)
    # Optional overrides forwarded to log_model only when the target platform
    # mode includes the warehouse.
    artifact_repository_map: Optional[Dict[str, str]] = None
    resource_constraint: Optional[Dict[str, str]] = None
    # Python version passed to log_model.
    python_version: str = "3.10"
    # Copied into the log_model options dict.
    enable_explainability: bool = False
    # Name of the target platform mode; the logging cell checks it for the
    # substring "WAREHOUSE" to decide on warehouse-only arguments.
    target_platform_mode: str = "WAREHOUSE_ONLY"


@dataclass
class PipelineSteps:
    """Boolean switches naming the pipeline stages.

    NOTE(review): no cell visible in this notebook reads these flags; they are
    presumably consumed by a script version of the pipeline (confirm).
    """

    generate_data: bool = True
    train_model: bool = True
    verify_pickles: bool = True
    log_model: bool = True


@dataclass
class ServingConfig:
    """Settings for deploying the logged model as a Snowpark Container Services service."""

    # NOTE(review): not checked by any visible cell before deploying (confirm).
    enabled: bool = False
    # Compute pool to create/use; ensure_compute_pool() is a no-op when unset.
    compute_pool: Optional[str] = None
    # Service name; when unset the deploy cell derives one from model name and version.
    service_name: Optional[str] = None
    # Forwarded to the deployment call and also used as MIN_NODES / MAX_NODES
    # when the compute pool is created.
    min_instances: int = 1
    max_instances: int = 1
    # INSTANCE_FAMILY used when creating the compute pool.
    instance_family: str = "CPU_X64_M"
    # Forwarded to the deployment call only when truthy.
    force_rebuild: bool = False
    # When True the deploy cell issues DROP SERVICE IF EXISTS first.
    drop_existing_service: bool = True
    # Forwarded to the deployment call only when not None.
    num_workers: Optional[int] = None


@dataclass
class PipelineConfig:
    """Top-level container bundling one instance of each section config.

    Every section uses a default_factory, so each PipelineConfig gets its own
    independent sub-config objects.
    """

    data: DataConfig = field(default_factory=DataConfig)
    train: TrainConfig = field(default_factory=TrainConfig)
    registry: RegistryConfig = field(default_factory=RegistryConfig)
    steps: PipelineSteps = field(default_factory=PipelineSteps)
    serving: ServingConfig = field(default_factory=ServingConfig)

# High-Level Notebook Variables

In [8]:
# Notebook-level overrides, one sub-dict per PipelineConfig section. Keys that
# match a dataclass field replace that field's default when the mapping is
# applied by pipeline_config_from_mapping().
base_cfg = {
    "data": {
        # dataset generation
        "n_samples": 5000,
        "n_features": 20,
        "random_state": 42,
        "csv_path": "notebook_synthetic_data.csv",
        "upload_to_snowflake": True,  # set to False to skip the Snowflake upload
        "connection_name": "legalzoom",
        "database": "ML_SHOWCASE",
        "data_schema": "DATA",
        "table_name": "SYNTHETIC_DATA",
    },
    "train": {
        "test_size": 0.2,
        "random_state": 42,
        "scaler_path": "scaler.pkl",
        "model_path": "model.pkl",
        "test_data_path": "test_data.csv",
        "metrics_path": "model_metrics.json",
    },
    "registry": {
        "connection_name": "legalzoom",
        "database": "ML_SHOWCASE",
        "schema": "MODELS",
        "model_name": "LINEAR_REGRESSION_CUSTOM",
        "target_platform_mode": "SNOWPARK_CONTAINER_SERVICES_ONLY",
        "conda_dependencies": [],
        "pip_requirements": [
            "numpy==1.26.4",
            "pandas==2.1.4",
            "scikit-learn==1.5.2",
        ],
        # Provide these when targeting warehouses to satisfy pip installations:
        # "artifact_repository_map": {"pip": "snowflake.snowpark.pypi_shared_repository"},
        # "resource_constraint": {"class": "STANDARD_GEN_1"},
    },
    "steps": {
        "generate_data": True,
        "train_model": True,
        "verify_pickles": True,
        "log_model": False,  # set True once you're satisfied with the run
    },
    "serving": {
        "enabled": False,
        "compute_pool": "ML_INFERENCE_POOL",
        "service_name": "LINEAR_REGRESSION_SERVICE",
        "min_instances": 1,
        "max_instances": 1,
        "instance_family": "CPU_X64_M",
        "force_rebuild": True,
        "drop_existing_service": True,
    },
}


In [9]:
def pipeline_config_from_mapping(mapping: Dict[str, Any]) -> PipelineConfig:
    """Build a PipelineConfig from defaults overlaid with a nested mapping.

    Args:
        mapping: Optional ``data`` / ``train`` / ``registry`` / ``steps`` /
            ``serving`` sub-mappings of field name -> value.

    Returns:
        A PipelineConfig whose matching fields were overwritten. Keys that do
        not name an existing attribute are ignored; string values destined for
        a field that currently holds a ``Path`` are converted to ``Path``.
    """
    cfg = PipelineConfig()

    def _apply(target: Any, values: Dict[str, Any]) -> None:
        for attr, incoming in values.items():
            if not hasattr(target, attr):
                continue
            # Preserve Path-typed fields when the override is e.g. a plain string.
            needs_path = isinstance(getattr(target, attr), Path) and not isinstance(incoming, Path)
            setattr(target, attr, Path(incoming) if needs_path else incoming)

    for section in ("data", "train", "registry", "steps", "serving"):
        if section in mapping:
            _apply(getattr(cfg, section), mapping[section])

    return cfg

In [10]:
# Materialise the notebook-wide configuration from base_cfg and display it.
pipeline_cfg = pipeline_config_from_mapping(base_cfg)
pipeline_cfg

PipelineConfig(data=DataConfig(n_samples=5000, n_features=20, random_state=42, csv_path=PosixPath('notebook_synthetic_data.csv'), upload_to_snowflake=True, connection_name='legalzoom', database='ML_SHOWCASE', data_schema='DATA', table_name='SYNTHETIC_DATA'), train=TrainConfig(test_size=0.2, random_state=42, scaler_path=PosixPath('scaler.pkl'), model_path=PosixPath('model.pkl'), test_data_path=PosixPath('test_data.csv'), metrics_path=PosixPath('model_metrics.json')), registry=RegistryConfig(connection_name='legalzoom', database='ML_SHOWCASE', schema='MODELS', model_name='LINEAR_REGRESSION_CUSTOM', user_files={'preprocessing': ['scaler.pkl']}, conda_dependencies=[], pip_requirements=['numpy==1.26.4', 'pandas==2.1.4', 'scikit-learn==1.5.2'], artifact_repository_map=None, resource_constraint=None, python_version='3.10', enable_explainability=False, target_platform_mode='SNOWPARK_CONTAINER_SERVICES_ONLY'), steps=PipelineSteps(generate_data=True, train_model=True, verify_pickles=True, log_mo

In [11]:
# Open the notebook-wide Snowpark session from the Snow CLI connection named
# in the data config, falling back to the active session on ImportError.
# NOTE(review): SnowflakeConnection is imported unconditionally at the top of
# the file, so a missing `connections` module would already fail there; this
# handler only helps if from_snow_cli() itself raises ImportError (confirm).
try:
    connection = SnowflakeConnection.from_snow_cli(pipeline_cfg.data.connection_name)
    session = connection.session
except ImportError:
    from snowflake.snowpark.context import get_active_session
    session = get_active_session()

2025-11-11 17:15:20 | INFO | snowflake.connector.connection | Snowflake Connector for Python Version: 3.18.0, Python Version: 3.10.19, Platform: macOS-15.7.2-x86_64-i386-64bit
2025-11-11 17:15:20 | INFO | snowflake.connector.connection | Connecting to GLOBAL Snowflake domain
2025-11-11 17:15:21 | INFO | snowflake.snowpark.session | Snowpark Session information: 
"version" : 1.42.0,
"python.version" : 3.10.19,
"python.connector.version" : 3.18.0,
"python.connector.session.id" : 21917694968801730,
"os.name" : Darwin



# Generate Data Set

In [12]:
def generate_synthetic_data(config: DataConfig) -> pd.DataFrame:
    """Build a synthetic regression dataset as a DataFrame.

    The frame has an ``ID`` column (1..n), ``FEATURE_00``.. feature columns
    and a ``TARGET`` column, generated with scikit-learn's ``make_regression``
    using the sample count, feature count and seed from ``config``.
    """
    log_section(logger, "GENERATING SYNTHETIC DATASET")

    from sklearn.datasets import make_regression

    feature_count = config.n_features
    features, target = make_regression(
        n_samples=config.n_samples,
        n_features=feature_count,
        # At most 15 informative features, fewer if the dataset is narrower.
        n_informative=min(feature_count, 15),
        n_targets=1,
        noise=10.0,
        bias=50.0,
        random_state=config.random_state,
    )

    column_names = [f"FEATURE_{idx:02d}" for idx in range(feature_count)]
    frame = pd.DataFrame(features, columns=column_names)
    frame["TARGET"] = target
    # 1-based row identifier placed as the first column.
    frame.insert(0, "ID", range(1, len(frame) + 1))

    target_col = frame["TARGET"]
    logger.info("Dataset summary: samples=%s, features=%s", f"{config.n_samples:,}", config.n_features)
    logger.info("Target mean=%.2f std=%.2f", target_col.mean(), target_col.std())
    return frame

def save_to_csv(df: pd.DataFrame, path: Path) -> Path:
    """Write ``df`` to ``path`` as CSV (without the index) and return ``path``.

    The size of the written file is logged in megabytes.
    """
    log_section(logger, "SAVING DATA TO CSV")
    df.to_csv(path, index=False)
    size_mb = path.stat().st_size / (1024 * 1024)
    logger.info("Saved data to %s (%.2f MB)", path, size_mb)
    return path

def upload_to_snowflake(df: pd.DataFrame, config: DataConfig) -> Optional[str]:
    """Upload ``df`` to Snowflake when ``config.upload_to_snowflake`` is set.

    The database and schema are created if missing, made current for the
    session, and the dataframe is written with ``overwrite`` semantics.

    Args:
        df: Data to upload.
        config: Supplies the connection name and target database/schema/table.

    Returns:
        The fully qualified ``database.schema.table`` name, or ``None`` when
        the upload is disabled.
    """
    if not config.upload_to_snowflake:
        logger.info("Snowflake upload skipped (upload_to_snowflake=False)")
        return None

    log_section(logger, "UPLOADING DATA TO SNOWFLAKE")

    # Stays None on the active-session fallback so the cleanup below does not
    # reference an unbound name (and does not close a session we do not own).
    connection = None
    try:
        connection = SnowflakeConnection.from_snow_cli(config.connection_name)
        session = connection.session
    except ImportError:
        from snowflake.snowpark.context import get_active_session
        session = get_active_session()

    try:
        session.sql(f"CREATE DATABASE IF NOT EXISTS {config.database}").collect()
        # Qualify the schema: this runs before USE DATABASE, so an unqualified
        # name would be created in whatever database the session currently has.
        session.sql(
            f"CREATE SCHEMA IF NOT EXISTS {config.database}.{config.data_schema}"
        ).collect()
        session.sql(f"USE DATABASE {config.database}").collect()
        session.sql(f"USE SCHEMA {config.data_schema}").collect()

        session.create_dataframe(df).write.mode("overwrite").save_as_table(config.table_name)
        logger.info(
            "Uploaded data to %s.%s.%s", config.database, config.data_schema, config.table_name
        )
        return f"{config.database}.{config.data_schema}.{config.table_name}"
    finally:
        if connection is not None:
            connection.close()

In [13]:
print(pipeline_cfg.data)

DataConfig(n_samples=5000, n_features=20, random_state=42, csv_path=PosixPath('notebook_synthetic_data.csv'), upload_to_snowflake=True, connection_name='legalzoom', database='ML_SHOWCASE', data_schema='DATA', table_name='SYNTHETIC_DATA')


In [14]:
# Run the data stage end to end and keep the artifacts in `outputs`:
# generate the frame, persist it as CSV, then (optionally) upload it.
outputs = {}

df = generate_synthetic_data(pipeline_cfg.data)
csv_path = save_to_csv(df, pipeline_cfg.data.csv_path)
# None when upload_to_snowflake is disabled in the config.
table_name = upload_to_snowflake(df, pipeline_cfg.data)

outputs["dataframe"] = df
outputs["csv_path"] = csv_path
outputs["table_name"] = table_name

2025-11-11 17:15:28 | INFO | __main__ | GENERATING SYNTHETIC DATASET
2025-11-11 17:15:28 | INFO | __main__ | Dataset summary: samples=5,000, features=20
2025-11-11 17:15:28 | INFO | __main__ | Target mean=49.57 std=210.02
2025-11-11 17:15:28 | INFO | __main__ | SAVING DATA TO CSV
2025-11-11 17:15:28 | INFO | __main__ | Saved data to notebook_synthetic_data.csv (1.98 MB)
2025-11-11 17:15:28 | INFO | __main__ | UPLOADING DATA TO SNOWFLAKE
2025-11-11 17:15:29 | INFO | snowflake.connector.connection | Snowflake Connector for Python Version: 3.18.0, Python Version: 3.10.19, Platform: macOS-15.7.2-x86_64-i386-64bit
2025-11-11 17:15:29 | INFO | snowflake.connector.connection | Connecting to GLOBAL Snowflake domain
2025-11-11 17:15:29 | INFO | snowflake.snowpark.session | Snowpark Session information: 
"version" : 1.42.0,
"python.version" : 3.10.19,
"python.connector.version" : 3.18.0,
"python.connector.session.id" : 21917694968805346,
"os.name" : Darwin

2025-11-11 17:15:37 | INFO | __main__ 

In [15]:
# Preview the uploaded table. The qualified name is hard-coded and matches the
# database/schema/table values in base_cfg; update it if those change.
session.sql("SELECT * FROM ML_SHOWCASE.DATA.SYNTHETIC_DATA LIMIT 5").toPandas()

Unnamed: 0,ID,FEATURE_00,FEATURE_01,FEATURE_02,FEATURE_03,FEATURE_04,FEATURE_05,FEATURE_06,FEATURE_07,FEATURE_08,...,FEATURE_11,FEATURE_12,FEATURE_13,FEATURE_14,FEATURE_15,FEATURE_16,FEATURE_17,FEATURE_18,FEATURE_19,TARGET
0,1,2.024612,0.232092,-2.465177,-0.866399,-1.21383,-2.229094,-0.253414,-0.202641,-1.506257,...,-1.206291,0.503526,0.654738,0.383037,-0.046785,1.317545,0.876958,0.453318,-0.389777,-240.944289
1,2,-0.10994,1.517878,-0.177179,0.605805,-0.813199,0.152118,-0.048728,0.035221,0.163817,...,0.555265,-3.419906,-0.520985,-1.521669,0.377092,-1.384809,1.153584,0.692579,0.520902,158.905363
2,3,-0.439888,1.62873,0.059572,-0.602144,1.220722,0.029977,-1.037918,-0.616253,-0.226677,...,-1.900659,0.868561,-1.054836,0.808803,-0.716726,0.310161,0.200068,-0.620061,0.932003,-0.705998
3,4,1.322548,1.623844,-0.859438,-2.094438,-0.431001,-1.48729,-1.9509,-0.050942,1.416834,...,-0.13118,-0.675938,-0.226175,-0.046749,-0.399696,-2.911804,0.774936,0.092381,-0.799743,-85.849133
4,5,0.830849,2.508861,-0.834636,-1.580244,0.030498,0.93957,0.71935,-0.008313,-0.118382,...,-0.089481,-0.146882,0.928356,0.382645,-0.109746,1.383421,1.12486,0.990663,-0.296109,186.936427


# Custom Z-Scaler, Train/Test Split, and Training

## Functions

In [16]:
import logging
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from typing import Dict, Any
import pickle
import os


In [17]:
def split_training_data(
    df: pd.DataFrame, config: TrainConfig
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Partition ``df`` into train and test features/targets.

    Features are every column whose name starts with ``FEATURE_``; the target
    is the ``TARGET`` column. Split size and seed come from ``config``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    feature_cols = [name for name in df.columns if name.startswith("FEATURE_")]
    X_train, X_test, y_train, y_test = train_test_split(
        df[feature_cols],
        df["TARGET"],
        test_size=config.test_size,
        random_state=config.random_state,
    )
    train_count = f"{len(X_train):,}"
    test_count = f"{len(X_test):,}"
    logger.info(
        "Split data: train=%s, test=%s (test_size=%.0f%%)",
        train_count,
        test_count,
        config.test_size * 100,
    )
    return X_train, X_test, y_train, y_test


In [19]:
from custom_model import CustomZScaler

In [20]:

def train_model_with_preprocessing(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    scaler_path: "str | os.PathLike[str]" = "scaler.pkl",
    model_path: "str | os.PathLike[str]" = "model.pkl",
):
    """
    Train model with preprocessing and save separate pickle files.

    This function:
    1. Creates the custom Z-scaler, fits it and transforms the training data
    2. Trains a linear regression model on the scaled features
    3. Saves scaler and model as separate pickle files

    Parameters:
    -----------
    X_train : pd.DataFrame
        Training features
    y_train : pd.Series or np.ndarray
        Training targets
    scaler_path : str or path-like, default "scaler.pkl"
        Where the fitted scaler is pickled. The default keeps the previous
        hard-coded location; pass ``TrainConfig.scaler_path`` to follow the
        pipeline configuration.
    model_path : str or path-like, default "model.pkl"
        Where the fitted model is pickled. The default keeps the previous
        hard-coded location; pass ``TrainConfig.model_path`` to follow the
        pipeline configuration.

    Returns:
    --------
    scaler : CustomZScaler
        Fitted scaler
    model : LinearRegression
        Fitted model
    """
    log_section(logger, "TRAINING MODEL WITH PREPROCESSING")

    # Step 1: Create and fit the custom Z-scaler, scaling the training data
    logger.info("1. Fitting Custom Z-Scaler...")
    scaler = CustomZScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    logger.info("   Scaler fitted")
    logger.info("   Features: %s", len(scaler.feature_names_))
    logger.info(
        "   Mean range: [%.2f, %.2f]",
        scaler.mean_.min(),
        scaler.mean_.max(),
    )
    logger.info(
        "   Std range: [%.2f, %.2f]",
        scaler.std_.min(),
        scaler.std_.max(),
    )

    # Step 2: Train linear regression model on the scaled features
    logger.info("2. Training Linear Regression Model...")
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    logger.info("   Model trained")
    logger.info("   Coefficients: %s", len(model.coef_))
    logger.info("   Intercept: %.2f", model.intercept_)

    # Step 3: Save as separate pickle files
    logger.info("3. Saving Components as Separate Pickle Files...")

    with open(scaler_path, "wb") as f:
        pickle.dump(scaler, f)
    logger.info("   Saved scaler to: %s", scaler_path)

    with open(model_path, "wb") as f:
        pickle.dump(model, f)
    logger.info("   Saved model to: %s", model_path)

    logger.info("Training complete!")
    logger.info("  Components saved as separate pickle files:")
    logger.info("    - %s (preprocessing)", scaler_path)
    logger.info("    - %s (model)", model_path)

    return scaler, model

In [21]:
def evaluate_model(
    scaler,
    model,
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.Series,
    y_test: pd.Series,
) -> Dict[str, Dict[str, float]]:
    """Score the fitted scaler + model on both the train and test splits.

    Returns:
        ``{"train": {...}, "test": {...}}`` where each inner dict holds
        ``mse``, ``rmse``, ``mae`` and ``r2``.
    """
    log_section(logger, "EVALUATING MODEL")

    def _scores(y_true, y_pred) -> Dict[str, float]:
        # RMSE is derived from the MSE computed for the same predictions.
        mse = mean_squared_error(y_true, y_pred)
        return {
            "mse": mse,
            "rmse": np.sqrt(mse),
            "mae": mean_absolute_error(y_true, y_pred),
            "r2": r2_score(y_true, y_pred),
        }

    scaled_train = scaler.transform(X_train)
    scaled_test = scaler.transform(X_test)
    predicted_train = model.predict(scaled_train)
    predicted_test = model.predict(scaled_test)

    metrics = {
        "train": _scores(y_train, predicted_train),
        "test": _scores(y_test, predicted_test),
    }

    for split_name, scores in metrics.items():
        logger.info(
            "%s metrics: RMSE=%.4f MAE=%.4f R2=%.4f",
            split_name.capitalize(),
            scores["rmse"],
            scores["mae"],
            scores["r2"],
        )
    return metrics


## Train Model

In [22]:
pipeline_cfg.train

TrainConfig(test_size=0.2, random_state=42, scaler_path=PosixPath('scaler.pkl'), model_path=PosixPath('model.pkl'), test_data_path=PosixPath('test_data.csv'), metrics_path=PosixPath('model_metrics.json'))

In [23]:
# Split the generated frame, then fit the scaler and the regression model.
# Training also writes scaler.pkl and model.pkl to the working directory.
X_train, X_test, y_train, y_test = split_training_data(df, pipeline_cfg.train)
scaler, model = train_model_with_preprocessing(X_train, y_train)

2025-11-11 17:17:24 | INFO | __main__ | Split data: train=4,000, test=1,000 (test_size=20%)
2025-11-11 17:17:24 | INFO | __main__ | TRAINING MODEL WITH PREPROCESSING
2025-11-11 17:17:24 | INFO | __main__ | 1. Fitting Custom Z-Scaler...
2025-11-11 17:17:24 | INFO | __main__ |    Scaler fitted
2025-11-11 17:17:24 | INFO | __main__ |    Features: 20
2025-11-11 17:17:24 | INFO | __main__ |    Mean range: [-0.03, 0.03]
2025-11-11 17:17:24 | INFO | __main__ |    Std range: [0.98, 1.03]
2025-11-11 17:17:24 | INFO | __main__ | 2. Training Linear Regression Model...
2025-11-11 17:17:24 | INFO | __main__ |    Model trained
2025-11-11 17:17:24 | INFO | __main__ |    Coefficients: 20
2025-11-11 17:17:24 | INFO | __main__ |    Intercept: 49.84
2025-11-11 17:17:24 | INFO | __main__ | 3. Saving Components as Separate Pickle Files...
2025-11-11 17:17:24 | INFO | __main__ |    Saved scaler to: scaler.pkl
2025-11-11 17:17:24 | INFO | __main__ |    Saved model to: model.pkl
2025-11-11 17:17:24 | INFO | _

In [24]:
! ls | grep pkl

model.pkl
scaler.pkl


In [25]:
# Evaluate on both splits; `metrics` is later attached to the logged model version.
metrics = evaluate_model(scaler, model, X_train, X_test, y_train, y_test)
metrics

2025-11-11 17:17:30 | INFO | __main__ | EVALUATING MODEL
2025-11-11 17:17:30 | INFO | __main__ | Train metrics: RMSE=9.9837 MAE=7.8813 R2=0.9978
2025-11-11 17:17:30 | INFO | __main__ | Test metrics: RMSE=10.1775 MAE=8.1768 R2=0.9976


{'train': {'mse': 99.67352813300454,
  'rmse': np.float64(9.983663061872859),
  'mae': 7.881317338310466,
  'r2': 0.9977512123073183},
 'test': {'mse': 103.582271504573,
  'rmse': np.float64(10.177537595340683),
  'mae': 8.176791003990894,
  'r2': 0.9976023228619049}}

In [26]:
from snowflake.ml.model import custom_model

# Register the absolute pickle paths as named artifacts of the model context;
# RegressionWithScaler looks them up by these keyword names.
# Note: `custom_model` here is snowflake.ml.model.custom_model (imported just
# above), not the local `custom_model` module that provides CustomZScaler.
model_ctx = custom_model.ModelContext(
    model_path=str(pipeline_cfg.train.model_path.resolve()),
    scaler_path=str(pipeline_cfg.train.scaler_path.resolve()),
)

import importlib
import types

def _ensure_custom_module_alias() -> None:
    if "custom_model" in sys.modules:
        sys.modules.setdefault("model_registry_showcase.custom_model", sys.modules["custom_model"])
        return
    module = None
    try:
        module = importlib.import_module("custom_model")
    except ModuleNotFoundError:
        try:
            module = importlib.import_module("model_registry_showcase.custom_model")
        except ModuleNotFoundError:
            module = _create_embedded_custom_module()
    sys.modules["custom_model"] = module
    sys.modules.setdefault("model_registry_showcase.custom_model", module)

def _create_embedded_custom_module() -> types.ModuleType:
    """Return a fresh in-memory ``custom_model`` module exposing ``CustomZScaler``."""
    shim = types.ModuleType("custom_model")
    setattr(shim, "CustomZScaler", CustomZScaler)
    return shim

class RegressionWithScaler(custom_model.CustomModel):
    """Custom registry model that scales features and then runs the regressor.

    Both components are unpickled from the ``model_path`` and ``scaler_path``
    artifacts of the model context when the instance is created.
    """

    def __init__(self, context):
        super().__init__(context)
        # The scaler's module must be importable before its pickle is loaded.
        _ensure_custom_module_alias()
        # Load the model first, then the scaler, from their context artifacts.
        for attribute, artifact_key in (("model", "model_path"), ("scaler", "scaler_path")):
            with Path(self.context[artifact_key]).open("rb") as handle:
                setattr(self, attribute, pickle.load(handle))

    @custom_model.inference_api
    def predict(self, input: pd.DataFrame) -> pd.DataFrame:
        """Return a ``PREDICTION`` column for the ``FEATURE_*`` columns of ``input``.

        The result keeps the index of ``input``.
        """
        feature_cols = [name for name in input.columns if name.startswith("FEATURE_")]
        scaled = self.scaler.transform(input[feature_cols])
        predictions = self.model.predict(scaled)
        return pd.DataFrame({"PREDICTION": predictions}, index=input.index)


# Instantiate the wrapper now; this unpickles model.pkl and scaler.pkl
# from the paths registered in model_ctx.
custom_model_obj = RegressionWithScaler(model_ctx)


# Model Registry

In [None]:

def load_pickle(path: Path):
    """Read the file at ``path`` and return the object unpickled from it.

    Only use on trusted files: unpickling can execute arbitrary code.
    """
    return pickle.loads(path.read_bytes())

In [None]:
# Open the Model Registry on the notebook session, in the database/schema
# given by the registry config.
log_section(logger, "INITIALIZING SNOWFLAKE REGISTRY")
registry = Registry(
    session=session,
    database_name=pipeline_cfg.registry.database,
    schema_name=pipeline_cfg.registry.schema,
)
logger.info("Registry ready at %s", registry.location)

In [None]:
# Take five feature rows to use as sample input for the registry.
# No cell visible in this notebook writes test_data_path, so fall back to the
# in-memory test split instead of failing with FileNotFoundError.
test_data_path = Path(pipeline_cfg.train.test_data_path)
if test_data_path.exists():
    sample_df = pd.read_csv(test_data_path)
else:
    logger.warning(
        "%s not found; sampling from the in-memory X_test split instead", test_data_path
    )
    # Reset the shuffled index so the frame looks like one freshly read from CSV.
    sample_df = X_test.reset_index(drop=True)
feature_cols = [col for col in sample_df.columns if col.startswith("FEATURE_")]
sample_data = sample_df[feature_cols].head(5)
sample_data

In [None]:
# Reload both artifacts from disk; this rebinds the notebook-level `model`
# and `scaler` names to the unpickled objects.
log_section(logger, "LOGGING MODEL VERSION")
model = load_pickle(Path("model.pkl"))
scaler = load_pickle(Path("scaler.pkl"))

In [None]:
# Pre-flight check of the configured user_files: every file must exist and be
# no larger than 5 GB. `subdir` is not used by the check itself.
# NOTE(review): user_files is validated here, but no visible cell passes it to
# log_model — confirm whether it is meant to be forwarded.
for subdir, files in pipeline_cfg.registry.user_files.items():
    for file_name in files:
        path = Path(file_name)
        if not path.exists():
            raise FileNotFoundError(f"user_files entry not found: {path}")
        if path.stat().st_size > 5 * 1024 * 1024 * 1024:
            raise ValueError(f"user_files entry exceeds 5GB limit: {path}")

# Scaled copy of the sample rows, keeping the original column names.
sample_scaled = pd.DataFrame(
    scaler.transform(sample_data),
    columns=sample_data.columns,
)
# Timestamp-based version name, e.g. v_20251111_171530.
version_name = f"v_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

logger.info("Logging model %s version %s", pipeline_cfg.registry.model_name, version_name)

In [None]:
# Decide whether the requested platform mode includes the warehouse; an empty
# mode is treated as "runs in warehouse".
mode_token = (pipeline_cfg.registry.target_platform_mode or "").upper()
runs_in_warehouse = "WAREHOUSE" in mode_token if mode_token else True

# Register the local `custom_model` module under both import names
# (presumably so the pickled scaler's class resolves on load — confirm).
import importlib
import sys
custom_model_mod = importlib.import_module("custom_model")
sys.modules.setdefault("custom_model", custom_model_mod)
sys.modules.setdefault("model_registry_showcase.custom_model", custom_model_mod)

options = {
    "enable_explainability": pipeline_cfg.registry.enable_explainability,
    "target_methods": ["predict"],
}
# Only relax version pins when no explicit dependencies were configured.
if not pipeline_cfg.registry.pip_requirements and not pipeline_cfg.registry.conda_dependencies:
    options["relax_version"] = True

log_kwargs = dict(
    model=custom_model_obj,
    model_name=pipeline_cfg.registry.model_name,
    version_name=version_name,
    comment=(
        "Linear Regression with custom preprocessing. "
        f"Logged on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        "."
    ),
    metrics=metrics,
    python_version=pipeline_cfg.registry.python_version,
    # RegressionWithScaler.predict() applies the scaler itself, so the sample
    # input must be the raw (unscaled) feature rows.
    sample_input_data=sample_data,
    options=options,
    ext_modules=[
        sys.modules[RegressionWithScaler.__module__],
        custom_model_mod,
    ],
)

if pipeline_cfg.registry.conda_dependencies:
    log_kwargs["conda_dependencies"] = pipeline_cfg.registry.conda_dependencies

if pipeline_cfg.registry.pip_requirements:
    log_kwargs["pip_requirements"] = pipeline_cfg.registry.pip_requirements

# artifact_repository_map and resource_constraint only apply to warehouse runs.
artifact_map = pipeline_cfg.registry.artifact_repository_map
if artifact_map and not runs_in_warehouse:
    logger.debug(
        "Ignoring artifact_repository_map because target platform mode %s does not include WAREHOUSE.",
        pipeline_cfg.registry.target_platform_mode,
    )
    artifact_map = None
if artifact_map is None and pipeline_cfg.registry.pip_requirements and runs_in_warehouse:
    artifact_map = {"pip": "snowflake.snowpark.pypi_shared_repository"}
if artifact_map:
    log_kwargs["artifact_repository_map"] = artifact_map

resource_constraint = pipeline_cfg.registry.resource_constraint if runs_in_warehouse else None
if resource_constraint:
    log_kwargs["resource_constraint"] = resource_constraint

# Translate the configured mode into the list of platform names accepted by
# log_model(target_platforms=...). Mode names such as "WAREHOUSE_ONLY" are not
# attributes of the TargetPlatform enum, so looking them up there always
# failed and the configured platform was silently dropped.
platforms_by_mode = {
    "WAREHOUSE": ["WAREHOUSE"],
    "SNOWPARK_CONTAINER_SERVICES": ["SNOWPARK_CONTAINER_SERVICES"],
    "WAREHOUSE_ONLY": ["WAREHOUSE"],
    "SNOWPARK_CONTAINER_SERVICES_ONLY": ["SNOWPARK_CONTAINER_SERVICES"],
    "BOTH_WAREHOUSE_AND_SNOWPARK_CONTAINER_SERVICES": [
        "WAREHOUSE",
        "SNOWPARK_CONTAINER_SERVICES",
    ],
}
target_platform_arg = platforms_by_mode.get(mode_token)
if mode_token and target_platform_arg is None:
    logger.warning(
        "Unknown target_platform_mode '%s'", pipeline_cfg.registry.target_platform_mode
    )

if target_platform_arg:
    log_kwargs["target_platforms"] = target_platform_arg

model_version = registry.log_model(**log_kwargs)
logger.info("Logged model version: %s", model_version.version_name)

In [None]:
# List the models registered in the (hard-coded) ML_SHOWCASE database.
session.sql("SHOW MODELS IN DATABASE ML_SHOWCASE").toPandas()

In [None]:
# Import the class rather than the `registry` module: importing the module
# would overwrite the notebook-level `registry` instance and break a re-run
# of the cell that calls registry.log_model().
from snowflake.ml.registry import Registry

# Best-effort attempt to run a specific, previously logged version directly.
# The database, schema, model and version names are hard-coded.
try:
    reg = Registry(session=session, database_name='ML_SHOWCASE', schema_name='MODELS')
    mv = reg.get_model('LINEAR_REGRESSION_CUSTOM').version('V_20251110_190603')
    mv.run(sample_data, function_name='PREDICT') # Need to Deploy a SPCS Endpoint
except Exception as e:
    # Deliberately broad: the failure is expected until a service is deployed.
    print(e)
    print("Need to Deploy a SPCS Endpoint")

# Deploy Service 

In [None]:
def ensure_compute_pool(session, serving_cfg: ServingConfig) -> None:
    """Create the configured compute pool if it does not already exist.

    Does nothing when ``serving_cfg.compute_pool`` is empty. The pool's node
    bounds are taken from ``min_instances`` / ``max_instances`` and its
    instance family from ``instance_family``.
    """
    pool_name = serving_cfg.compute_pool
    if not pool_name:
        return

    statement = f"""
    CREATE COMPUTE POOL IF NOT EXISTS {pool_name}
        MIN_NODES = {serving_cfg.min_instances}
        MAX_NODES = {serving_cfg.max_instances}
        INSTANCE_FAMILY = '{serving_cfg.instance_family}'
    """
    session.sql(statement).collect()
    logger.info("Compute pool ensured: %s", pool_name)


In [None]:
# Import the class directly so `registry` is never bound to the module, not
# even transiently if constructing the Registry fails.
from snowflake.ml.registry import Registry

registry = Registry(session=session, database_name='ML_SHOWCASE', schema_name='MODELS')
ensure_compute_pool(session,  pipeline_cfg.serving)

In [None]:
# Deploy the logged model version as a Snowpark Container Services service.
# Requires `model_version` from the log-model cell.
# NOTE(review): serving.enabled is not checked here — confirm whether it should gate this cell.
deploy_kwargs: Dict[str, Any] = {}
compute_pool = pipeline_cfg.serving.compute_pool
if compute_pool:
    deploy_kwargs["compute_pool"] = compute_pool

# Use the configured service name, or derive one from the model name and version.
base_service_name = (
    pipeline_cfg.serving.service_name
    or f"{model_version.model_name}_{model_version.version_name}"
)
# A name containing "." is taken as already qualified; otherwise prefix it
# with the registry database and schema.
if "." in base_service_name:
    qualified_service_name = base_service_name
else:
    qualified_service_name = (
        f"{pipeline_cfg.registry.database}.{pipeline_cfg.registry.schema}.{base_service_name}"
    )

deploy_kwargs["service_name"] = qualified_service_name
# ServingConfig always declares min_instances/max_instances, so these two
# hasattr checks are always true for that dataclass.
if hasattr(pipeline_cfg.serving, "min_instances"):
    deploy_kwargs["min_instances"] = pipeline_cfg.serving.min_instances
if hasattr(pipeline_cfg.serving, "max_instances"):
    deploy_kwargs["max_instances"] = pipeline_cfg.serving.max_instances
if getattr(pipeline_cfg.serving, "force_rebuild", False):
    deploy_kwargs["force_rebuild"] = pipeline_cfg.serving.force_rebuild
if getattr(pipeline_cfg.serving, "num_workers", None) is not None:
    deploy_kwargs["num_workers"] = pipeline_cfg.serving.num_workers

# Best-effort cleanup: a failure to drop is only logged at DEBUG level and
# the deployment proceeds regardless.
if pipeline_cfg.serving.drop_existing_service:
    try:
        session.sql(f"DROP SERVICE IF EXISTS {qualified_service_name}").collect()
        logger.info("Dropped existing service if it existed: %s", qualified_service_name)
    except Exception as exc:
        logger.debug("Unable to drop existing service %s: %s", qualified_service_name, exc)

# Use whichever deployment helper this model version object exposes, in order
# of preference. Only the first two branches consume deploy_kwargs.
service = None
if hasattr(model_version, "deploy_to_snowpark_container_services"):
    service = model_version.deploy_to_snowpark_container_services(**deploy_kwargs)
elif hasattr(model_version, "deploy"):
    deploy_kwargs.setdefault("target_platform", "SNOWPARK_CONTAINER_SERVICES")
    service = model_version.deploy(**deploy_kwargs)
elif hasattr(model_version, "create_service"):
    if not compute_pool:
        raise ValueError("serving.compute_pool must be set to use create_service().")
    # Builds its own argument set; note that min_instances is not forwarded here.
    create_kwargs: Dict[str, Any] = {
        "service_name": qualified_service_name,
        "service_compute_pool": compute_pool,
        "ingress_enabled": True,
        "max_instances": pipeline_cfg.serving.max_instances,
    }
    if getattr(pipeline_cfg.serving, "force_rebuild", False):
        create_kwargs["force_rebuild"] = pipeline_cfg.serving.force_rebuild
    if getattr(pipeline_cfg.serving, "num_workers", None) is not None:
        create_kwargs["num_workers"] = pipeline_cfg.serving.num_workers
    service = model_version.create_service(**create_kwargs)
else:
    raise AttributeError("Snowflake ML SDK does not expose an SPCS deployment helper in this version.")

logger.info("Requested service deployment for %s", qualified_service_name)

In [None]:
from snowflake.ml.registry import Registry

# reuse the same registry/session you used to create the service
# (database, schema and model names are hard-coded to the base_cfg values)
registry = Registry(session=session, database_name="ML_SHOWCASE", schema_name="MODELS")
mv = registry.get_model("LINEAR_REGRESSION_CUSTOM").default   # or pick a specific version

mv.show_functions()  # optional: see which inference functions are exposed


In [None]:
# Run inference through the deployed service. The service name is hard-coded
# and matches serving.service_name in base_cfg.
prediction_df = mv.run(
    sample_data,                      # pandas or Snowpark DataFrame
    function_name="predict",      # matches @custom_model.inference_api name
    service_name="LINEAR_REGRESSION_SERVICE"  # the service you created
)

In [None]:
sample_data

In [None]:
prediction_df