# SP500 Stock Demo — Notebook 03: Train + Register Model

- Generate 3‑month ahead target from hourly features
- Time-based split and train `XGBRegressor`
- Evaluate metrics (RMSE, MAPE, R2)
- Register model in Snowflake Model Registry


In [None]:
# 0) Imports and session/context
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark.functions import col, lead, avg, sqrt
from snowflake.snowpark.functions import abs as sp_abs
from snowflake.snowpark.functions import pow as sp_pow
from snowflake.snowpark import Window
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.registry import Registry
from snowflake.ml.experiment.experiment_tracking import ExperimentTracking

# Bind to the notebook's active Snowpark session and select the warehouse,
# database and schema that the rest of the notebook reads from / writes to.
session = get_active_session()
for _use_stmt in (
    "USE WAREHOUSE DEMO_WH_M",
    "USE DATABASE SP500_STOCK_DEMO",
    "USE SCHEMA DATA",
):
    session.sql(_use_stmt).collect()

# Experiment Tracking: select the experiment used by the runs started below.
exp = ExperimentTracking(session=session)
exp.set_experiment("SP500_XGB_RET3M")

# Sanity check: preview a few rows of the feature table.
# NOTE(review): an earlier comment called this the "V2 feature table with
# SECTOR enrichment"; nothing here verifies that — confirm PRICE_FEATURES is
# the intended table.
_feature_preview = session.table('PRICE_FEATURES').limit(5)
_feature_preview.show()


In [None]:
# 1) Build supervised dataset: 3-month ahead target (approx 378 trading hours)
hourly = session.table('PRICE_FEATURES')
horizon_hours = 378  # number of rows lead() looks ahead within a ticker

# Per-ticker window ordered by timestamp, so the look-ahead never crosses
# from one ticker into another.
win_order = Window.partition_by('TICKER').order_by(col('TS'))

# Close price `horizon_hours` rows later, and the relative change up to it.
_future_close = lead(col('CLOSE'), horizon_hours).over(win_order)
_forward_return = col('FUT_CLOSE') / col('CLOSE') - 1

ds = hourly.with_column('FUT_CLOSE', _future_close)
ds = ds.with_column('TARGET_PCT_3M', _forward_return)
ds = ds.drop('FUT_CLOSE')
# Keep only rows that actually have a target (e.g. the trailing rows of each
# ticker have no future row for lead() to read).
ds = ds.filter(col('TARGET_PCT_3M').is_not_null())

ds.limit(5).show()


In [None]:
# 2) Train/test split by time (choose cutoff from supervised ds; fallback to ensure non-empty test)
from datetime import timedelta
from snowflake.snowpark.functions import max as sp_max
from snowflake.snowpark.functions import min as sp_min

# Latest timestamp that still has a target; every candidate cutoff is
# expressed relative to it.
max_ts_ds = ds.select(sp_max(col('TS')).alias('mx')).collect()[0]['MX']
if max_ts_ds is None:
    # MAX() over zero rows is NULL; fail with an actionable message instead
    # of a TypeError from `None - timedelta` further down.
    raise ValueError(
        "Supervised dataset 'ds' is empty: PRICE_FEATURES has no rows with a "
        f"non-null target {horizon_hours} rows ahead; cannot build a time split."
    )

cutoff = None
train_df = None
test_df = None
train_rows = 0
test_rows = 0

# NOTE(review): the target of a row looks `horizon_hours` rows ahead, so
# training rows just before the cutoff have targets computed from closes that
# fall after the cutoff. Consider dropping (embargoing) those rows if a
# leakage-free evaluation is required.

# Try progressively older cutoffs to avoid empty splits due to lead horizon trimming
for days in [30, 60, 90, 120]:
    try_cutoff = max_ts_ds - timedelta(days=days)
    train_try = ds.filter(col('TS') < try_cutoff)
    test_try = ds.filter(col('TS') >= try_cutoff)
    trc = train_try.count()
    tec = test_try.count()
    if trc > 0 and tec > 0:
        cutoff = try_cutoff
        train_df, test_df = train_try, test_try
        # Remember the counts so the summary below does not re-run the queries.
        train_rows, test_rows = trc, tec
        break

# As a last resort, split at 80% of the covered time span if the above failed
if cutoff is None:
    bounds = ds.select(sp_min(col('TS')).alias('mn'), sp_max(col('TS')).alias('mx')).collect()[0]
    mn_ts, mx_ts = bounds['MN'], bounds['MX']
    # Boundary in timestamp space (not an exact 80/20 split by row count).
    boundary = mn_ts + (mx_ts - mn_ts) * 0.8
    train_df = ds.filter(col('TS') < boundary)
    test_df = ds.filter(col('TS') >= boundary)
    cutoff = boundary
    train_rows, test_rows = train_df.count(), test_df.count()

print({'cutoff': cutoff})
print('Train rows:', train_rows)
print('Test rows:', test_rows)


In [None]:
# 3) Define features and train XGBRegressor (+ compact hyperparam sweep with seaborn plot)
# Input columns fed to the model, the column to predict, and the name of the
# prediction column the model writes.
feature_cols = ['RET_1','SMA_5','SMA_20','VOL_20','RSI_PROXY','VOLUME','CLOSE']
label_col = 'TARGET_PCT_3M'
output_col = 'PREDICTED_RETURN'

# Report dataset sizes and feature count before training (one count query
# per DataFrame, in the order total / train / test).
_row_counts = {
    key: frame.count()
    for key, frame in (
        ('total_rows', ds),
        ('train_rows', train_df),
        ('test_rows', test_df),
    )
}
print({**_row_counts, 'num_features': len(feature_cols)})

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from snowflake.ml.modeling.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

# Notebook path: metrics come straight from the Snowflake ML metrics APIs,
# which take keyword-only arguments (plural column-name keywords for the
# error metrics, singular ones for r2_score).
# Note (from the original author): the server-side stored procedure path uses
# Snowpark-based metrics instead, due to package availability.
def compute_metrics_quick(df, y_col: str, yhat_col: str) -> dict:
    """Return RMSE, MAPE and R2 for predictions stored in *df*.

    Args:
        df: DataFrame holding both the true and the predicted column.
        y_col: Name of the ground-truth column.
        yhat_col: Name of the prediction column.

    Returns:
        A dict with the keys ``'rmse'``, ``'mape'`` and ``'r2'``.
    """
    error_kwargs = {
        'df': df,
        'y_true_col_names': y_col,
        'y_pred_col_names': yhat_col,
    }
    return {
        # squared=False turns the mean squared error into its root.
        'rmse': mean_squared_error(squared=False, **error_kwargs),
        'mape': mean_absolute_percentage_error(**error_kwargs),
        'r2': r2_score(df=df, y_true_col_name=y_col, y_pred_col_name=yhat_col),
    }

# Compact grid similar to reference (learning_rate vs mape; hue=n_estimators; facet by max_depth)
max_depth_opts = [4, 6]
learning_rate_opts = [0.1, 0.05]
n_estimators_opts = [100, 200]

# Flatten the grid up front; the order matches nested loops over
# depth -> learning rate -> number of estimators.
_sweep_grid = [
    (depth_opt, lr_opt, n_opt)
    for depth_opt in max_depth_opts
    for lr_opt in learning_rate_opts
    for n_opt in n_estimators_opts
]
_metric_keys = ('rmse', 'mape', 'r2')

rows = []
for depth, lr, n in _sweep_grid:
    run_name = f"xgb_depth={depth}_lr={lr}_n={n}"
    # One tracked run per configuration: parameters first, metrics once the
    # candidate has been fitted and scored on the test split.
    with exp.start_run(run_name):
        run_params = {
            "model": "XGBRegressor",
            "max_depth": int(depth),
            "learning_rate": float(lr),
            "n_estimators": int(n),
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "feature_cols": feature_cols,
            "label_col": label_col,
            "output_col": output_col,
            "time_split_cutoff": cutoff.isoformat() if cutoff else "auto",
            "dataset_table": "PRICE_FEATURES",
            "horizon_hours": int(horizon_hours),
        }
        exp.log_params(run_params)

        xgb_tmp = XGBRegressor(
            n_estimators=n,
            max_depth=depth,
            learning_rate=lr,
            subsample=0.8,
            colsample_bytree=0.8,
            input_cols=feature_cols,
            label_cols=[label_col],
            output_cols=[output_col],
            random_state=42,
        )
        model_tmp = Pipeline(steps=[('xgb', xgb_tmp)]).fit(train_df)
        pred_tmp = model_tmp.predict(test_df).select(label_col, output_col)
        m = compute_metrics_quick(pred_tmp, label_col, output_col)

        # Keep the raw metric values for the results table ...
        result_row = {'max_depth': depth, 'learning_rate': lr, 'n_estimators': n}
        for key in _metric_keys:
            result_row[key] = m[key]
        rows.append(result_row)

        # ... and log them to the run as plain floats.
        exp.log_metrics({key: float(m[key]) for key in _metric_keys})

# Sweep results as a table; rows without a MAPE or RMSE are dropped so the
# later seaborn plot does not choke on nulls. Sorted by MAPE for display.
hp_df = (
    pd.DataFrame(rows)
    .dropna(subset=['mape', 'rmse'])
    .sort_values('mape')
    .reset_index(drop=True)
)

# Pick the configuration for the final model: lowest RMSE from the sweep, or
# fixed baseline parameters when the sweep left no usable rows.
if hp_df.empty:
    best = {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05}
    _trained_msg = 'Sweep produced no metrics; trained baseline config:'
else:
    best = hp_df.sort_values('rmse').iloc[0].to_dict()
    _trained_msg = 'Model trained with best config (by RMSE):'

# The estimator is built the same way in both cases; only the three swept
# hyperparameters differ. The casts undo the float conversion that pulling a
# row out of the DataFrame applies to integer columns.
xgb = XGBRegressor(
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    learning_rate=float(best['learning_rate']),
    subsample=0.8,
    colsample_bytree=0.8,
    input_cols=feature_cols,
    label_cols=[label_col],
    output_cols=[output_col],
    random_state=42,
)
pipe = Pipeline(steps=[('xgb', xgb)])
model = pipe.fit(train_df)
print(_trained_msg, best)

# Tuning curve similar to reference: learning_rate vs MAPE, one line per
# n_estimators value, one facet per max_depth. Skipped when the sweep left
# no rows to plot.
if hp_df.empty:
    print('Skipping tuning plot: no sweep results')
else:
    sns.set_context('notebook', font_scale=0.9)
    _relplot_kwargs = dict(
        data=hp_df,
        x='learning_rate', y='mape', hue='n_estimators', col='max_depth',
        kind='line', marker='o', height=4, aspect=1.2,
    )
    # relplot returns a FacetGrid, which is then labelled per facet.
    g = sns.relplot(**_relplot_kwargs)
    g.set_titles('max_depth = {col_name}')
    g.set_axis_labels('learning_rate', 'MAPE')

In [None]:
# 4b) (Disabled) Old sweep cell replaced by seaborn plot in cell 3
# Left intentionally minimal to avoid duplicate tuning runs.
# The body is a no-op, so running the whole notebook is unaffected by this cell.
pass


In [None]:
# 4) Evaluate metrics (RMSE, MAPE, R2) using robust helper
# Score the final model on both splits; only the label and prediction
# columns are needed by the metric helper.
train_pred = model.predict(train_df).select(label_col, output_col)
test_pred = model.predict(test_df).select(label_col, output_col)

train_metrics = compute_metrics_quick(train_pred, label_col, output_col)
test_metrics = compute_metrics_quick(test_pred, label_col, output_col)

print({'train': train_metrics, 'test': test_metrics})

# Log final evaluation and selection as a run for traceability
with exp.start_run("final_best_selection"):
    exp.log_params({
        # NOTE(review): this is logged as "rmse" even when the baseline
        # fallback configuration was used instead of a sweep winner.
        "selected_by": "rmse",
        # Coerce exactly like the sibling parameters below. `best` holds
        # either Python ints (baseline) or floats (row taken from the sweep
        # table), and the same int() cast was already applied when the final
        # estimator was built, so no type guard is needed here.
        "best_max_depth": int(best["max_depth"]),
        "best_learning_rate": float(best["learning_rate"]),
        "best_n_estimators": int(best["n_estimators"]),
        "feature_cols": feature_cols,
        "label_col": label_col,
        "output_col": output_col,
        "time_split_cutoff": cutoff.isoformat() if cutoff else "auto",
        "dataset_table": "PRICE_FEATURES",
        "horizon_hours": int(horizon_hours),
    })

    # Metrics are logged as plain floats, prefixed by the split they refer to.
    exp.log_metrics({
        "train_rmse": float(train_metrics["rmse"]),
        "train_mape": float(train_metrics["mape"]),
        "train_r2": float(train_metrics["r2"]),
        "test_rmse": float(test_metrics["rmse"]),
        "test_mape": float(test_metrics["mape"]),
        "test_r2": float(test_metrics["r2"]),
    })


In [None]:
# 5) Register in Model Registry (enable Snowflake explainability)
import ast
import builtins

reg = Registry(session=session, database_name='SP500_STOCK_DEMO', schema_name='DATA')
model_name = 'XGB_SP500_RET3M'


def _trailing_version_number(version_name):
    """Return the integer after the last '_' of *version_name*, or None.

    Version names that do not end in a number (for example a hand-picked
    name) yield None instead of raising, so they cannot break auto-numbering.
    """
    try:
        return int(version_name.split('_')[-1])
    except ValueError:
        return None


# Choose the next free "V_<n>" name: V_1 for a model that is not registered
# yet, otherwise one more than the highest numeric suffix found.
models_df = reg.show_models()
if models_df.empty or model_name not in models_df['name'].to_list():
    version = 'V_1'
else:
    # The 'versions' cell is parsed as a Python list literal of version names.
    raw_versions = models_df.loc[models_df['name'] == model_name, 'versions'].values[0]
    existing_versions = ast.literal_eval(raw_versions)
    numbered = [
        number
        for number in map(_trailing_version_number, existing_versions)
        if number is not None
    ]
    # builtins.max is used explicitly (as in the original cell), presumably to
    # stay independent of any Snowpark `max` imported into the notebook;
    # default=0 covers a model with no numerically-suffixed versions.
    max_v = builtins.max(numbered, default=0)
    version = f'V_{max_v+1}'

# Background sample (features only, pulled to pandas) handed to the registry
# as sample input so SHAP explainability can be enabled for the model.
bg_pd = train_df.select(feature_cols).limit(1000).to_pandas()

# Metrics stored with the model version: train/test pairs for R2, RMSE, MAPE.
_registry_metrics = {
    f'{split}_{metric}': values[metric]
    for metric in ('r2', 'rmse', 'mape')
    for split, values in (('train', train_metrics), ('test', test_metrics))
}

mv = reg.log_model(
    model,
    model_name=model_name,
    version_name=version,
    conda_dependencies=['snowflake-ml-python'],
    comment='XGBRegressor predicting 3-month forward return from hourly features',
    metrics=_registry_metrics,
    sample_input_data=bg_pd,
    options={'relax_version': False, 'enable_explainability': True}
)
# Make the freshly logged version the model's default.
reg.get_model(model_name).default = version

# Record the registry coordinates in their own tracked run.
_registry_ref = {
    "registry_model_name": model_name,
    "registry_version": version,
}
with exp.start_run("final_best_selection_regref"):
    exp.log_params(_registry_ref)

print({'model_name': model_name, 'version': version})


In [None]:
# 6) Snowflake explainability: compute SHAP-based global feature importance + seaborn bar plot
# explain() availability was enabled during logging (cell 5) using background data.
from snowflake.snowpark.functions import abs as sp_abs, col as sp_col
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

reg = Registry(session=session, database_name='SP500_STOCK_DEMO', schema_name='DATA')
mv_explain = reg.get_model('XGB_SP500_RET3M').default

# Take a small sample of inputs (features only) for explanations
sample_sp = ds.select(*feature_cols).limit(200)

# Run SHAP explain through Model Registry (Snowflake-built explainability)
explanations_sp = mv_explain.run(sample_sp, function_name='explain')

# Aggregate mean absolute SHAP per feature
ex_pd = explanations_sp.to_pandas()
cols = list(ex_pd.columns)


def _find_shap_column(feature, columns):
    """Return the column of *columns* holding the SHAP values of *feature*.

    Dedicated explanation columns ("<feature>_explanation", then
    "SHAP_<feature>") are preferred. The bare feature name is tried last,
    because a result that also carries the input columns through would
    otherwise be aggregated over raw feature values instead of SHAP values.
    Each candidate is tried as an exact match first, then ignoring case and
    surrounding double quotes. Returns None when nothing matches.
    """
    normalized = {}
    for column in columns:
        normalized.setdefault(str(column).strip('"').upper(), column)
    for candidate in (f"{feature}_explanation", f"SHAP_{feature}", feature):
        if candidate in columns:
            return candidate
        match = normalized.get(candidate.upper())
        if match is not None:
            return match
    return None


shap_map = {}
for f in feature_cols:
    c = _find_shap_column(f, cols)
    if c is not None:
        shap_map[f] = c

if not shap_map:
    # Without this guard the sort below fails with an unhelpful KeyError.
    raise RuntimeError(
        f"No SHAP explanation columns found for features {feature_cols}; "
        f"explain() returned columns {cols}"
    )

imp_rows = []
for f, c in shap_map.items():
    imp = float(pd.Series(ex_pd[c]).abs().mean())
    imp_rows.append({'feature': f, 'mean_abs_shap': imp})
imp_df = pd.DataFrame(imp_rows).sort_values('mean_abs_shap', ascending=False)

# Persist and plot
session.create_dataframe(imp_df).write.save_as_table('FEATURE_SHAP_GLOBAL_TOP', mode='overwrite')

sns.set_context('notebook', font_scale=0.9)
plt.figure(figsize=(8, 4))
sns.barplot(data=imp_df, x='mean_abs_shap', y='feature', orient='h')
plt.title('Global feature importance (mean |SHAP|)')
plt.xlabel('Mean |SHAP|')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
print(imp_df.head(10))


In [None]:
# 6b) Explainability: feature importances (from sklearn view) — optional quick view
sk_pipe = model.to_sklearn()

# First pipeline step whose name contains "xgb" (case-insensitive), if any.
xgb_step = next(
    (
        candidate
        for step_name, candidate in sk_pipe.named_steps.items()
        if 'xgb' in step_name.lower()
    ),
    None,
)

if xgb_step is None or not hasattr(xgb_step, 'feature_importances_'):
    print('Feature importances not available from sklearn object.')
else:
    import pandas as pd
    # Pair each feature with its importance; the importance list is cut to
    # the number of features so both columns have the same length.
    importances = xgb_step.feature_importances_.tolist()[:len(feature_cols)]
    fi = pd.DataFrame({
        'feature': feature_cols,
        'importance': importances,
    }).sort_values('importance', ascending=False)
    # Persist the ranking as a table, replacing any previous contents.
    fi_sp = session.create_dataframe(fi)
    fi_sp.write.save_as_table('FEATURE_IMPORTANCE_SP500', mode='overwrite')
    print('Saved FEATURE_IMPORTANCE_SP500')
