# SP500 Stock Demo — Notebook 03: Train + Register Model

- Generate 3‑month ahead target from hourly features
- Time-based split and train `XGBRegressor`
- Evaluate metrics (RMSE, MAPE, R2)
- Register model in Snowflake Model Registry


In [None]:
# 0) Imports and session/context
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark.functions import col, lead, avg, sqrt
from snowflake.snowpark.functions import abs as sp_abs
from snowflake.snowpark.functions import pow as sp_pow
from snowflake.snowpark import Window
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.registry import Registry

# Bind to the notebook's active Snowpark session, then pin the warehouse,
# database and schema that every later cell relies on.
session = get_active_session()
for context_stmt in (
    "USE WAREHOUSE DEMO_WH_M",
    "USE DATABASE SP500_STOCK_DEMO",
    "USE SCHEMA DATA",
):
    session.sql(context_stmt).collect()

# Quick sanity check: preview a handful of feature rows.
features_preview = session.table('PRICE_FEATURES').limit(5)
features_preview.show()


In [None]:
# 1) Build supervised dataset: 3-month ahead target (approx 378 trading hours)
horizon_hours = 378
hourly = session.table('PRICE_FEATURES')

# Per-ticker, time-ordered window used to look `horizon_hours` rows ahead.
win_order = Window.partition_by('TICKER').order_by(col('TS'))
future_close = lead(col('CLOSE'), horizon_hours).over(win_order)

# Attach the future close, turn it into a forward return, then drop the helper
# column and every row whose look-ahead ran past the end of the series.
with_future = hourly.with_column('FUT_CLOSE', future_close)
with_target = with_future.with_column('TARGET_PCT_3M', (col('FUT_CLOSE')/col('CLOSE') - 1))
ds = with_target.drop('FUT_CLOSE').filter(col('TARGET_PCT_3M').is_not_null())

ds.limit(5).show()


In [None]:
# 2) Train/test split by time (last 30 days of *labeled* data for test)
# Cell-local imports keep this cell self-contained; `max` is aliased so the
# Python builtin is not shadowed for later cells.
from snowflake.snowpark.functions import dateadd, lit
from snowflake.snowpark.functions import max as sp_max

# Anchor the cutoff on `ds`, not on the raw PRICE_FEATURES table. Rows in the
# last `horizon_hours` bars of each ticker have no future close and were
# filtered out of `ds`; if ~378 bars span about three months, the final 30
# calendar days of the raw table hold only such unlabeled rows, so a cutoff
# derived from the raw table would leave the test split empty.
cutoff = ds.select(
    dateadd('day', lit(-30), sp_max(col('TS'))).alias('C')
).collect()[0]['C']
if cutoff is None:
    # max(TS) over an empty dataset is NULL; fail loudly instead of filtering on NULL.
    raise ValueError('No labeled rows in dataset: cannot derive a train/test cutoff.')

# NOTE(review): targets of training rows within `horizon_hours` of the cutoff
# are computed from prices that fall inside the test window. Consider purging
# those rows if a leakage-free evaluation is required.
train_df = ds.filter(col('TS') < cutoff)
test_df = ds.filter(col('TS') >= cutoff)

print({'cutoff': cutoff})
print('Train rows:', train_df.count())
print('Test rows:', test_df.count())


In [None]:
# 3) Define features and train XGBRegressor
feature_cols = ['RET_1','SMA_5','SMA_20','VOL_20','RSI_PROXY','VOLUME','CLOSE']
label_col = 'TARGET_PCT_3M'
output_col = 'PREDICTED_RETURN'

# Report dataset sizes and the feature count before training starts.
dataset_summary = {
    'total_rows': ds.count(),
    'train_rows': train_df.count(),
    'test_rows': test_df.count(),
    'num_features': len(feature_cols)
}
print(dataset_summary)

# Booster hyperparameters are grouped so they can be read (and tuned) in one place.
xgb_params = dict(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgb = XGBRegressor(
    input_cols=feature_cols,
    label_cols=[label_col],
    output_cols=[output_col],
    **xgb_params,
)

# Single-step pipeline: the regressor is fitted on the training split only.
pipe = Pipeline(steps=[('xgb', xgb)])
model = pipe.fit(train_df)
print('Model trained.')


In [None]:
# 4) Evaluate metrics (RMSE, MAPE, R2)

def compute_metrics(df, y_col: str, yhat_col: str) -> dict:
    """Compute RMSE, MAPE and R2 for predictions held in a Snowpark DataFrame.

    Args:
        df: DataFrame containing both the actual and the predicted column.
        y_col: Name of the column holding the actual values.
        yhat_col: Name of the column holding the predicted values.

    Returns:
        Dict with keys 'rmse', 'mape' and 'r2'. A value is None when the
        underlying aggregate is NULL (e.g. ``df`` has no rows), so an empty
        split is never reported as a perfect fit.
    """
    y = col(y_col)
    err = y - col(yhat_col)

    # One aggregation pass for everything that does not need the label mean.
    # MAPE divides by |y| + eps: adding eps to the signed value would shrink
    # the denominator toward zero for small negative actuals.
    stats = df.select(
        avg(sp_pow(err, 2)).alias('mse'),
        avg(sp_abs(err) / (sp_abs(y) + 1e-9)).alias('mape'),
        avg(y).alias('mean_y'),
    ).collect()[0]
    mse, mape, mean_y = stats['MSE'], stats['MAPE'], stats['MEAN_Y']

    rmse = None if mse is None else float(mse) ** 0.5
    mape = None if mape is None else float(mape)

    if mse is None or mean_y is None:
        # No usable rows: R2 is undefined rather than 1.0.
        return {'rmse': rmse, 'mape': mape, 'r2': None}

    # Second pass: population variance of the actuals around their mean.
    var_y = df.select(avg(sp_pow(y - mean_y, 2)).alias('var')).collect()[0]['VAR']

    if var_y is None:
        r2 = None
    elif var_y == 0:
        # Constant actuals: follow the common convention of 1.0 for an exact
        # fit and 0.0 otherwise, instead of always reporting 1.0.
        r2 = 1.0 if mse == 0 else 0.0
    else:
        r2 = 1.0 - float(mse) / float(var_y)

    return {'rmse': rmse, 'mape': mape, 'r2': r2}

# Score both splits, keeping only the actual and predicted columns.
eval_cols = (label_col, output_col)
train_pred = model.predict(train_df).select(*eval_cols)
test_pred = model.predict(test_df).select(*eval_cols)

# Same metric routine for each split so the numbers are directly comparable.
train_metrics = compute_metrics(train_pred, *eval_cols)
test_metrics = compute_metrics(test_pred, *eval_cols)

print({'train': train_metrics, 'test': test_metrics})


In [None]:
# 5) Register in Model Registry (enable Snowflake explainability)
import ast
import builtins
import re

reg = Registry(session=session, database_name='SP500_STOCK_DEMO', schema_name='DATA')
model_name = 'XGB_SP500_RET3M'

# Choose the next sequential version name (V_1, V_2, ...).
models_df = reg.show_models()
if models_df.empty or model_name not in models_df['name'].to_list():
    version = 'V_1'
else:
    raw_versions = models_df.loc[models_df['name'] == model_name, 'versions'].values[0]
    # The listing is parsed from its string form; an already-parsed sequence is used as is.
    existing_versions = ast.literal_eval(raw_versions) if isinstance(raw_versions, str) else list(raw_versions)
    # Only names shaped like V_<n> take part in the numbering. Versions with
    # other names no longer crash the int() conversion, and a model without
    # any V_<n> version starts again at V_1 instead of failing on max([]).
    version_matches = (re.fullmatch(r'V_(\d+)', str(v)) for v in existing_versions)
    version_numbers = [int(m.group(1)) for m in version_matches if m]
    # builtins.max is used explicitly in case `max` was rebound elsewhere in the notebook.
    max_v = builtins.max(version_numbers, default=0)
    version = f'V_{max_v+1}'

# Provide background data to enable SHAP explainability in the Model Registry
bg_pd = train_df.select(feature_cols).limit(1000).to_pandas()

mv = reg.log_model(
    model,
    model_name=model_name,
    version_name=version,
    conda_dependencies=['snowflake-ml-python'],
    comment='XGBRegressor predicting 3-month forward return from hourly features',
    metrics={
        'train_r2': train_metrics['r2'],
        'test_r2': test_metrics['r2'],
        'train_rmse': train_metrics['rmse'],
        'test_rmse': test_metrics['rmse'],
        'train_mape': train_metrics['mape'],
        'test_mape': test_metrics['mape'],
    },
    sample_input_data=bg_pd,
    options={'relax_version': False, 'enable_explainability': True}
)
# Make the freshly logged version the one served by default.
reg.get_model(model_name).default = version

print({'model_name': model_name, 'version': version})


In [None]:
# 6) Snowflake explainability: compute SHAP-based global feature importance
# explain() availability was enabled during logging (cell 5) using background data.
import pandas as pd
from snowflake.snowpark.functions import abs as sp_abs, col as sp_col

reg = Registry(session=session, database_name='SP500_STOCK_DEMO', schema_name='DATA')
mv_explain = reg.get_model('XGB_SP500_RET3M').default

# Take a small sample of inputs (features only) for explanations
sample_sp = ds.select(*feature_cols).limit(200)

# Run SHAP explain through Model Registry (Snowflake-built explainability)
explanations_sp = mv_explain.run(sample_sp, function_name='explain')

# Aggregate mean absolute SHAP per feature
ex_pd = explanations_sp.to_pandas()
cols = list(ex_pd.columns)

# Index the output columns case-insensitively and without surrounding quotes.
cols_by_key = {}
for c in cols:
    cols_by_key.setdefault(str(c).strip('"').upper(), c)

# Prefer dedicated explanation columns over a column named exactly like the
# feature. NOTE(review): the registry is expected to name SHAP columns
# "<feature>_explanation" and may echo the input columns back - confirm for
# the installed snowflake-ml-python version. Matching the bare feature name
# first would then average raw feature values instead of SHAP values.
shap_map = {}
for f in feature_cols:
    for candidate in (f"{f}_explanation", f"SHAP_{f}", f):
        matched = cols_by_key.get(candidate.upper())
        if matched is not None:
            shap_map[f] = matched
            break

if not shap_map:
    # Fail with a readable message instead of a KeyError from sort_values below.
    raise ValueError(f"No explanation columns found for features {feature_cols}; got columns {cols}")

imp_rows = [
    {'feature': f, 'mean_abs_shap': float(pd.Series(ex_pd[c]).abs().mean())}
    for f, c in shap_map.items()
]
imp_df = pd.DataFrame(imp_rows).sort_values('mean_abs_shap', ascending=False)

session.create_dataframe(imp_df).write.save_as_table('FEATURE_SHAP_GLOBAL_TOP', mode='overwrite')
print(imp_df.head(10))


In [None]:
# 7) Explainability: feature importances (quick view)
# Best-effort cell: any failure is reported and the notebook keeps going.
try:
    sk_pipe = model.to_sklearn()
    # First pipeline step whose name mentions "xgb", or None when there is none.
    xgb_step = next(
        (step for name, step in sk_pipe.named_steps.items() if 'xgb' in name.lower()),
        None,
    )
    if xgb_step is None or not hasattr(xgb_step, 'feature_importances_'):
        print('Feature importances not available from sklearn object.')
    else:
        import pandas as pd
        # Truncate to the declared feature list so both columns line up.
        importances = xgb_step.feature_importances_.tolist()[:len(feature_cols)]
        fi = pd.DataFrame({'feature': feature_cols, 'importance': importances})
        fi = fi.sort_values('importance', ascending=False)
        # Persist the ranking so it can be queried outside the notebook.
        fi_sp = session.create_dataframe(fi)
        fi_sp.write.save_as_table('FEATURE_IMPORTANCE_SP500', mode='overwrite')
        print('Saved FEATURE_IMPORTANCE_SP500')
except Exception as e:
    print('Explainability step skipped:', e)
