# SP500 Stock Demo — Notebook 05: Incremental Retraining + Compare

- Retrain model on most recent window
- Register new version if metrics improve
- Compare versions


In [None]:
# 0) Imports and session/context
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark.functions import col, lead, avg, sqrt
from snowflake.snowpark.functions import abs as sp_abs
from snowflake.snowpark.functions import pow as sp_pow
from snowflake.snowpark import Window
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.registry import Registry

# Bind to the notebook's active Snowpark session, then pin the warehouse,
# database and schema used by the unqualified table names in later cells.
session = get_active_session()
for use_stmt in (
    "USE WAREHOUSE DEMO_WH_M",
    "USE DATABASE SP500_STOCK_DEMO",
    "USE SCHEMA DATA",
):
    session.sql(use_stmt).collect()

# Use enriched features (V2 content in PRICE_FEATURES)



In [None]:
# 1) Define base dataset and time windows
hourly = session.table('PRICE_FEATURES')
win_order = Window.partition_by('TICKER').order_by(col('TS'))
horizon_hours = 378  # look-ahead, counted in rows of the per-ticker ordered window

# Label: fractional change from the current CLOSE to the CLOSE `horizon_hours`
# rows later for the same ticker. Rows with no such future row get a NULL
# label from lead() and are filtered out, so `ds` ends earlier than the table.
future_close = lead(col('CLOSE'), horizon_hours).over(win_order)
with_future = hourly.with_column('FUT_CLOSE', future_close)
with_target = with_future.with_column('TARGET_PCT_3M', (col('FUT_CLOSE')/col('CLOSE') - 1))
ds = with_target.drop('FUT_CLOSE').filter(col('TARGET_PCT_3M').is_not_null())


def _split_by_cutoff(cutoff):
    """Return (train, test) views of `ds`: rows with TS before `cutoff`, rows at/after it."""
    return ds.filter(col('TS') < cutoff), ds.filter(col('TS') >= cutoff)


# NOTE(review): both cutoffs are derived from max(TS) of the raw table, while
# `ds` no longer contains each ticker's last `horizon_hours` rows. If those
# rows cover more than 15-30 calendar days the test windows below would be
# empty -- confirm against the actual data granularity.
# NOTE(review): labels of training rows just before a cutoff are computed from
# closes after it; presumably acceptable for the demo, but verify.

# Old split used in V_1
cutoff_old = session.sql("select dateadd('day', -30, max(TS)) as c from PRICE_FEATURES").collect()[0]['C']
train_old, test_old = _split_by_cutoff(cutoff_old)

# New split: shift the window forward by 15 days
cutoff_new = session.sql("select dateadd('day', -15, max(TS)) as c from PRICE_FEATURES").collect()[0]['C']
train_new, test_new = _split_by_cutoff(cutoff_new)

print({'cutoff_old': cutoff_old, 'cutoff_new': cutoff_new})


In [None]:
# 2) Retrain with fixed XGB params
feature_cols = ['RET_1','SMA_5','SMA_20','VOL_20','RSI_PROXY','VOLUME','CLOSE']
label_col = 'TARGET_PCT_3M'
output_col = 'PREDICTED_RETURN'

# Hyper-parameters are held constant; only the training window changes
# in this notebook.
xgb_params = {
    'n_estimators': 200,
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
}

xgb = XGBRegressor(
    input_cols=feature_cols,
    label_cols=[label_col],
    output_cols=[output_col],
    **xgb_params,
)

# Single-step pipeline, fitted on the shifted training window.
pipe = Pipeline(steps=[('xgb', xgb)])
model_new = pipe.fit(train_new)
print('Retrained.')


In [None]:
# 3) Evaluate and compare with previous registered version
from snowflake.snowpark.functions import avg

def compute_metrics(df, y_col, yhat_col):
    """Compute RMSE and R^2 of predictions against labels on a Snowpark DataFrame.

    Args:
        df: DataFrame containing at least `y_col` and `yhat_col`.
        y_col: Name of the ground-truth label column.
        yhat_col: Name of the prediction column.

    Returns:
        dict with keys 'rmse' and 'r2' as Python floats. Both are None when
        the aggregates come back NULL (e.g. `df` has no rows), so callers can
        tell "no data" apart from a real score.
    """
    sq_err = sp_pow(col(y_col) - col(yhat_col), 2)

    # One round trip for every aggregate that does not depend on the label mean.
    first = df.select(
        sqrt(avg(sq_err)).alias('rmse'),
        avg(sq_err).alias('mse'),
        avg(col(y_col)).alias('mean_y'),
    ).collect()[0]
    if first['MSE'] is None or first['MEAN_Y'] is None:
        return {'rmse': None, 'r2': None}

    # Second round trip: variance of the label around its mean.
    mean_y = first['MEAN_Y']
    var_row = df.select(avg(sp_pow(col(y_col) - mean_y, 2)).alias('var')).collect()[0]

    # Aggregates may arrive as Decimal depending on column types; normalise to
    # float so the arithmetic below cannot mix float and Decimal.
    rmse = float(first['RMSE'])
    mse = float(first['MSE'])
    var_y = float(var_row['VAR']) if var_row['VAR'] is not None else 0.0

    if var_y == 0.0:
        # Constant label: R^2 is undefined. Follow the usual finite convention
        # (perfect fit -> 1.0, otherwise 0.0) instead of always reporting 1.0.
        r2 = 1.0 if mse == 0.0 else 0.0
    else:
        r2 = 1.0 - mse / var_y
    return {'rmse': rmse, 'r2': r2}

# Score both versions on the SAME holdout (test_new). RMSE values measured on
# different windows are not comparable, and test_new lies after both training
# cutoffs (cutoff_old < cutoff_new), so it is out-of-sample for V_1 as well.
eval_registry = Registry(session=session, database_name='SP500_STOCK_DEMO', schema_name='DATA')
model_v1 = eval_registry.get_model('XGB_SP500_RET3M').version('V_1')

pred_old = model_v1.run(test_new, function_name='PREDICT')
metrics_old = compute_metrics(pred_old.select(label_col, output_col), label_col, output_col)

pred_new = model_new.predict(test_new)
metrics_new = compute_metrics(pred_new.select(label_col, output_col), label_col, output_col)
print({'old': metrics_old, 'new': metrics_new})


In [None]:
# 4) Register new version if improved
reg = Registry(session=session, database_name='SP500_STOCK_DEMO', schema_name='DATA')
model_name = 'XGB_SP500_RET3M'

rmse_new = metrics_new['rmse']
rmse_old = metrics_old['rmse']

if rmse_new is None or rmse_old is None:
    # A missing RMSE cannot be compared; skip registration explicitly rather
    # than failing on a None comparison.
    print('No registration: RMSE unavailable for old or new model')
elif rmse_new < rmse_old:
    import ast
    import builtins  # explicit builtin max, in case `max` is shadowed in the notebook
    import re

    models_df = reg.show_models()
    if models_df.empty or model_name not in models_df['name'].to_list():
        version = 'V_1'
    else:
        raw_versions = models_df.loc[models_df['name'] == model_name, 'versions'].values[0]
        # The original code parsed this cell as a string-encoded list; also
        # accept an already-materialised sequence.
        existing = ast.literal_eval(raw_versions) if isinstance(raw_versions, str) else list(raw_versions)
        # Only names of the form V_<n> take part in numbering, so other
        # version names cannot break the int() conversion.
        matches = (re.fullmatch(r'V_(\d+)', str(v), flags=re.IGNORECASE) for v in existing)
        max_v = builtins.max((int(m.group(1)) for m in matches if m), default=0)
        version = f'V_{max_v+1}'
    mv = reg.log_model(
        model_new,
        model_name=model_name,
        version_name=version,
        conda_dependencies=['snowflake-ml-python'],
        comment='Incremental retrain (shifted window)',
        metrics={'rmse': metrics_new['rmse'], 'r2': metrics_new['r2']},
        options={'relax_version': False}
    )
    # Make the freshly logged version the model's default.
    reg.get_model(model_name).default = version
    print({'registered': version})
else:
    print('No registration: new model did not improve RMSE')
