# Training Notebook (Model 1)

Trains an XGBoost model that predicts `final_total` from `wk1_total` and saves the resulting artifacts to `artifacts/final_total`.

In [None]:
# 1. Train Model 1 (final_total from wk1_total)
import os
import math
import sys
import shutil
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Put the directory one level above the current working directory on the
# import path (presumably so the project-local `boxai` package imported later
# in this notebook can be found -- confirm against the repository layout).
PARENT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
if PARENT_DIR not in sys.path:
    sys.path.insert(0, PARENT_DIR)

# Seed shared by both train_test_split calls and the XGBoost booster.
RANDOM_STATE = 42
# Default floor applied to |y_true| by `mape_eps`.
EPS = 1.0
# Location of the DuckDB database file. The default is the original,
# machine-specific path; set the BOXAI_DB_PATH environment variable to run
# the notebook on another machine without editing this cell.
DB_PATH = os.environ.get(
    'BOXAI_DB_PATH',
    "C:\\Users\\Nebula PC\\Swinburne\\Intern\\boxai\\data\\numero.duckdb",
)
# Query returning the single feature column and the regression target.
TABLE_SQL = "SELECT wk1_total, final_total FROM features_afterN1_total"

# ---- Metric ----
def wape(y_true, y_pred):
    """Return the weighted absolute percentage error.

    Computed as ``sum(|y_true - y_pred|) / sum(|y_true|)``.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable against
            ``y_true``.

    Returns:
        The WAPE as a Python float, or NaN when ``sum(|y_true|)`` is zero
        (including empty input) or is itself NaN, since the ratio is
        undefined in those cases.
    """
    # Accept lists/tuples as well as arrays.
    y_true = np.asarray(y_true, dtype='float64')
    y_pred = np.asarray(y_pred, dtype='float64')
    denom = np.sum(np.abs(y_true))
    # No epsilon is added to the denominator: adding one would make this
    # guard always pass and turn an undefined ratio into a huge finite number.
    if not denom > 0:
        return float('nan')
    return float(np.sum(np.abs(y_true - y_pred)) / denom)

def smape(y_true, y_pred):
    """Return the symmetric MAPE: mean of ``2 * |y_pred - y_true| / (|y_true| + |y_pred|)``.

    Each per-element denominator is floored at 1e-9, so a pair where both
    values are zero contributes 0 instead of dividing by zero. The result is
    a Python float.
    """
    abs_err = np.abs(y_pred - y_true)
    scale = np.maximum(np.abs(y_true) + np.abs(y_pred), 1e-9)
    per_item = 2.0 * abs_err / scale
    return float(np.mean(per_item))

def mape_eps(y_true, y_pred, eps=EPS):
    """Return the mean absolute percentage error with a floored denominator.

    Each error is divided by ``max(|y_true|, eps)``, so targets whose
    magnitude is below ``eps`` do not blow up the ratio. The result is a
    Python float.
    """
    floored_truth = np.maximum(np.abs(y_true), eps)
    relative_err = np.abs((y_true - y_pred) / floored_truth)
    return float(np.mean(relative_err))

def mae(y_true, y_pred):
    """Return the mean absolute error between ``y_true`` and ``y_pred`` as a Python float."""
    abs_err = np.abs(y_true - y_pred)
    return float(np.mean(abs_err))

def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred`` as a Python float."""
    residual = y_true - y_pred
    mean_sq = np.mean(residual ** 2)
    return float(np.sqrt(mean_sq))

def r2(y_true, y_pred):
    """Return scikit-learn's ``r2_score`` for the given targets and predictions as a Python float."""
    score = r2_score(y_true, y_pred)
    return float(score)

# ---- Data Load  ----
# Run TABLE_SQL against the DuckDB file at DB_PATH and materialise the result
# as a pandas DataFrame. The connection is opened read-only and is closed when
# the `with` block exits.
import duckdb as ddb
with ddb.connect(DB_PATH, read_only=True) as con:
    df = con.execute(TABLE_SQL).df()
# Stop immediately on an empty result: there would be nothing to split or train on.
if df.empty:
    raise ValueError('Query returned 0 rows.')
# Quick sanity output: row count and the first three rows.
print(f'Loaded {len(df)} rows from DuckDB.')
print('Data sample:\n', df.head(3))

# ---- Split & Transform ----
# Hold out 20% of the rows for the final test evaluation.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)

# The single feature column is log1p-transformed for both partitions.
Xtr, Xte = (
    np.log1p(part[['wk1_total']].values.astype('float64'))
    for part in (train_df, test_df)
)
# Training labels are log1p-transformed as well; test labels stay on the
# original scale because predictions are mapped back with expm1 before scoring.
Ytr = np.log1p(train_df['final_total'].values.astype('float64'))
Yte = test_df['final_total'].values.astype('float64')

# Carve a further 20% out of the training rows; this validation part is the
# evaluation set that drives early stopping during training.
Xtr_sub, Xval, Ytr_sub, Yval = train_test_split(
    Xtr, Ytr, test_size=0.2, random_state=RANDOM_STATE
)

# Wrap the arrays for XGBoost; the test matrix carries no labels.
dtest = xgb.DMatrix(Xte)
dval = xgb.DMatrix(Xval, label=Yval)
dtrain = xgb.DMatrix(Xtr_sub, label=Ytr_sub)

# Booster configuration. The labels passed to training are log1p-transformed,
# so the squared-error objective and the RMSE evaluation metric both operate
# in log space.
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'max_depth': 3,
    'eta': 0.05,
    'subsample': 0.9,
    'colsample_bytree': 1.0,
    'lambda': 1.0,
    'tree_method': 'hist',
    'seed': RANDOM_STATE,  # same seed as the data splits, for reproducibility
}

# Train for at most 800 boosting rounds, monitoring the validation split;
# early_stopping_rounds=50 ends training once the validation metric has not
# improved for 50 consecutive rounds. Per-round logging is switched off.
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=800,
    evals=[(dval, 'validation')],
    early_stopping_rounds=50,
    verbose_eval=False
)

# Predict the test set in log space, restricted to the trees up to the best
# early-stopping round when the Booster exposes that information. Which keyword
# selects that prefix depends on the attributes this Booster object provides:
# `best_ntree_limit` pairs with `ntree_limit`, `best_iteration` with `iteration_range`.
if getattr(bst, 'best_ntree_limit', None):
    pred_te = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
elif getattr(bst, 'best_iteration', None) is not None:
    pred_te = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))
else:
    # No early-stopping information available: use every tree.
    pred_te = bst.predict(dtest)
# Undo the log1p transform applied to the training labels.
pred_te = np.expm1(pred_te)

# ---- Metrics ----
# Score the back-transformed predictions against the untransformed test labels.
# Every scorer takes (y_true, y_pred); MAPE is pinned to a denominator floor of 1.0.
metrics = {
    label: scorer(Yte, pred_te)
    for label, scorer in [
        ('MAPE_eps1', lambda yt, yp: mape_eps(yt, yp, eps=1.0)),
        ('R2', r2),
        ('WAPE', wape),
        ('SMAPE', smape),
        ('MAE', mae),
        ('RMSE', rmse),
    ]
}

# Report each metric to four decimal places, in the order defined above.
print('Model 1 (XGB, test) metrics:')
for k_, v_ in metrics.items():
    print(f'- {k_}: {v_:.4f}')

# ---- Save Artifacts ----
# Hand the trained booster, its early-stopping markers and the test metrics to
# the project's FinalTotalPredictor for persistence.
from boxai.models.final_total_predictor import FinalTotalPredictor

# Output directory, relative to the current working directory; created if missing.
ART_DIR = '../artifacts/final_total'
os.makedirs(ART_DIR, exist_ok=True)

# Either attribute may be absent on the Booster, so fall back to None.
best_iteration = getattr(bst, 'best_iteration', None)
best_ntree_limit = getattr(bst, 'best_ntree_limit', None)

FinalTotalPredictor.save_from_training(
    booster=bst,
    artifacts_dir=ART_DIR,
    best_ntree_limit=best_ntree_limit,
    best_iteration=best_iteration,
    metrics=metrics,
    overwrite=True
)

# Confirm that the booster file (presumably written by save_from_training --
# verify against that class) exists, and report its size.
model_path = os.path.join(ART_DIR, 'model.booster.json')
booster_exists = os.path.isfile(model_path)
booster_size = os.path.getsize(model_path) if booster_exists else 0
print(f'Artifacts saved to {ART_DIR}. Booster file exists: {booster_exists} size={booster_size} bytes')


Loaded 1469 rows from DuckDB.
Data sample:
     wk1_total  final_total
0   2962981.0    3204876.0
1  11268323.0   19806027.0
2  22051768.0   40301616.0
Model 1 (XGB, test) metrics:
- MAPE_eps1: 0.3229
- R2: 0.8204
- WAPE: 0.3191
- SMAPE: 0.3138
- MAE: 47109513.7283
- RMSE: 199400512.8425
Artifacts saved to ../artifacts/final_total. Booster file exists: True size=127481 bytes


In [None]:
# (COMMENTED OUT) MODEL 2 TEMPLATE
"""
# =====================
# (COMMENTED OUT) MODEL 2 TEMPLATE: wk2_weekly from wk1_total
# Uncomment and adapt when ready to train second model.
"""
# The template below is kept inert inside a string literal. To activate it,
# delete the two triple-quote delimiter lines around it. It relies on names
# defined in the Model 1 cell: DB_PATH, RANDOM_STATE and the metric helpers
# (mape_eps, r2, wape, smape, mae, rmse).
"""
import xgboost as xgb
from sklearn.model_selection import train_test_split as tts
import numpy as np
import duckdb as ddb

with ddb.connect(DB_PATH, read_only=True) as con:
    df2 = con.execute('SELECT wk1_total, wk2_weekly FROM features_afterN1_week2').df()

train_df2, test_df2 = tts(df2, test_size=0.2, random_state=RANDOM_STATE)

Xtr2 = np.log1p(train_df2[['wk1_total']].values.astype('float64'))
Ytr2 = np.log1p(train_df2['wk2_weekly'].values.astype('float64'))
Xte2 = np.log1p(test_df2[['wk1_total']].values.astype('float64'))
Yte2 = test_df2['wk2_weekly'].values.astype('float64')

Xtr2_sub, Xval2, Ytr2_sub, Yval2 = tts(Xtr2, Ytr2, test_size=0.2, random_state=RANDOM_STATE)

dtrain2 = xgb.DMatrix(Xtr2_sub, label=Ytr2_sub)
dval2 = xgb.DMatrix(Xval2, label=Yval2)
dtest2 = xgb.DMatrix(Xte2)

params2 = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'max_depth': 3,
    'eta': 0.05,
    'subsample': 0.9,
    'colsample_bytree': 1.0,
    'lambda': 1.0,
    'tree_method': 'hist',
    'seed': RANDOM_STATE,
}

bst2 = xgb.train(
    params=params2,
    dtrain=dtrain2,
    num_boost_round=800,
    evals=[(dval2, 'validation')],
    early_stopping_rounds=50,
    verbose_eval=False
)

if hasattr(bst2, 'best_ntree_limit') and bst2.best_ntree_limit:
    pred_te2 = np.expm1(bst2.predict(dtest2, ntree_limit=bst2.best_ntree_limit))
elif hasattr(bst2, 'best_iteration') and bst2.best_iteration is not None:
    pred_te2 = np.expm1(bst2.predict(dtest2, iteration_range=(0, bst2.best_iteration + 1)))
else:
    pred_te2 = np.expm1(bst2.predict(dtest2))

metrics2 = {
    'MAPE_eps1': mape_eps(Yte2, pred_te2, eps=1.0),
    'R2': r2(Yte2, pred_te2),
    'WAPE': wape(Yte2, pred_te2),
    'SMAPE': smape(Yte2, pred_te2),
    'MAE': mae(Yte2, pred_te2),
    'RMSE': rmse(Yte2, pred_te2)
}

print('Model 2 (XGB, test) metrics:')
for k_, v_ in metrics2.items():
    print(f"- {k_}: {v_:.4f}")
"""
# =====================
