In [1]:
import numpy as np
import pandas as pd

In [2]:
merged_df = pd.read_csv("Data/final_modeling_dataset.csv")
merged_df.head()

Unnamed: 0,periodname,week_start,week_end,Combined_positive,temp_c,rh_pct,rain_mm,wind10_kmh,soil_moisture_top_m3m3
0,Week 1 2018-12-31 - 2019-01-06,2018-12-31,2019-01-06,66.0,30.115476,38.166667,0.0,20.397024,0.027899
1,Week 2 2019-01-07 - 2019-01-13,2019-01-07,2019-01-13,110.0,30.48869,24.785714,0.0,20.902381,0.025518
2,Week 3 2019-01-14 - 2019-01-20,2019-01-14,2019-01-20,73.0,30.727381,31.047619,0.0,15.034524,0.021024
3,Week 4 2019-01-21 - 2019-01-27,2019-01-21,2019-01-27,70.0,31.342857,28.553571,0.0,19.12381,0.01925
4,Week 5 2019-01-28 - 2019-02-03,2019-01-28,2019-02-03,48.0,31.215476,28.142857,0.0,21.282738,0.021952


## Modelling

## 1) Setup

In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import Ridge

from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")

## 2) Parameters (no hard-coding of lags)

In [4]:
DATE_COL   = "week_start"
TARGET_COL = "Combined_positive"

EXOG_COLS = ["temp_c", "rh_pct", "rain_mm", "wind10_kmh", "soil_moisture_top_m3m3"]

# from your EDA:
best_lags = {
    "temp_c": [4],
    "rh_pct": [4],
    "rain_mm": [5],
    "wind10_kmh": [4],
    "soil_moisture_top_m3m3": [5]
}

# optional: include autoregressive lags of the target for stronger performance
# adjust or set [] to disable
ar_lags = [1, 2, 3, 4]

# test window length (holdout for final evaluation)
TEST_WEEKS = None  # set None to use 20% of data; or set an integer like 26

### 3) Utilities

#### 3.1 Make lagged features (exog + optional AR lags) without hard-coding

In [5]:
def make_lagged_features(df, date_col, target_col, exog_cols, lag_map, ar_lags=None, dropna=True):
    """
    Create lagged features dynamically from a lag_map, e.g. {"temp_c":[4], "rain_mm":[5], ...}
    Optionally add AR lags of the target: ar_lags=[1,2,3,...].
    """
    out = df.copy()
    out[date_col] = pd.to_datetime(out[date_col])
    out = out.sort_values(date_col).reset_index(drop=True)

    feature_cols = []

    # exogenous lags
    for col in exog_cols:
        for L in lag_map.get(col, []):
            fname = f"{col}_lag{L}"
            out[fname] = out[col].shift(L)
            feature_cols.append(fname)

    # autoregressive lags of the target (optional)
    if ar_lags:
        for L in ar_lags:
            fname = f"{target_col}_lag{L}"
            out[fname] = out[target_col].shift(L)
            feature_cols.append(fname)

    if dropna:
        out = out.dropna(subset=feature_cols + [target_col]).reset_index(drop=True)

    return out, feature_cols

#### 3.2 Metrics

In [6]:
def evaluate(y_true, y_pred, prefix=""):
    mae  = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mape = np.mean(np.abs((y_true - y_pred) / np.maximum(y_true, 1e-9))) * 100.0
    r2   = r2_score(y_true, y_pred)
    return {
        f"{prefix}MAE": mae,
        f"{prefix}RMSE": rmse,
        f"{prefix}MAPE%": mape,
        f"{prefix}R2": r2
    }

def print_metrics(d):
    for k,v in d.items():
        print(f"{k:>10}: {v:,.4f}")

#### 3.3 Time-series split helper (train/test)

In [7]:
def train_test_split_time(df, date_col, test_weeks=None, test_ratio=0.2):
    """
    Splits in time order: [TRAIN | TEST]
    - If test_weeks is None, uses test_ratio of data; else uses that many rows for TEST.
    """
    n = len(df)
    if test_weeks is None:
        n_test = int(np.ceil(n * test_ratio))
    else:
        n_test = int(test_weeks)

    n_train = n - n_test

    train = df.iloc[:n_train].copy()
    test = df.iloc[n_train:].copy()

    return train, test

### 4) Prepare dataset

In [8]:
df0 = merged_df.copy()  # your cleaned dataframe

df_lagged, FEATURE_COLS = make_lagged_features(
    df0, DATE_COL, TARGET_COL, EXOG_COLS, best_lags, ar_lags=ar_lags, dropna=True
)

# Now only train and test
train_df, test_df = train_test_split_time(df_lagged, DATE_COL, test_weeks=TEST_WEEKS)

X_train, y_train = train_df[FEATURE_COLS], train_df[TARGET_COL]
X_test, y_test   = test_df[FEATURE_COLS], test_df[TARGET_COL]

### 5) Baselines

#### 5.1 Naïve persistence (ŷ_t = y_{t-1})

In [10]:
# naive baseline using the last observed target
def naive_persistence(y_series):
    return y_series.shift(1)

# evaluate only on TEST using available histories
test_pred_naive = naive_persistence(
    pd.concat([train_df[TARGET_COL], test_df[TARGET_COL]])
).iloc[len(train_df):]

print("Baseline: Naïve persistence (TEST)")
print_metrics(evaluate(y_test.values, test_pred_naive.values, prefix="TEST_"))

Baseline: Naïve persistence (TEST)
  TEST_MAE: 842.1014
 TEST_RMSE: 1,151.1400
TEST_MAPE%: 39.5785
   TEST_R2: 0.6440


#### 5.2 Simple linear baseline (Ridge) with your lagged features

In [11]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0, fit_intercept=True)
ridge.fit(X_train, y_train)

# Predict only on TEST
test_pred_ridge = ridge.predict(X_test)

print("\nBaseline: Ridge with lagged exogenous + AR (TEST)")
print_metrics(evaluate(y_test.values, test_pred_ridge, prefix="TEST_"))


Baseline: Ridge with lagged exogenous + AR (TEST)
  TEST_MAE: 769.1845
 TEST_RMSE: 1,111.2751
TEST_MAPE%: 34.0553
   TEST_R2: 0.6682


### 6) XGBoost — default model

In [12]:
from xgboost import XGBRegressor

xgb_default = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=400,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    n_jobs=-1,
    reg_lambda=1.0
)

# Train only on TRAIN
xgb_default.fit(X_train, y_train)

# Predict only on TEST
test_pred_xgb_def = xgb_default.predict(X_test)

print("\nXGBoost (default) (TEST)")
print_metrics(evaluate(y_test.values, test_pred_xgb_def, prefix="TEST_"))


XGBoost (default) (TEST)
  TEST_MAE: 823.8133
 TEST_RMSE: 1,203.7502
TEST_MAPE%: 37.1308
   TEST_R2: 0.6107


### 7) XGBoost — hyperparameter tuning with time-series CV

In [13]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
import numpy as np

# ----- Time-series CV -----
tscv = TimeSeriesSplit(n_splits=5)

# ----- Search space -----
param_dist = {
    "n_estimators":    np.arange(200, 1201, 100),
    "learning_rate":   np.linspace(0.01, 0.2, 20),
    "max_depth":       np.arange(2, 9),
    "min_child_weight":np.arange(1, 8),
    "subsample":       np.linspace(0.6, 1.0, 9),
    "colsample_bytree":np.linspace(0.6, 1.0, 9),
    "gamma":           np.linspace(0.0, 1.0, 11),
    "reg_alpha":       np.linspace(0.0, 1.0, 11),
    "reg_lambda":      np.linspace(0.5, 2.0, 16),
}

# ----- Base estimator -----
base = XGBRegressor(
    objective="reg:squarederror",
    n_jobs=-1,
    random_state=42,
)

# ----- Randomized search with time-series CV -----
search = RandomizedSearchCV(
    estimator=base,
    param_distributions=param_dist,
    n_iter=60,
    scoring="neg_root_mean_squared_error",
    cv=tscv,
    refit=True,         # retrain best model on full TRAIN
    verbose=0,
    random_state=42,
)

# ----- Run search -----
search.fit(X_train, y_train)

best_xgb = search.best_estimator_
print("Best CV params:", search.best_params_)
print("Best CV score (RMSE):", -search.best_score_)

# # ----- Final evaluation on TEST -----
# y_pred_test = best_xgb.predict(X_test)

# # If you have evaluate/print_metrics:
# # print_metrics(evaluate(y_test.values, y_pred_test, prefix="TEST_"))

# # Otherwise quick RMSE:
# rmse = np.sqrt(np.mean((y_test.values - y_pred_test) ** 2))
# print("TEST RMSE:", rmse)

Best CV params: {'subsample': 0.6, 'reg_lambda': 0.7, 'reg_alpha': 0.6000000000000001, 'n_estimators': 700, 'min_child_weight': 1, 'max_depth': 2, 'learning_rate': 0.02, 'gamma': 0.5, 'colsample_bytree': 0.7}
Best CV score (RMSE): 1242.3394457643612


*Validate tuned model and compare to baselines*

In [14]:
# best_xgb is the tuned model from RandomizedSearchCV.fit(X_train, y_train)

# Predict on TEST only
test_pred_xgb_tuned = best_xgb.predict(X_test)

print("\nXGBoost (tuned) (TEST)")
print_metrics(evaluate(y_test.values, test_pred_xgb_tuned, prefix="TEST_"))


XGBoost (tuned) (TEST)
  TEST_MAE: 838.2762
 TEST_RMSE: 1,196.8454
TEST_MAPE%: 38.2989
   TEST_R2: 0.6152


### 8) Refit on TRAIN+VALID, then forecast next 5 weeks

In [15]:
# Refit on all TRAIN data (no VALID anymore)
train_full = train_df  

X_full, y_full = train_full[FEATURE_COLS], train_full[TARGET_COL]
best_xgb.fit(X_full, y_full)


#### 8.2 Iterative forecaster

In [16]:
def iterative_forecast_n_weeks(model, df_original, date_col, target_col, exog_cols, lag_map, ar_lags, n_weeks=5, future_exog_df=None):
    """
    Iteratively predict n_weeks ahead using lagged exogenous + AR lags of target.
    - If some lags require future exog, supply future_exog_df with columns exog_cols and the next n_weeks dates.
    - Otherwise we'll carry-forward the last observed exog for any missing future needs.
    Returns a dataframe with future dates, predictions, and outbreak flags.
    """
    df = df_original.copy()
    df[date_col] = pd.to_datetime(df[date_col])
    df = df.sort_values(date_col).reset_index(drop=True)

    # we'll grow this frame step by step
    work = df[[date_col, target_col] + exog_cols].copy()

    # prepare future dates
    last_date = work[date_col].iloc[-1]
    future_dates = [last_date + pd.Timedelta(days=7*(i+1)) for i in range(n_weeks)]

    # optional exog for future (0-lag); we'll only use them if a lag would reference the future
    if future_exog_df is not None:
        fut = future_exog_df.copy()
        fut[date_col] = pd.to_datetime(fut[date_col])
        fut = fut.set_index(date_col).reindex(future_dates).ffill().bfill().reset_index()
        fut.rename(columns={"index": date_col}, inplace=True)
    else:
        # make a carry-forward frame using last known exog values
        last_vals = work[exog_cols].iloc[-1:].reset_index(drop=True)
        fut = pd.concat([last_vals] * n_weeks, ignore_index=True)
        fut.insert(0, date_col, future_dates)

    # container for predictions
    preds = []

    # make a list of all feature names once
    def feature_names():
        names = []
        for c in exog_cols:
            for L in lag_map.get(c, []):
                names.append(f"{c}_lag{L}")
        if ar_lags:
            for L in ar_lags:
                names.append(f"{target_col}_lag{L}")
        return names

    FCOLS = feature_names()

    # roll forward one week at a time
    cur = work.copy()
    for i in range(n_weeks):
        # append a new row with date and (if needed) 0-lag exog provided
        new_date = future_dates[i]
        new_row = {date_col: new_date}

        # we do NOT use 0-lag exog directly as features (only lags), but we keep them in cur so that
        # when we shift, the future lags can reference them if needed at longer horizons.
        for c in exog_cols:
            if new_date in fut[date_col].values:
                new_row[c] = fut.loc[fut[date_col] == new_date, c].values[0]
            else:
                new_row[c] = cur[c].iloc[-1]  # carry-forward fallback

        # target is unknown now; will be filled with prediction later if AR lags are used
        new_row[target_col] = np.nan
        cur = pd.concat([cur, pd.DataFrame([new_row])], ignore_index=True)

        # build lagged features on the fly for the newest row only
        tmp, _ = make_lagged_features(cur, date_col, target_col, exog_cols, lag_map, ar_lags=ar_lags, dropna=False)

        # take the last row (the one to predict) with complete features
        xrow = tmp[FCOLS].iloc[[-1]].copy()

        # if any lagged feature is NaN (happens early), backfill with last known value as a strict fallback
        xrow = xrow.fillna(method="ffill", axis=1).fillna(method="bfill", axis=1)

        # predict next week's target
        yhat = model.predict(xrow)[0]
        preds.append(yhat)

        # insert the prediction so AR lags can use it in subsequent steps
        cur.loc[cur.index[-1], target_col] = yhat

    # compute outbreak flags based on ratio rule
    preds = np.array(preds)
    ratios = np.r_[np.nan, preds[1:] / np.maximum(preds[:-1], 1e-9)]
    outbreak = np.r_[False, ratios[1:] > 1.5]  # first week has no prior, mark False

    out = pd.DataFrame({
        date_col: future_dates,
        "predicted_cases": preds,
        "ratio_to_prev_week": ratios,
        "is_outbreak": outbreak
    })
    return out

#### 8.3 Generate 5-week forecasts & outbreak flags

In [17]:
# Optionally, assemble a future exog frame (if you actually *know* next 5 weeks of exog)
# Otherwise, we rely on carry-forward above.
# future_exog_df = pd.DataFrame({
#     DATE_COL: pd.date_range(df0[DATE_COL].max() + pd.Timedelta(days=7), periods=5, freq="7D"),
#     "temp_c": ..., "rh_pct": ..., "rain_mm": ..., "wind10_kmh": ..., "soil_moisture_top_m3m3": ...
# })

forecast_df = iterative_forecast_n_weeks(
    best_xgb,
    df0[[DATE_COL, TARGET_COL] + EXOG_COLS].copy(),
    DATE_COL, TARGET_COL,
    EXOG_COLS, best_lags, ar_lags,
    n_weeks=5,
    future_exog_df=None  # or your future_exog_df if you have it
)

print("\nFive-week forecast & outbreak flags:")
print(forecast_df)


Five-week forecast & outbreak flags:
  week_start  predicted_cases  ratio_to_prev_week  is_outbreak
0 2025-09-08      3808.371094                 NaN        False
1 2025-09-15      5734.681152            1.505809         True
2 2025-09-22      5140.295410            0.896352        False
3 2025-09-29      4524.448730            0.880192        False
4 2025-10-06      4756.487305            1.051286        False
