# Notebook 6 — Forecasting and Deployment

Purpose:
- Train final model(s) on full training data or retrain using best params
- Implement multi-step forecasting: recursive (iterative) and direct (example for H months)
- Provide an inference helper that, given last observed months for a region, produces an H-month forecast
- Save final forecasts and model artifacts to ../4_data_analysis/model_datasets/

Notes:
- This notebook expects model artifacts (final model) or best_params.json created by previous notebooks. It also reads the FE dataset for context.
- Multi-step forecasting uses generated lag/roll features; the helper carefully updates lag/rolling features as it forecasts forward.

In [None]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from math import sqrt

# Shared artifacts directory plus the three files this notebook reads or writes inside it.
out_dir = os.path.join('..', '4_data_analysis', 'model_datasets')
fe_path, model_path, best_params_path = (
    os.path.join(out_dir, artifact_name)
    for artifact_name in (
        'model_ready_dataset_fe.csv',
        'final_rf_model_joblib.pkl',
        'best_params.json',
    )
)
os.makedirs(out_dir, exist_ok=True)

# The feature-engineered dataset is mandatory: stop early with a pointer to the notebook that builds it.
if os.path.exists(fe_path):
    df = pd.read_csv(fe_path)
    print('Loaded FE dataset shape:', df.shape)
else:
    raise FileNotFoundError('FE dataset missing; run Notebook 3')


Load or train final model
- If final_rf_model_joblib.pkl exists, load it; otherwise retrain. The code below retrains on all available data except the last 24 months per region (kept as a holdout); alternatively, train on the entire series, depending on your preference for the forecasting horizon.

In [None]:
# Prefer the persisted model; only when it is absent fit a new forest (best_params or defaults) and persist it.
if not os.path.exists(model_path):
    print('Saved model not found. Training a new RF on full data (excluding last 24 months per region)')

    def train_test_time_split(df, group_col='REGION', time_col='Time', test_periods=24):
        """Hold out the final ``test_periods`` rows (ordered by ``time_col``) of every group.

        Returns a ``(train, test)`` pair of frames, each re-indexed from 0.
        """
        splits = []
        for _, grp in df.groupby(group_col):
            ordered = grp.sort_values(time_col).reset_index(drop=True)
            splits.append((
                ordered.iloc[:-test_periods].copy(),
                ordered.iloc[-test_periods:].copy(),
            ))
        head_parts = [head for head, _ in splits]
        tail_parts = [tail for _, tail in splits]
        return (
            pd.concat(head_parts).reset_index(drop=True),
            pd.concat(tail_parts).reset_index(drop=True),
        )

    train_df, test_df = train_test_time_split(df, test_periods=24)

    # Predictors: calendar/trend columns followed by every lag/rolling column, limited to
    # the columns actually present in the training frame.
    features = ['YEAR', 'Time', 'Month_sin', 'Month_cos']
    features += [
        c for c in df.columns
        if c not in ('REGION', 'YEAR', 'Month', 'Month_Num', 'Rainfall', 'Temperature', 'Time')
        and ('lag' in c or '_roll' in c)
    ]
    features = [c for c in features if c in train_df.columns]
    X_tr = train_df[features]
    y_tr = train_df['Rainfall']

    # Start from the defaults and override them with tuned params when a best_params file exists.
    rf_params = {'n_estimators':200, 'max_depth':10}
    if os.path.exists(best_params_path):
        import json
        with open(best_params_path) as f:
            best = json.load(f)
        rf_params = best.get('rf_raw') or best.get('rf_log1p') or rf_params
    # Forward only the hyper-parameters this notebook knows how to pass on.
    rf_params = {
        key: rf_params[key]
        for key in rf_params
        if key in ('n_estimators', 'max_depth', 'max_features')
    }

    model = RandomForestRegressor(n_jobs=-1, random_state=42, **rf_params)
    model.fit(X_tr, y_tr)
    joblib.dump(model, model_path)
    print('Trained and saved model to', model_path)
else:
    print('Loading saved model from', model_path)
    model = joblib.load(model_path)


Multi-step forecasting helper (recursive):
- Input: last observed rows for a single region (must include features and last observed Rainfall/Temperature)
- The helper iteratively constructs new rows for each forecast step, updating lag and rolling columns as it goes.
- Example uses H=6 months; adjust as needed.

In [None]:
def _trailing_mean(history, window):
    """Return the mean of the last ``window`` values of ``history``.

    Yields NaN when fewer than ``window`` values are available or when the
    window itself contains a NaN (``np.mean`` propagates NaN), i.e. a full
    window of valid values is required.
    """
    if window <= 0 or len(history) < window:
        return np.nan
    return float(np.mean(history[-window:]))


def recursive_forecast_region(model, region_df, H=6, features=None, LAGS=None, roll_windows=None):
    """Recursively forecast ``H`` future Rainfall steps for a single region.

    Each step builds one new feature row from the history (observed values
    followed by the forecasts made so far), predicts Rainfall with ``model``,
    clips the prediction at 0 and feeds it back into the history.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    region_df : pandas.DataFrame
        Rows of one region sorted by ``Time`` ascending, latest observation
        last. Must contain ``Time``, ``YEAR``, ``Month_Num`` and ``Rainfall``;
        ``Temperature`` is optional. Not modified.
    H : int, default 6
        Number of steps to forecast. ``H <= 0`` yields an empty frame.
    features : list of str, optional
        Columns passed to ``model.predict`` (in this order). Names missing
        from the constructed row are dropped. When omitted, every column
        except REGION, YEAR, Month, Month_Num, Rainfall, Temperature and Time
        is used.
    LAGS : iterable of int, optional
        Lags written to ``Rainfall_lag_<k>`` / ``Temperature_lag_<k>``.
        Defaults to ``[1, 2, 3, 12]``.
    roll_windows : dict of str -> int, optional
        Maps a column suffix to a window length; written to
        ``Rainfall_<suffix>`` / ``Temperature_<suffix>``. Defaults to
        ``{'roll3': 3, 'roll12': 12}``.

    Returns
    -------
    pandas.DataFrame
        One row per forecast step, holding the constructed features and the
        predicted ``Rainfall``.

    Raises
    ------
    ValueError
        If ``region_df`` has no rows.
    """
    # None sentinels instead of mutable default arguments.
    if LAGS is None:
        LAGS = [1, 2, 3, 12]
    if roll_windows is None:
        roll_windows = {'roll3': 3, 'roll12': 12}
    if len(region_df) == 0:
        raise ValueError('region_df must contain at least one observed row')

    cur = region_df.reset_index(drop=True)
    has_temp = 'Temperature' in cur.columns

    # Plain lists hold the growing history. DataFrame.append was removed in
    # pandas 2.0, and only these two series are needed to rebuild the features.
    rain_hist = cur['Rainfall'].tolist()
    temp_hist = cur['Temperature'].tolist() if has_temp else []

    base_row = cur.iloc[-1].to_dict()
    last_time = cur['Time'].iloc[-1]
    # The calendar is always advanced from the last *observed* row. Offsetting
    # the previously forecast row by h counts the elapsed months twice.
    base_month = int(base_row['Month_Num'])
    base_year = int(base_row['YEAR'])

    non_feature_cols = ('REGION', 'YEAR', 'Month', 'Month_Num', 'Rainfall', 'Temperature', 'Time')

    res_rows = []
    prev_row = base_row
    for h in range(1, H + 1):
        # Start from the previous row so static columns (e.g. REGION) carry over.
        new_row = dict(prev_row)
        new_row['Time'] = last_time + h

        months_elapsed = base_month - 1 + h
        new_row['Month_Num'] = months_elapsed % 12 + 1
        new_row['YEAR'] = base_year + months_elapsed // 12
        # Cyclical month encoding.
        new_row['Month_sin'] = np.sin(2 * np.pi * (new_row['Month_Num'] / 12))
        new_row['Month_cos'] = np.cos(2 * np.pi * (new_row['Month_Num'] / 12))

        # Lag k of the new row is the value k steps back in the history;
        # NaN when the history is too short (or the lag is not positive).
        n_hist = len(rain_hist)
        for lag in LAGS:
            idx = n_hist - lag
            new_row[f'Rainfall_lag_{lag}'] = rain_hist[idx] if 0 <= idx < n_hist else np.nan
        if has_temp:
            for lag in LAGS:
                idx = n_hist - lag
                new_row[f'Temperature_lag_{lag}'] = temp_hist[idx] if 0 <= idx < n_hist else np.nan

        # Rolling features: mean of the `window` values immediately preceding
        # the new row. NOTE(review): assumes the training features were built
        # as shift(1).rolling(window).mean() per region — confirm against the
        # feature-engineering notebook.
        for label, window in roll_windows.items():
            new_row[f'Rainfall_{label}'] = _trailing_mean(rain_hist, window)
            if has_temp:
                new_row[f'Temperature_{label}'] = _trailing_mean(temp_hist, window)

        # Single-row frame restricted to the model's feature columns.
        pred_df = pd.DataFrame([new_row])
        if features is None:
            feat_cols = [c for c in pred_df.columns if c not in non_feature_cols]
        else:
            feat_cols = [c for c in features if c in pred_df.columns]
        yhat = max(0.0, float(model.predict(pred_df[feat_cols])[0]))  # rainfall cannot be negative
        new_row['Rainfall'] = yhat

        # No temperature model is available: the copied value from the
        # previous row acts as a naive carry-forward of the last temperature.
        rain_hist.append(yhat)
        if has_temp:
            temp_hist.append(new_row['Temperature'])

        res_rows.append(new_row)
        prev_row = new_row
    return pd.DataFrame(res_rows)


Example: forecast next 6 months for one region and save results.

In [None]:
# Demo on the first region that appears in the dataset.
region = df['REGION'].unique()[0]
print('Forecast example for region:', region)
region_mask = df['REGION'] == region
reg_df = df[region_mask].sort_values('Time')

# Model inputs: calendar/trend columns first, then every lag/rolling column,
# keeping only names that exist in the dataset.
feat_cols = ['YEAR', 'Time', 'Month_sin', 'Month_cos']
feat_cols += [
    c for c in df.columns
    if c not in ('REGION', 'YEAR', 'Month', 'Month_Num', 'Rainfall', 'Temperature', 'Time')
    and ('lag' in c or '_roll' in c)
]
feat_cols = [c for c in feat_cols if c in df.columns]

fc = recursive_forecast_region(model, reg_df, H=6, features=feat_cols)
print('Forecast output:')
display(fc)

# Persist the single-region forecast next to the other model artifacts.
out_fc_path = os.path.join(out_dir, f'forecast_{region}_H6.csv')
fc.to_csv(out_fc_path, index=False)
print('Saved forecast to', out_fc_path)


Batch forecasts for all regions (H=6) and save a combined CSV.
This may take some time depending on number of regions; adjust H as needed.

In [None]:
# Forecast every region independently; a failing region is reported and skipped
# so one bad series does not abort the whole batch.
all_fcs = []
for region in df['REGION'].unique():
    region_mask = df['REGION'] == region
    reg_df = df[region_mask].sort_values('Time')
    try:
        fc = recursive_forecast_region(model, reg_df, H=6, features=feat_cols)
        fc['REGION'] = region
        all_fcs.append(fc)
    except Exception as e:
        print('Failed to forecast region', region, 'error:', e)

# Write the combined file only when at least one region succeeded.
if all_fcs:
    all_fc_df = pd.concat(all_fcs, ignore_index=True)
    all_fc_path = os.path.join(out_dir, 'forecasts_all_regions_H6.csv')
    all_fc_df.to_csv(all_fc_path, index=False)
    print('Saved combined forecasts to', all_fc_path)


Wrap-up and next steps
- The recursive helper demonstrates how to produce multi-step forecasts while updating lags and rolling features.
- For production: consider retraining final model on the entire available history (no holdout) before deploying forecasts; or use ensemble of models.
- To produce prediction intervals, consider bootstrapping residuals or using probabilistic learners (quantile regression, XGBoost with a quantile objective, or a Tweedie objective for XGBoost).
- Save model(s) and forecast outputs with metadata (training period, model params) so stakeholders can reproduce results.