# Notebook 6 — Final Forecasting (Per-region Rainfall & Temperature)

This notebook loads per-region models (trained in Notebook 5), produces H-step forecasts per region (recursive), evaluates using R2 as the primary metric, and saves forecasts and diagnostics.

Assumptions / features:
- Models were saved per-region in ../4_data_analysis/model_datasets/models_by_region/ (file names: model_rain_{REGION}.joblib and model_temp_{REGION}.joblib)
- Feature-engineered input is ../4_data_analysis/model_datasets/model_ready_dataset_fe.csv
- Forecasting is recursive: at each step we predict both targets for the next month and append them to the series so next-step lags/rolls use predicted values.
- Primary evaluation metric: R2 (per-region and aggregated). MAE is also reported for context.


In [5]:
import os
import json
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_absolute_error
import matplotlib.pyplot as plt
from math import sqrt

# Resolve every input/output location relative to the shared model_datasets folder.
datasets_root = os.path.join('..', '4_data_analysis', 'model_datasets')
fe_path = os.path.join(datasets_root, 'model_ready_dataset_fe.csv')
models_dir = os.path.join(datasets_root, 'models_by_region')
models_index_path = os.path.join(models_dir, 'models_index.json')
out_dir = os.path.join(datasets_root, 'forecasts_by_region')
os.makedirs(out_dir, exist_ok=True)

# Fail early when the feature-engineered dataset has not been produced yet.
if not os.path.exists(fe_path):
    raise FileNotFoundError('Feature-engineered dataset not found. Run Notebook 3 first.')
df = pd.read_csv(fe_path)

# Without a ready-made 'Time' column, derive a per-region running index
# (0, 1, 2, ...) after ordering the rows by region, year and month number.
has_time_index = 'Time' in df.columns
if not has_time_index:
    sort_keys = ['REGION', 'YEAR', 'Month_Num']
    df = df.sort_values(sort_keys)
    df = df.reset_index(drop=True)
    df['Time'] = df.groupby('REGION').cumcount()
print('Loaded FE dataset shape:', df.shape)


Loaded FE dataset shape: (2040, 21)


In [6]:
# Use the saved models index when it exists; otherwise rebuild an equivalent
# mapping {region: {'rain_model': path, 'temp_model': path}} from the file names
# found in the models directory.
if not os.path.exists(models_index_path):
    models_index = {}
    model_files = os.listdir(models_dir) if os.path.exists(models_dir) else []
    for fn in model_files:
        if not fn.endswith('.joblib'):
            continue
        # The region name is whatever sits between the known prefix and the extension.
        for prefix, key in (('model_rain_', 'rain_model'), ('model_temp_', 'temp_model')):
            if fn.startswith(prefix):
                region = fn[len(prefix):-len('.joblib')]
                models_index.setdefault(region, {})[key] = os.path.join(models_dir, fn)
    print('Built fallback models index for', len(models_index), 'regions')
else:
    with open(models_index_path) as f:
        models_index = json.load(f)
    print('Loaded models index with', len(models_index), 'regions')


Built fallback models index for 0 regions


Helper: recursive forecast function for both targets using region-specific models.
- The function expects the region history (sorted by Time, with last obs at the end) and the loaded models (Rain and Temp).
- It updates lag and rolling features using both observed and forecasted values so predictions can be iterated H steps ahead.
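- The function uses the feature list passed in, which should be the one the per-region models were trained with; if no list is passed, it infers the features from the columns of the row being predicted, excluding identifier, calendar and target columns.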


In [7]:
def _trailing_mean(values, window):
    """Return the mean of the last `window` entries of `values`, or NaN if too few."""
    if window <= 0 or len(values) < window:
        return np.nan
    # np.mean propagates NaN, so an incomplete window yields NaN
    # (same outcome as rolling(..., min_periods=window)).
    return float(np.mean(values[-window:]))


def recursive_forecast_region_both(model_rain, model_temp, region_df, H=6, features=None, LAGS=None, roll_windows=None):
    """Forecast Rainfall and Temperature H steps ahead for one region, recursively.

    Each step builds a feature row for the next month from the history so far
    (observed values followed by earlier forecasts), predicts both targets,
    and appends the predictions to the history so later steps can use them.

    Parameters
    ----------
    model_rain, model_temp : objects exposing ``predict(X)``
        Fitted models for Rainfall and Temperature; both receive the same
        one-row feature frame.
    region_df : pandas.DataFrame
        History of a single region, ordered so the latest observation is the
        last row. The code reads the columns 'Time', 'Month_Num', 'YEAR',
        'Rainfall' and 'Temperature'.
    H : int
        Number of steps to forecast.
    features : list of str or None
        Feature columns handed to the models. Names absent from the generated
        row are silently dropped. When None, every column except
        REGION/YEAR/Month/Month_Num/Rainfall/Temperature/Time is used.
    LAGS : list of int or None
        Lags for the ``Rainfall_lag_{k}`` / ``Temperature_lag_{k}`` columns.
        Defaults to [1, 2, 3, 12].
    roll_windows : dict or None
        Maps a label to a window length for the ``Rainfall_{label}`` /
        ``Temperature_{label}`` columns. Defaults to {'roll3': 3, 'roll12': 12}.

    Returns
    -------
    pandas.DataFrame
        One row per forecast step, holding the generated features and the
        predicted 'Rainfall' (floored at 0) and 'Temperature'.

    Raises
    ------
    ValueError
        If `region_df` has no rows.
    """
    # None sentinels avoid sharing mutable default objects between calls.
    if LAGS is None:
        LAGS = [1, 2, 3, 12]
    if roll_windows is None:
        roll_windows = {'roll3': 3, 'roll12': 12}

    cur = region_df.copy().reset_index(drop=True)
    if cur.empty:
        raise ValueError('region_df is empty; cannot forecast without history')

    # Calendar anchors come from the last OBSERVED row and stay fixed, so step h
    # is always exactly h months after it. (Advancing by h from the previous
    # forecast row would skip months and over-increment the year.)
    last_time = int(cur['Time'].iloc[-1])
    base_month = int(cur['Month_Num'].iloc[-1])
    base_year = int(cur['YEAR'].iloc[-1])

    # Target histories as plain lists: forecasts are appended here instead of
    # re-concatenating a DataFrame on every step.
    rain_hist = cur['Rainfall'].tolist()
    temp_hist = cur['Temperature'].tolist()

    excluded = ['REGION', 'YEAR', 'Month', 'Month_Num', 'Rainfall', 'Temperature', 'Time']
    prev_row = cur.iloc[-1].to_dict()
    res_rows = []
    for h in range(1, H + 1):
        # Columns that are not recomputed below (for example a 'Month' label
        # column, if present) are carried forward unchanged from the previous row.
        new_row = dict(prev_row)
        new_row['Time'] = last_time + h
        months_ahead = base_month - 1 + h
        new_row['Month_Num'] = months_ahead % 12 + 1
        new_row['YEAR'] = base_year + months_ahead // 12
        new_row['Month_sin'] = np.sin(2 * np.pi * (new_row['Month_Num'] / 12))
        new_row['Month_cos'] = np.cos(2 * np.pi * (new_row['Month_Num'] / 12))

        # Lag k of the new row is the value k steps back from the end of the history.
        n_hist = len(rain_hist)
        for lag in LAGS:
            idx = n_hist - lag
            if idx >= 0:
                new_row[f'Rainfall_lag_{lag}'] = rain_hist[idx]
                new_row[f'Temperature_lag_{lag}'] = temp_hist[idx]
            else:
                new_row[f'Rainfall_lag_{lag}'] = np.nan
                new_row[f'Temperature_lag_{lag}'] = np.nan

        # Rolling features: mean of the `window` values immediately preceding
        # the new row, i.e. the newest value is included, consistent with lag 1.
        # NOTE(review): assumes the training features were built as
        # shift(1).rolling(window).mean() - confirm against the feature notebook.
        for label, window in roll_windows.items():
            new_row[f'Rainfall_{label}'] = _trailing_mean(rain_hist, window)
            new_row[f'Temperature_{label}'] = _trailing_mean(temp_hist, window)

        # Build the one-row model input. No alignment to the model's expected
        # columns is performed beyond dropping names missing from the row.
        pred_df = pd.DataFrame([new_row])
        if features is None:
            feat_cols = [c for c in pred_df.columns if c not in excluded]
        else:
            feat_cols = [c for c in features if c in pred_df.columns]
        X_pred = pred_df[feat_cols]

        yhat_rain = model_rain.predict(X_pred)[0]
        yhat_temp = model_temp.predict(X_pred)[0]
        # Rainfall cannot be negative; floor the prediction at zero.
        new_row['Rainfall'] = float(max(0.0, yhat_rain))
        new_row['Temperature'] = float(yhat_temp)

        # Feed the predictions back so the next step's lags/rolls can use them.
        rain_hist.append(new_row['Rainfall'])
        temp_hist.append(new_row['Temperature'])
        res_rows.append(new_row)
        prev_row = new_row
    return pd.DataFrame(res_rows)


Forecasting loop over regions: for each region load the saved models and run recursive forecasting for H months. The notebook also evaluates R2 on the holdout (last 24 months) if models were trained with the same split.

In [8]:
H = 6  # forecast horizon (number of months ahead)
HOLDOUT_MONTHS = 24  # trailing observations per region used for the holdout metrics
all_forecasts = []
eval_rows = []

# The model inputs depend only on the dataset's columns, so they are inferred once
# here. Building the list up front also guarantees it exists for regions with a
# short history, which skip the holdout evaluation but are still forecast.
non_feature_cols = ['REGION', 'YEAR', 'Month', 'Month_Num', 'Rainfall', 'Temperature', 'Time']
feature_candidates = [c for c in df.columns if c not in non_feature_cols]
base = [c for c in ['YEAR', 'Time', 'Month_sin', 'Month_cos'] if c in df.columns]
lag_roll = [c for c in feature_candidates if ('lag' in c) or ('_roll' in c)]
feat_list = base + lag_roll

for region, meta in models_index.items():
    print('\nRegion:', region)
    rain_model_path = meta.get('rain_model')
    temp_model_path = meta.get('temp_model')
    if not rain_model_path or not temp_model_path:
        print('  Missing model for region, skipping')
        continue
    # Best-effort per region: a model file that fails to load skips only that region.
    try:
        model_rain = joblib.load(rain_model_path)
        model_temp = joblib.load(temp_model_path)
    except Exception as e:
        print('  Failed to load models for region', region, 'error:', e)
        continue

    # Region history in time order, latest observation last.
    reg_hist = df[df['REGION'] == region].sort_values('Time').reset_index(drop=True)
    if reg_hist.empty:
        print('  No history rows found in dataset for region, skipping')
        continue
    if not feat_list:
        print('  No features found for evaluation, skipping region')
        continue

    # Holdout evaluation: one-step predictions on the last HOLDOUT_MONTHS rows using
    # their stored (observed) features.
    # NOTE(review): these metrics are out-of-sample only if the saved models were
    # trained without these rows - confirm against the training notebook.
    r2_rain = r2_temp = mae_rain = mae_temp = None
    if len(reg_hist) > HOLDOUT_MONTHS:
        holdout = reg_hist.iloc[-HOLDOUT_MONTHS:]
        X_hold = holdout[feat_list]
        y_hold_rain = holdout['Rainfall'].values
        y_hold_temp = holdout['Temperature'].values
        try:
            yhat_hold_rain = model_rain.predict(X_hold)
            yhat_hold_temp = model_temp.predict(X_hold)
        except Exception as e:
            print('  Model predict failed on holdout for region', region, 'error:', e)
            continue
        # Same non-negativity floor that the recursive forecast applies to rainfall.
        yhat_hold_rain = np.clip(yhat_hold_rain, 0, None)
        r2_rain = float(r2_score(y_hold_rain, yhat_hold_rain))
        r2_temp = float(r2_score(y_hold_temp, yhat_hold_temp))
        mae_rain = float(mean_absolute_error(y_hold_rain, yhat_hold_rain))
        mae_temp = float(mean_absolute_error(y_hold_temp, yhat_hold_temp))

    # Forecast H months recursively using the full history (including last observed).
    try:
        fc = recursive_forecast_region_both(model_rain, model_temp, reg_hist, H=H, features=feat_list)
    except Exception as e:
        print('  Forecasting failed for region', region, 'error:', e)
        continue
    fc['REGION'] = region
    all_forecasts.append(fc)
    eval_rows.append({'REGION': region, 'holdout_r2_rain': r2_rain, 'holdout_r2_temp': r2_temp, 'holdout_mae_rain': mae_rain, 'holdout_mae_temp': mae_temp})

    # Save this region's forecast.
    out_path = os.path.join(out_dir, f'forecast_{region}_H{H}.csv')
    fc.to_csv(out_path, index=False)
    print('  Saved forecast to', out_path)

if all_forecasts:
    combined = pd.concat(all_forecasts, ignore_index=True)
    combined.to_csv(os.path.join(out_dir, f'forecasts_all_regions_H{H}.csv'), index=False)
    # f-string so the message shows the actual horizon rather than a literal "{H}".
    print(f'\nSaved combined forecasts for all regions to forecasts_all_regions_H{H}.csv')
if eval_rows:
    eval_df = pd.DataFrame(eval_rows)
    eval_df.to_csv(os.path.join(out_dir, 'per_region_holdout_eval.csv'), index=False)
    print('Saved per-region holdout evaluation (R2 primary) to per_region_holdout_eval.csv')


Summaries:
- The file per_region_holdout_eval.csv contains holdout R2 and MAE for Rainfall and Temperature (where available). R2 is the primary column to inspect.
- forecasts_all_regions_H{H}.csv contains the H-step forecasts for all regions.

Next suggestions:
- If some regions have very low or negative R2, inspect their time series with Notebook 5 plots and consider: more region-specific features, longer training windows, or simpler models for those regions.
- If you want prediction intervals, either use quantile regressors (LightGBM/Sklearn-Quantile) or bootstrap residuals on holdout to produce empirical intervals.
