# Notebook 6 — Final RandomForest Models (Region-aware) and Forecast to 2030

This notebook trains two RandomForest models (Rainfall and Temperature) in a region-aware way (REGION one-hot), saves holdout predictions, and then runs a recursive per-region forecasting procedure to produce monthly forecasts up to Dec 2030. The recursive forecasting retrains per-region models on each region's full history and steps forward month-by-month, rebuilding lag/rolling features using actuals and previous predictions.

Outputs:
- ../4_data_analysis/model_datasets/final_forecasts.csv (holdout predictions)
- ../4_data_analysis/model_datasets/per_region_final_metrics_rf.csv
- ../4_data_analysis/model_datasets/models/rf_rain.joblib and rf_temp.joblib (global RFs)
- ../4_data_analysis/model_datasets/models/rf_rain_<region>.joblib and rf_temp_<region>.joblib (per-region models used for 2030 forecasts)
- ../4_data_analysis/model_datasets/final_forecasts_to_2030.csv (recursive forecasts to 2030)


In [1]:
import os
import re
import json
import joblib
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)
base_dir = os.path.join('..','4_data_analysis','model_datasets')
fe_path = os.path.join(base_dir, 'model_ready_dataset_fe.csv')
params_path = os.path.join(base_dir, 'best_params.json')
models_dir = os.path.join(base_dir, 'models')
os.makedirs(models_dir, exist_ok=True)


In [2]:
if not os.path.exists(fe_path):
    raise FileNotFoundError(f'Feature-engineered dataset not found at {fe_path}')
df = pd.read_csv(fe_path)
print('Loaded FE data shape:', df.shape)

best_params = {}
if os.path.exists(params_path):
    with open(params_path) as f:
        best_params = json.load(f)
print('Loaded best params keys:', list(best_params.keys()))


Loaded FE data shape: (2040, 21)
Loaded best params keys: ['ridge_raw', 'ridge_log1p', 'rf_raw', 'xgb_log1p']


In [3]:
# Ensure Time column exists and per-region ordering (same as Notebook 5)
if 'Time' not in df.columns:
    df = df.sort_values(['REGION','YEAR','Month_Num']).reset_index(drop=True)
    df['Time'] = df.groupby('REGION').cumcount()
print('Per-region counts sample:')
print(df.groupby('REGION').size().head())


Per-region counts sample:
REGION
Central    408
East       408
North      408
South      408
West       408
dtype: int64


In [4]:
def train_test_time_split(df, group_col='REGION', time_col='Time', test_periods=30):
    train_parts, test_parts = [], []
    for name, g in df.groupby(group_col):
        g_sorted = g.sort_values(time_col).reset_index(drop=True)
        if len(g_sorted) <= test_periods:
            raise ValueError(f"Region {name} has <= {test_periods} rows; reduce test_periods or drop region")
        train_parts.append(g_sorted.iloc[:-test_periods].copy())
        test_parts.append(g_sorted.iloc[-test_periods:].copy())
    return pd.concat(train_parts).reset_index(drop=True), pd.concat(test_parts).reset_index(drop=True)

train_df, test_df = train_test_time_split(df, test_periods=30)
print('Train shape:', train_df.shape, 'Test shape:', test_df.shape)


Train shape: (1890, 21) Test shape: (150, 21)


Encode REGION (one-hot) to let the global RFs learn region-specific effects. Build dummies on the concatenated train+test to guarantee consistent columns.

In [5]:
# Keep original REGION column for final output
test_region_series = test_df['REGION'].reset_index(drop=True)

# Concatenate and get dummies for REGION
combined = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
combined_d = pd.get_dummies(combined, columns=['REGION'], prefix='REG')

train_d = combined_d.iloc[:len(train_df)].reset_index(drop=True).copy()
test_d = combined_d.iloc[len(train_df):].reset_index(drop=True).copy()
print('After get_dummies, train_d shape:', train_d.shape, 'test_d shape:', test_d.shape)


After get_dummies, train_d shape: (1890, 25) test_d shape: (150, 25)


In [6]:
# Build feature list: exclude identifiers and targets; include REGION dummies
exclude = ['YEAR','Month','Month_Num','Rainfall','Temperature','Time']
# keep YEAR and Time as features explicitly
features = [c for c in train_d.columns if c not in exclude]
print('Feature count:', len(features))


Feature count: 19


Helper metrics

In [7]:
def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred,)
def mae(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred)
def r2(y_true, y_pred):
    return r2_score(y_true, y_pred)
def metrics(y_true, y_pred):
    return {'rmse': rmse(y_true, y_pred), 'mae': mae(y_true, y_pred), 'r2': r2(y_true, y_pred)}


In [8]:
def rf_from_params(params_source):
    params = {} if params_source is None else params_source.copy()
    rf_kwargs = {}
    rf_kwargs['n_estimators'] = int(params.get('n_estimators', 200))
    rf_kwargs['max_depth'] = None if params.get('max_depth') in [None,'None'] else params.get('max_depth')
    rf_kwargs['max_features'] = params.get('max_features', 'auto')
    rf_kwargs['random_state'] = 42
    rf_kwargs['n_jobs'] = -1
    return RandomForestRegressor(**rf_kwargs)

# choose RF params if present
rf_params = None
if 'rf_raw' in best_params:
    rf_params = best_params['rf_raw']
elif 'rf_log1p' in best_params:
    rf_params = best_params['rf_log1p']
elif 'rf' in best_params:
    rf_params = best_params['rf']
print('RF params source:', 'provided' if rf_params is not None else 'default')


RF params source: provided


In [9]:
# Prepare training matrices
X_tr = train_d[features]
X_te = test_d[features]

y_tr_rain = train_d['Rainfall'].values
y_te_rain = test_d['Rainfall'].values

y_tr_temp = train_d['Temperature'].values
y_te_temp = test_d['Temperature'].values

# Train RF for Rainfall (global, region-dummy aware)
rf_rain = rf_from_params(rf_params)
rf_rain.fit(X_tr, y_tr_rain)
pred_rain = rf_rain.predict(X_te)
pred_rain = np.clip(pred_rain, 0, None)

# Train RF for Temperature (global, region-dummy aware)
rf_temp = rf_from_params(rf_params)
rf_temp.fit(X_tr, y_tr_temp)
pred_temp = rf_temp.predict(X_te)

results = {
    'rain': metrics(y_te_rain, pred_rain),
    'temp': metrics(y_te_temp, pred_temp)
}
print('Holdout metrics (global):')
pprint(results)


Holdout metrics (global):
{'rain': {'mae': 0.5006030466197376,
          'r2': 0.8474583208914142,
          'rmse': 0.8835635569652258},
 'temp': {'mae': 0.8658784768450096,
          'r2': 0.9312898684634472,
          'rmse': 1.2051059439324308}}


In [10]:
# Build final output including REGION and DATE for the holdout predictions
test_out = test_df.reset_index(drop=True).copy()
test_out['Predicted_Rainfall'] = pred_rain
test_out['Predicted_Temperature'] = pred_temp

# Build DATE column using YEAR and Month_Num if present, else use Month
if 'Month_Num' in test_out.columns:
    month_col = 'Month_Num'
else:
    month_col = 'Month'

def build_date(row):
    try:
        m = int(row[month_col])
    except Exception:
        try:
            m = pd.to_datetime(str(row[month_col]), format='%b').month
        except Exception:
            try:
                m = pd.to_datetime(str(row[month_col]), format='%B').month
            except Exception:
                m = 1
    return pd.Timestamp(year=int(row['YEAR']), month=int(m), day=1)

test_out['DATE'] = test_out.apply(build_date, axis=1)

final_forecasts = test_out[['DATE','REGION','Predicted_Rainfall','Predicted_Temperature']].copy()
final_forecasts = final_forecasts.sort_values(['REGION','DATE']).reset_index(drop=True)

out_path = os.path.join(base_dir, 'final_forecasts.csv')
final_forecasts.to_csv(out_path, index=False)
print('Saved final (holdout) forecasts to', out_path)


Saved final (holdout) forecasts to ..\4_data_analysis\model_datasets\final_forecasts.csv


In [11]:
# Save per-region metrics for holdout
rows = []
for region, g in test_out.groupby('REGION'):
    rows.append({'REGION': region,
                'rain_rmse': rmse(g['Rainfall'], g['Predicted_Rainfall']),
                'rain_mae': mae(g['Rainfall'], g['Predicted_Rainfall']),
                'rain_r2': r2(g['Rainfall'], g['Predicted_Rainfall']),
                'temp_rmse': rmse(g['Temperature'], g['Predicted_Temperature']),
                'temp_mae': mae(g['Temperature'], g['Predicted_Temperature']),
                'temp_r2': r2(g['Temperature'], g['Predicted_Temperature']),
                'n': len(g)})
per_region_metrics = pd.DataFrame(rows).sort_values('REGION')
per_region_metrics.to_csv(os.path.join(base_dir, 'per_region_final_metrics_rf.csv'), index=False)
print('Saved per-region metrics to', os.path.join(base_dir, 'per_region_final_metrics_rf.csv'))

Saved per-region metrics to ..\4_data_analysis\model_datasets\per_region_final_metrics_rf.csv


In [12]:
# Ensure directory exists
os.makedirs(models_dir, exist_ok=True)

# Save models
rf_rain_path = os.path.join(models_dir, 'rf_rain.joblib')
rf_temp_path = os.path.join(models_dir, 'rf_temp.joblib')

joblib.dump(rf_rain, rf_rain_path)
joblib.dump(rf_temp, rf_temp_path)

print('Saved RF models to:', rf_rain_path, rf_temp_path)



Saved RF models to: ..\4_data_analysis\model_datasets\models\rf_rain.joblib ..\4_data_analysis\model_datasets\models\rf_temp.joblib


Forecast to 2030 — per-region recursive forecasting.

The following cell retrains per-region RandomForest models on each region's full history and performs iterative forecasts month-by-month until Dec 2030. It uses lag/rolling columns heuristically detected from the FE dataset (columns containing 'lag' or 'roll' patterns). The exact code from your requested snippet is integrated below.

In [13]:
# Forecast to 2030 (per-region recursive forecasting)
from datetime import datetime

df_full = df.copy()
# Ensure Time exists and rows are ordered per region
if 'Time' not in df_full.columns:
    df_full = df_full.sort_values(['REGION','YEAR','Month_Num']).reset_index(drop=True)
    df_full['Time'] = df_full.groupby('REGION').cumcount()

def find_feature_groups(cols, prefix):
    # Finds columns containing prefix and "lag" or "roll"
    lag_cols = [c for c in cols if re.search(rf'^{prefix}.*lag', c, flags=re.I)]
    roll_cols = [c for c in cols if re.search(rf'^{prefix}.*roll|{prefix}.*rolling', c, flags=re.I)]
    return sorted(lag_cols), sorted(roll_cols)

all_cols = df_full.columns.tolist()
rain_lags, rain_rolls = find_feature_groups(all_cols, 'Rainfall')
temp_lags, temp_rolls = find_feature_groups(all_cols, 'Temperature')

exclude = ['YEAR','Month','Month_Num','Rainfall','Temperature','Time','REGION']
base_features = [c for c in df_full.columns if c not in exclude and c not in [*rain_lags, *rain_rolls, *temp_lags, *temp_rolls]]
print('Base features (kept static):', base_features)
print('Rainfall lag cols found:', rain_lags)
print('Rainfall roll cols found:', rain_rolls)
print('Temperature lag cols found:', temp_lags)
print('Temperature roll cols found:', temp_rolls)

def next_month(year, month):
    month += 1
    if month > 12:
        month = 1
        year += 1
    return year, month

end_year, end_month = 2030, 12

final_rows = []
for region, g in df_full.groupby('REGION'):
    g = g.sort_values(['YEAR','Month_Num']).reset_index(drop=True)
    # Build feature_cols list for this region based on available columns
    feature_cols = base_features + rain_lags + rain_rolls + temp_lags + temp_rolls
    feature_cols = [c for c in feature_cols if c in g.columns]

    # Train per-region models on full history
    X_full = g[feature_cols]
    y_rain = g['Rainfall'].values
    y_temp = g['Temperature'].values

    rf_rain_reg = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
    rf_temp_reg = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
    rf_rain_reg.fit(X_full, y_rain)
    rf_temp_reg.fit(X_full, y_temp)

    # Save per-region models
    rn = re.sub(r'\W+','_', region)
    prain_path = os.path.join(models_dir, f'rf_rain_{rn}.joblib')
    ptemp_path = os.path.join(models_dir, f'rf_temp_{rn}.joblib')
    joblib.dump(rf_rain_reg, prain_path)
    joblib.dump(rf_temp_reg, ptemp_path)
    print(f'Saved per-region models for {region} ->', prain_path, ptemp_path)

    # Prepare history for recursive forecasting (use actuals + future predicted appended)
    history = g.copy()
    last_year, last_month = int(history.iloc[-1]['YEAR']), int(history.iloc[-1]['Month_Num'])
    y, m = next_month(last_year, last_month)

    future_dates = []
    while (y < end_year) or (y == end_year and m <= end_month):
        future_dates.append((y, m))
        y, m = next_month(y, m)

    for (fy, fm) in future_dates:
        new_row = {}
        new_row['YEAR'] = fy
        new_row['Month_Num'] = fm
        new_row['Time'] = history['Time'].iloc[-1] + 1

        # static base features: carry forward last known value or compute simple time features
        for bf in base_features:
            if bf in history.columns:
                new_row[bf] = history[bf].iloc[-1]
            else:
                if bf == 'Month_sin':
                    new_row['Month_sin'] = np.sin(2*np.pi*(fm/12))
                elif bf == 'Month_cos':
                    new_row['Month_cos'] = np.cos(2*np.pi*(fm/12))
                elif bf == 'YEAR':
                    new_row['YEAR'] = fy
                else:
                    new_row[bf] = np.nan

        # compute Rainfall lag features
        for lag_col in rain_lags:
            m_l = re.search(r'lag[_\-]?(\d+)', lag_col, flags=re.I)
            if m_l:
                k = int(m_l.group(1))
                if len(history) >= k:
                    new_row[lag_col] = history['Rainfall'].iloc[-k]
                else:
                    new_row[lag_col] = history['Rainfall'].iloc[0]
            else:
                new_row[lag_col] = history[lag_col].iloc[-1] if lag_col in history.columns else np.nan

        # compute Temperature lag features
        for lag_col in temp_lags:
            m_l = re.search(r'lag[_\-]?(\d+)', lag_col, flags=re.I)
            if m_l:
                k = int(m_l.group(1))
                if len(history) >= k:
                    new_row[lag_col] = history['Temperature'].iloc[-k]
                else:
                    new_row[lag_col] = history['Temperature'].iloc[0]
            else:
                new_row[lag_col] = history[lag_col].iloc[-1] if lag_col in history.columns else np.nan

        # rolling features for Rainfall
        for roll_col in rain_rolls:
            mm = re.search(r'roll[_\-]?(\d+)', roll_col, flags=re.I)
            if mm:
                w = int(mm.group(1))
                vals = history['Rainfall'].iloc[-w:] if len(history) >= w else history['Rainfall']
                new_row[roll_col] = vals.mean() if len(vals)>0 else history['Rainfall'].iloc[-1]
            else:
                new_row[roll_col] = history[roll_col].iloc[-1] if roll_col in history.columns else np.nan

        # rolling features for Temperature
        for roll_col in temp_rolls:
            mm = re.search(r'roll[_\-]?(\d+)', roll_col, flags=re.I)
            if mm:
                w = int(mm.group(1))
                vals = history['Temperature'].iloc[-w:] if len(history) >= w else history['Temperature']
                new_row[roll_col] = vals.mean() if len(vals)>0 else history['Temperature'].iloc[-1]
            else:
                new_row[roll_col] = history[roll_col].iloc[-1] if roll_col in history.columns else np.nan

        # Build X row for prediction
        X_row = []
        for col in feature_cols:
            X_row.append(new_row.get(col, np.nan))
        X_row_df = pd.DataFrame([X_row], columns=feature_cols)

        # Predict
        p_r = rf_rain_reg.predict(X_row_df)[0]
        p_t = rf_temp_reg.predict(X_row_df)[0]
        p_r = max(0.0, p_r)

        new_row['Rainfall'] = p_r
        new_row['Temperature'] = p_t

        if 'Month_sin' in feature_cols and 'Month_sin' not in new_row:
            new_row['Month_sin'] = np.sin(2*np.pi*(fm/12))
        if 'Month_cos' in feature_cols and 'Month_cos' not in new_row:
            new_row['Month_cos'] = np.cos(2*np.pi*(fm/12))

        # Append new_row to history
        history = pd.concat([history, pd.DataFrame([new_row])], ignore_index=True, sort=False)

        # Save final row to results
        date = pd.Timestamp(year=int(fy), month=int(fm), day=1)
        final_rows.append({
            'DATE': date,
            'REGION': region,
            'Predicted_Rainfall': p_r,
            'Predicted_Temperature': p_t
        })

# Save final forecasts
os.makedirs(models_dir, exist_ok=True)
final_df = pd.DataFrame(final_rows)
final_df = final_df.sort_values(['REGION','DATE']).reset_index(drop=True)
out_path = os.path.join(base_dir, 'final_forecasts_to_2030.csv')
final_df.to_csv(out_path, index=False)
print('Saved forecasts to', out_path)


Base features (kept static): ['Month_sin', 'Month_cos']
Rainfall lag cols found: ['Rainfall_lag_1', 'Rainfall_lag_12', 'Rainfall_lag_2', 'Rainfall_lag_3']
Rainfall roll cols found: ['Rainfall_roll12', 'Rainfall_roll3']
Temperature lag cols found: ['Temperature_lag_1', 'Temperature_lag_12', 'Temperature_lag_2', 'Temperature_lag_3']
Temperature roll cols found: ['Temperature_roll12', 'Temperature_roll3']
Saved per-region models for Central -> ..\4_data_analysis\model_datasets\models\rf_rain_Central.joblib ..\4_data_analysis\model_datasets\models\rf_temp_Central.joblib
Saved per-region models for Central -> ..\4_data_analysis\model_datasets\models\rf_rain_Central.joblib ..\4_data_analysis\model_datasets\models\rf_temp_Central.joblib
Saved per-region models for East -> ..\4_data_analysis\model_datasets\models\rf_rain_East.joblib ..\4_data_analysis\model_datasets\models\rf_temp_East.joblib
Saved per-region models for East -> ..\4_data_analysis\model_datasets\models\rf_rain_East.joblib ..\4_