In [43]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score, mean_absolute_error

In [44]:
# Load dataset
df = pd.read_csv("uganda_monthly_rainfall_and_climate_variables_1981_2025.csv")

df["date"] = pd.to_datetime(df["date"])

df = df.sort_values(by=["district", "date"])

# Extract year and month
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month

df["season"] = df["month"].map(
    {
        1: "DJF",
        2: "DJF",
        12: "DJF",
        3: "MAM",
        4: "MAM",
        5: "MAM",
        6: "JJA",
        7: "JJA",
        8: "JJA",
        9: "SON",
        10: "SON",
        11: "SON",
    }
)

# Convert 'MAM' to columns like season_MAM = 1
df = pd.get_dummies(df, columns=["season"], prefix="season", dtype=int)

print("Temporal features added successfully.")

Temporal features added successfully.


In [45]:
# Create lag features for rainfall
df["rain_lag_1"] = df.groupby("district")["rainfall_mm"].shift(1)
df["rain_lag_3"] = df.groupby("district")["rainfall_mm"].shift(3)

print("Lag features added successfully.")

Lag features added successfully.


In [46]:
# Create rolling mean features for rainfall
df["rainfall_rolling_mean_3"] = df.groupby("district")["rainfall_mm"].transform(
    lambda x: x.shift(1).rolling(window=3).mean()
)

print("Rolling mean features added successfully.")

Rolling mean features added successfully.


In [47]:
# Drop rows with NaN values created by lagging
df = df.dropna().reset_index(drop=True)

df.head()

Unnamed: 0,district,region,date,rainfall_mm,dewpoint_temperature,temperature,surface_pressure_pa,total_cloud_cover,wind_u_component,wind_v_component,year,month,season_DJF,season_JJA,season_MAM,season_SON,rain_lag_1,rain_lag_3,rainfall_rolling_mean_3
0,Abim,Northern,1981-04-01,133.041992,18.43658,23.10357,88423.49,0.820665,-1.020124,0.326704,1981,4,0,0,1,0,104.91126,7.921172,43.874615
1,Abim,Northern,1981-05-01,100.246476,18.3307,22.67858,88494.125,0.695249,-0.429709,0.58236,1981,5,0,0,1,0,133.041992,18.791414,85.581555
2,Abim,Northern,1981-06-01,106.520529,17.52154,22.47027,88612.555,0.51856,-0.313048,0.225815,1981,6,0,1,0,0,100.246476,104.91126,112.733243
3,Abim,Northern,1981-07-01,216.51484,17.64462,21.34826,88601.79,0.641735,-0.430527,-0.180639,1981,7,0,1,0,0,106.520529,133.041992,113.269666
4,Abim,Northern,1981-08-01,200.59392,17.6563,21.72808,88544.52,0.624331,-0.553444,-0.196151,1981,8,0,1,0,0,216.51484,100.246476,141.093948


In [48]:
features = [
    "dewpoint_temperature",
    "temperature",
    "surface_pressure_pa",
    "total_cloud_cover",
    "wind_u_component",
    "wind_v_component",
    "year",
    "month",
    "rain_lag_1",
    "rain_lag_3",
    "rainfall_rolling_mean_3",
    "season_DJF",
    "season_JJA",
    "season_MAM",
    "season_SON",
]

target = "rainfall_mm"

train = df[df["year"] <= 2018]
test = df[df["year"] > 2018]

print(
    f"Training years: {train['year'].min()} - {train['year'].max()} (Rows: {len(train)})"
)
print(f"Testing years: {test['year'].min()} - {test['year'].max()} (Rows: {len(test)})")

Training years: 1981 - 2018 (Rows: 54360)
Testing years: 2019 - 2025 (Rows: 9840)


In [49]:
# Initialize the Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Train
print("Training Random Forest...")
print(train.columns.tolist())
rf_model.fit(train[features], train[target])

# Predict
predictions = rf_model.predict(test[features])

# Evaluate
mse = mean_squared_error(test[target], predictions)
rmse = np.sqrt(mse)

print(f"Model Performance (RMSE): {rmse:.2f} mm")
print("Interpretation: On average, predictions are off by +/- {:.2f} mm".format(rmse))

baseline_rmse = np.sqrt(
    mean_squared_error(test[target], [train[target].mean()] * len(test))
)
print(f"Baseline (Guessing Average) RMSE: {baseline_rmse:.2f} mm")

Training Random Forest...
['district', 'region', 'date', 'rainfall_mm', 'dewpoint_temperature', 'temperature', 'surface_pressure_pa', 'total_cloud_cover', 'wind_u_component', 'wind_v_component', 'year', 'month', 'season_DJF', 'season_JJA', 'season_MAM', 'season_SON', 'rain_lag_1', 'rain_lag_3', 'rainfall_rolling_mean_3']


Model Performance (RMSE): 43.20 mm
Interpretation: On average, predictions are off by +/- 43.20 mm
Baseline (Guessing Average) RMSE: 68.15 mm


In [50]:
# Calaculate R² score
r2 = r2_score(test[target], predictions)

mae = mean_absolute_error(test[target], predictions)

# Calculate Pseudo Accuracy
mean_actual_rainfall = test[target].mean()
pseudo_accuracy = 100 * (1 - (mae / mean_actual_rainfall))
print(f"Pseudo Accuracy: {pseudo_accuracy:.1f}%")
print(f"Model Accuracy: {r2:.2%}")
print(f"Mean Absolute Error (MAE): {mae:.2f} mm")

Pseudo Accuracy: 72.2%
Model Accuracy: 59.39%
Mean Absolute Error (MAE): 30.50 mm


In [51]:
start_date = pd.to_datetime("2025-10-01")
months_to_forecast = pd.date_range(start="2025-11-01", periods=14, freq="MS")

districts = df["district"].unique()
forecasts = []

print(
    f"Forecasting rainfall for {len(districts)} starting from {start_date.date()} for {len(months_to_forecast)} months."
)

# Loop through every district
for district in districts:
    district_data = (
        df[(df["district"] == district) & (df["date"] <= start_date)]
        .sort_values(by="date")
        .tail(3)
    )

    # if district is missing data, skip
    if len(district_data) < 3:
        print(f"Skipping district {district} due to insufficient data.")
        continue

    region_name = district_data["region"].iloc[0]

    # Initialize memory with last 3 months of rainfall
    rainfall_memory = district_data["rainfall_mm"].tolist()

    # Get monthly averages for the district
    monthly_stats = (
        df[df["district"] == district].groupby("month").mean(numeric_only=True)
    )

    for forecast_date in months_to_forecast:
        month_num = forecast_date.month

        # Get seasonal data
        avg_stats = monthly_stats.loc[month_num]
        season_cat = (
            "DJF"
            if month_num in [12, 1, 2]
            else "MAM"
            if month_num in [3, 4, 5]
            else "JJA"
            if month_num in [6, 7, 8]
            else "SON"
        )

        # Calculate lags from memory
        current_lag_1 = rainfall_memory[-1]
        current_lag_3 = rainfall_memory[-3]
        current_rolling_mean_3 = np.mean(rainfall_memory[-3:])

        # Create feature row
        feature_row = {
            "dewpoint_temperature": avg_stats["dewpoint_temperature"],
            "temperature": avg_stats["temperature"],
            "surface_pressure_pa": avg_stats["surface_pressure_pa"],
            "total_cloud_cover": avg_stats["total_cloud_cover"],
            "wind_u_component": avg_stats["wind_u_component"],
            "wind_v_component": avg_stats["wind_v_component"],
            "year": forecast_date.year,
            "month": month_num,
            "rain_lag_1": current_lag_1,
            "rain_lag_3": current_lag_3,
            "rainfall_rolling_mean_3": current_rolling_mean_3,
            "season_DJF": 1 if season_cat == "DJF" else 0,
            "season_MAM": 1 if season_cat == "MAM" else 0,
            "season_JJA": 1 if season_cat == "JJA" else 0,
            "season_SON": 1 if season_cat == "SON" else 0,
        }

        feature_df = pd.DataFrame([feature_row])

        # Predict rainfall
        predicted_rainfall = rf_model.predict(feature_df[features])[0]
        predicted_rainfall = max(predicted_rainfall, 0)

        forecasts.append(
            {
                "district": district,
                "region": region_name,
                "date": forecast_date,
                "predicted_rainfall_mm": predicted_rainfall,
            }
        )
        rainfall_memory.append(predicted_rainfall)
        print("Phase 1 complete for", forecast_date.date())

        # update memory
        rainfall_memory.append(predicted_rainfall)

# Display forecasts
forecast_df = pd.DataFrame(forecasts)
forecast_df

Forecasting rainfall for 120 starting from 2025-10-01 for 14 months.
Phase 1 complete for 2025-11-01
Phase 1 complete for 2025-12-01
Phase 1 complete for 2026-01-01
Phase 1 complete for 2026-02-01
Phase 1 complete for 2026-03-01
Phase 1 complete for 2026-04-01
Phase 1 complete for 2026-05-01
Phase 1 complete for 2026-06-01
Phase 1 complete for 2026-07-01
Phase 1 complete for 2026-08-01
Phase 1 complete for 2026-09-01
Phase 1 complete for 2026-10-01
Phase 1 complete for 2026-11-01
Phase 1 complete for 2026-12-01
Phase 1 complete for 2025-11-01
Phase 1 complete for 2025-12-01
Phase 1 complete for 2026-01-01
Phase 1 complete for 2026-02-01
Phase 1 complete for 2026-03-01
Phase 1 complete for 2026-04-01
Phase 1 complete for 2026-05-01
Phase 1 complete for 2026-06-01
Phase 1 complete for 2026-07-01
Phase 1 complete for 2026-08-01
Phase 1 complete for 2026-09-01
Phase 1 complete for 2026-10-01
Phase 1 complete for 2026-11-01
Phase 1 complete for 2026-12-01
Phase 1 complete for 2025-11-01
Pha

Unnamed: 0,district,region,date,predicted_rainfall_mm
0,Abim,Northern,2025-11-01,62.380427
1,Abim,Northern,2025-12-01,18.433460
2,Abim,Northern,2026-01-01,9.236691
3,Abim,Northern,2026-02-01,27.251833
4,Abim,Northern,2026-03-01,62.001633
...,...,...,...,...
1675,Zombo,Northern,2026-08-01,141.116491
1676,Zombo,Northern,2026-09-01,147.382386
1677,Zombo,Northern,2026-10-01,165.604378
1678,Zombo,Northern,2026-11-01,105.866344


In [52]:
# Format date as YYYY-MM
forecast_df["date"] = pd.to_datetime(forecast_df["date"])
forecast_df["date"] = forecast_df["date"].dt.to_period("M").astype(str)

forecast_df

Unnamed: 0,district,region,date,predicted_rainfall_mm
0,Abim,Northern,2025-11,62.380427
1,Abim,Northern,2025-12,18.433460
2,Abim,Northern,2026-01,9.236691
3,Abim,Northern,2026-02,27.251833
4,Abim,Northern,2026-03,62.001633
...,...,...,...,...
1675,Zombo,Northern,2026-08,141.116491
1676,Zombo,Northern,2026-09,147.382386
1677,Zombo,Northern,2026-10,165.604378
1678,Zombo,Northern,2026-11,105.866344


In [53]:
import joblib

# save the trained model
joblib.dump(rf_model, "uganda_rainfall_forecast_model.pkl")

['uganda_rainfall_forecast_model.pkl']

In [54]:
# save the forecasted results
forecast_df.to_csv("uganda_rainfall_forecasts_2025_2026.csv", index=False)