In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Read the monthly demand table from the CSV in the working directory;
# the following cells derive features and model inputs from this frame.
monthly_df = pd.read_csv(filepath_or_buffer="Monthly_Demand_Dataset.csv")

In [None]:
# Order rows by month inside each (state, district) pair so the positional
# shift / rolling operations below look backwards in time.
monthly_df = monthly_df.sort_values(["state", "district", "month"])

# Month number extracted from the "month" column, used as a seasonality feature.
monthly_df["month_num"] = pd.to_datetime(monthly_df["month"]).dt.month

# Build the grouped view once and reuse it for every history feature.
# NOTE(review): the lags are positional (previous *row* of the district), not
# calendar-based. When a district skips a month, lag_1 holds the last observed
# month rather than the previous calendar month -- confirm this is intended.
load_by_district = monthly_df.groupby(["state", "district"])["service_load"]

# Service load one and two observations back within the same district.
monthly_df["lag_1"] = load_by_district.shift(1)
monthly_df["lag_2"] = load_by_district.shift(2)

# Mean of the three observations preceding the current row. The window is
# evaluated inside each group: rolling over the flat shifted Series only stays
# within a district because of the NaN that shift() leaves at each group start
# combined with the default min_periods, which is easy to break by accident.
monthly_df["rolling_3_mean"] = load_by_district.transform(
    lambda load: load.shift(1).rolling(3).mean()
)

# The first rows of every district have no complete history; drop them.
monthly_df = monthly_df.dropna(subset=["lag_1", "lag_2", "rolling_3_mean"])

print("✅ monthly_df shape after lag features:", monthly_df.shape)
monthly_df.head()


✅ monthly_df shape after lag features: (1061, 11)


Unnamed: 0,state,district,month,total_enroll,total_updates,service_load,next_month_service_load,month_num,lag_1,lag_2,rolling_3_mean
110,Andhra Pradesh,Prakasam,2025-11,2536,25768,28304,8472,11,21580.0,20763.0,14380.666667
149,Assam,Baksa,2025-06,52,62,114,393,6,285.0,2995.0,1106.0
150,Assam,Baksa,2025-07,220,173,393,3437,7,114.0,285.0,1131.333333
151,Assam,Baksa,2025-09,844,2593,3437,3461,9,393.0,114.0,264.0
152,Assam,Baksa,2025-10,671,2790,3461,3862,10,3437.0,393.0,1314.666667


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predictor columns: current-month volumes, the month number, and the
# history features (lags and 3-observation mean) built in the previous cell.
feature_columns = [
    "total_enroll",
    "total_updates",
    "service_load",
    "month_num",
    "lag_1",
    "lag_2",
    "rolling_3_mean",
]

# Design matrix and regression target (the following month's service load).
X = monthly_df[feature_columns]
y = monthly_df["next_month_service_load"]


In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): train_test_split shuffles rows by default, so later months of
# a district can end up in training while its earlier months are in the test
# set. Consider a chronological hold-out if the reported metrics should
# reflect true forecasting performance -- confirm with the author.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Random forest regressor for next-month service load.
demand_model = RandomForestRegressor(
    n_estimators=400,      # number of trees averaged by the ensemble
    max_depth=20,          # cap on tree depth to limit overfitting
    min_samples_split=5,   # a node needs at least 5 samples to be split
    min_samples_leaf=2,    # every leaf keeps at least 2 samples
    random_state=42,       # fixed seed for reproducible trees
    n_jobs=-1,             # use all available CPU cores
)

demand_model.fit(X_train, y_train)

# Status line matching the output recorded for this cell; without it the cell
# would display the estimator repr instead of the message.
print("✅ Improved Demand Prediction Model trained!")



✅ Improved Demand Prediction Model trained!


In [7]:
# Score the fitted model on the held-out rows.
y_pred = demand_model.predict(X_test)

# Error metrics: MAE and RMSE are in the target's units; R2 is unitless.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print one labelled line per metric.
for label, score in (
    ("✅ Improved MAE :", mae),
    ("✅ Improved RMSE:", rmse),
    ("✅ Improved R2  :", r2),
):
    print(label, score)


✅ Improved MAE : 4666.572424617424
✅ Improved RMSE: 7869.509383119679
✅ Improved R2  : 0.7090034416199733


In [9]:
import joblib

# Serialize the fitted forest to disk so it can be reloaded without retraining.
joblib.dump(value=demand_model, filename="Demand_Prediction_Model.pkl")
print("✅ Saved: Demand_Prediction_Model.pkl")

✅ Saved: Demand_Prediction_Model.pkl
