In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [2]:
df = pd.read_csv("../data/ev_charging_data.csv")

# Create full datetime
df["datetime"] = pd.to_datetime(df["date"]) + pd.to_timedelta(df["hour"], unit="h")

# Sort per station chronologically
df = df.sort_values(["station_id", "datetime"])

df.head()


Unnamed: 0,date,hour,station_id,energy_kwh,datetime
0,2025-01-01,0,Station_1,10.63,2025-01-01 00:00:00
5,2025-01-01,1,Station_1,11.02,2025-01-01 01:00:00
10,2025-01-01,2,Station_1,7.88,2025-01-01 02:00:00
15,2025-01-01,3,Station_1,6.26,2025-01-01 03:00:00
20,2025-01-01,4,Station_1,13.86,2025-01-01 04:00:00


In [4]:
df["lag_1"] = df.groupby("station_id")["energy_kwh"].shift(1)
df["lag_24"] = df.groupby("station_id")["energy_kwh"].shift(24)
df["rolling_mean_3"] = (
    df.groupby("station_id")["energy_kwh"]
    .rolling(3)
    .mean()
    .reset_index(level=0, drop=True)
)
df["day_of_week"] = df["datetime"].dt.weekday
df["is_weekend"] = (df["day_of_week"] >= 5).astype(int)
df = df.dropna()

df.head()



Unnamed: 0,date,hour,station_id,energy_kwh,datetime,lag_1,lag_24,rolling_mean_3,day_of_week,is_weekend
120,2025-01-02,0,Station_1,6.82,2025-01-02 00:00:00,14.85,10.63,10.72,3,0
125,2025-01-02,1,Station_1,12.85,2025-01-02 01:00:00,6.82,11.02,11.506667,3,0
130,2025-01-02,2,Station_1,17.87,2025-01-02 02:00:00,12.85,7.88,12.513333,3,0
135,2025-01-02,3,Station_1,10.41,2025-01-02 03:00:00,17.87,6.26,13.71,3,0
140,2025-01-02,4,Station_1,14.61,2025-01-02 04:00:00,10.41,13.86,14.296667,3,0


In [5]:
df = pd.get_dummies(df, columns=["station_id"], drop_first=True)

df.head()


Unnamed: 0,date,hour,energy_kwh,datetime,lag_1,lag_24,rolling_mean_3,day_of_week,is_weekend,station_id_Station_2,station_id_Station_3,station_id_Station_4,station_id_Station_5
120,2025-01-02,0,6.82,2025-01-02 00:00:00,14.85,10.63,10.72,3,0,False,False,False,False
125,2025-01-02,1,12.85,2025-01-02 01:00:00,6.82,11.02,11.506667,3,0,False,False,False,False
130,2025-01-02,2,17.87,2025-01-02 02:00:00,12.85,7.88,12.513333,3,0,False,False,False,False
135,2025-01-02,3,10.41,2025-01-02 03:00:00,17.87,6.26,13.71,3,0,False,False,False,False
140,2025-01-02,4,14.61,2025-01-02 04:00:00,10.41,13.86,14.296667,3,0,False,False,False,False


In [6]:
feature_cols = [
    "hour",
    "day_of_week",
    "is_weekend",
    "lag_1",
    "lag_24",
    "rolling_mean_3"
] + [col for col in df.columns if col.startswith("station_id_")]

X = df[feature_cols]
y = df["energy_kwh"]


In [7]:
split_index = int(len(df) * 0.8)

X_train = X.iloc[:split_index]
X_test = X.iloc[split_index:]

y_train = y.iloc[:split_index]
y_test = y.iloc[split_index:]


In [8]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

mae_lr = mean_absolute_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))

print("Linear Regression")
print("MAE:", round(mae_lr, 2))
print("RMSE:", round(rmse_lr, 2))


Linear Regression
MAE: 3.32
RMSE: 4.22


In [9]:
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=12,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))

print("\nRandom Forest")
print("MAE:", round(mae_rf, 2))
print("RMSE:", round(rmse_rf, 2))



Random Forest
MAE: 2.35
RMSE: 2.98


In [10]:
results = pd.DataFrame({
    "Model": ["Linear Regression", "Random Forest"],
    "MAE": [mae_lr, mae_rf],
    "RMSE": [rmse_lr, rmse_rf]
})

results


Unnamed: 0,Model,MAE,RMSE
0,Linear Regression,3.316649,4.222276
1,Random Forest,2.350784,2.977188


In [11]:
joblib.dump(rf, "../models/ev_demand_model.pkl")
print("Model saved successfully.")


Model saved successfully.
