In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [21]:
df = pd.read_csv("../data/transactions_regenerated_ROUNDED.csv")

print("Shape:", df.shape)
df.head()

Shape: (4721, 12)


Unnamed: 0,RentalID,Date,Year,Month,IsSeason,IsWeekend,VehicleType,RentalDays,DailyPriceLKR,TotalPriceLKR,CustomerType,Notes
0,1,2023-01-01,2023,1,1,1,Car,4,7500,30000,,
1,2,2023-01-01,2023,1,1,1,Car,1,9500,9500,,
2,3,2023-01-01,2023,1,1,1,Tuk Tuk,1,4000,4000,,
3,4,2023-01-01,2023,1,1,1,Car,4,7500,30000,,
4,5,2023-01-01,2023,1,1,1,Car,2,7500,15000,,


In [22]:
#Checking for missing values
df.isna().sum()

(df.isna().mean() * 100).round(2)

RentalID          0.00
Date              0.00
Year              0.00
Month             0.00
IsSeason          0.00
IsWeekend         0.00
VehicleType       0.00
RentalDays        0.00
DailyPriceLKR     0.00
TotalPriceLKR     0.00
CustomerType     99.39
Notes            99.77
dtype: float64

In [23]:
# Parse date
df["Date"] = pd.to_datetime(df["Date"])

# Demand target = bookings count (each row = one booking)
demand = (
    df.groupby([df["Date"].dt.date, "VehicleType"])
      .size()
      .reset_index(name="BookingsCount")
)

print("Demand shape (only existing combos):", demand.shape)
demand.head()

Demand shape (only existing combos): (2216, 3)


Unnamed: 0,Date,VehicleType,BookingsCount
0,2023-01-01,Bike,1
1,2023-01-01,Car,5
2,2023-01-01,Tuk Tuk,1
3,2023-01-02,Bike,5
4,2023-01-02,Car,1


In [24]:
#Add missing (Date × VehicleType) combos as 0 bookings
all_dates = pd.date_range(df["Date"].min(), df["Date"].max(), freq="D").date
vehicle_types = sorted(df["VehicleType"].dropna().unique().tolist())

full_grid = pd.MultiIndex.from_product(
    [all_dates, vehicle_types],
    names=["Date", "VehicleType"]
).to_frame(index=False)

demand_full = (
    full_grid.merge(demand, on=["Date", "VehicleType"], how="left")
             .fillna({"BookingsCount": 0})
)

print("Demand FULL shape (with zeros):", demand_full.shape)
demand_full.head()

Demand FULL shape (with zeros): (3453, 3)


Unnamed: 0,Date,VehicleType,BookingsCount
0,2023-01-01,Bike,1.0
1,2023-01-01,Car,5.0
2,2023-01-01,Tuk Tuk,1.0
3,2023-01-02,Bike,5.0
4,2023-01-02,Car,1.0


In [25]:
demand_full["Date"] = pd.to_datetime(demand_full["Date"])
demand_full["Year"] = demand_full["Date"].dt.year
demand_full["Month"] = demand_full["Date"].dt.month
demand_full["DayOfWeek"] = demand_full["Date"].dt.weekday
demand_full["IsWeekend"] = (demand_full["DayOfWeek"] >= 5).astype(int)

# Season: Oct–Mar
season_months = [10, 11, 12, 1, 2, 3]
demand_full["IsSeason"] = demand_full["Month"].isin(season_months).astype(int)

demand_full.head()

Unnamed: 0,Date,VehicleType,BookingsCount,Year,Month,DayOfWeek,IsWeekend,IsSeason
0,2023-01-01,Bike,1.0,2023,1,6,1,1
1,2023-01-01,Car,5.0,2023,1,6,1,1
2,2023-01-01,Tuk Tuk,1.0,2023,1,6,1,1
3,2023-01-02,Bike,5.0,2023,1,0,0,1
4,2023-01-02,Car,1.0,2023,1,0,0,1


In [26]:
X = demand_full[["Year", "Month", "DayOfWeek", "IsSeason", "IsWeekend", "VehicleType"]].copy()
y = demand_full["BookingsCount"].astype(float)

X = pd.get_dummies(X, columns=["VehicleType"], drop_first=False)

print("X shape:", X.shape)
print("y shape:", y.shape)
X.head()

X shape: (3453, 8)
y shape: (3453,)


Unnamed: 0,Year,Month,DayOfWeek,IsSeason,IsWeekend,VehicleType_Bike,VehicleType_Car,VehicleType_Tuk Tuk
0,2023,1,6,1,1,True,False,False
1,2023,1,6,1,1,False,True,False
2,2023,1,6,1,1,False,False,True
3,2023,1,0,1,0,True,False,False
4,2023,1,0,1,0,False,True,False


In [27]:
x_train, x_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=100,
    shuffle=True
)

print("Train:", x_train.shape)
print("Test :", x_test.shape)

Train: (2762, 8)
Test : (691, 8)


In [28]:
# 1) Linear Regression
lr = LinearRegression()
lr.fit(x_train, y_train)
pred_lr = lr.predict(x_test)

rmse_lr = np.sqrt(mean_squared_error(y_test, pred_lr))
mae_lr  = mean_absolute_error(y_test, pred_lr)
r2_lr   = r2_score(y_test, pred_lr)
print("Linear Regression")
print("RMSE:", rmse_lr)
print("MAE :", mae_lr)
print("R2  :", r2_lr)

Linear Regression
RMSE: 1.3272193657633755
MAE : 0.9337741513861387
R2  : 0.3456440873697447


In [29]:
# 2) Decision Tree
dt = DecisionTreeRegressor(random_state=100)
dt.fit(x_train, y_train)
pred_dt = dt.predict(x_test)

rmse_dt = np.sqrt(mean_squared_error(y_test, pred_dt))
mae_dt  = mean_absolute_error(y_test, pred_dt)
r2_dt   = r2_score(y_test, pred_dt)

print("\nDecision Tree")
print("RMSE:", rmse_dt)
print("MAE :", mae_dt)
print("R2  :", r2_dt)


Decision Tree
RMSE: 1.0506737935656378
MAE : 0.6495417269657502
R2  : 0.589923944824448


In [30]:
# 3) Random Forest
rf = RandomForestRegressor(random_state=100, n_estimators=300)
rf.fit(x_train, y_train)
pred_rf = rf.predict(x_test)

rmse_rf = np.sqrt(mean_squared_error(y_test, pred_rf))
mae_rf  = mean_absolute_error(y_test, pred_rf)
r2_rf   = r2_score(y_test, pred_rf)

print("\nRandom Forest")
print("RMSE:", rmse_rf)
print("MAE :", mae_rf)
print("R2  :", r2_rf)


Random Forest
RMSE: 1.0220934333002791
MAE : 0.6436654318774434
R2  : 0.6119302354040752


In [31]:
# 4) KNN 
knn = Pipeline([
    ("scaler", StandardScaler()),
    ("model", KNeighborsRegressor(n_neighbors=8))
])
knn.fit(x_train, y_train)
pred_knn = knn.predict(x_test)

rmse_knn = np.sqrt(mean_squared_error(y_test, pred_knn))
mae_knn  = mean_absolute_error(y_test, pred_knn)
r2_knn   = r2_score(y_test, pred_knn)

print("\nKNN Regressor")
print("RMSE:", rmse_knn)
print("MAE :", mae_knn)
print("R2  :", r2_knn)


KNN Regressor
RMSE: 0.9457757021231215
MAE : 0.6219247467438495
R2  : 0.667719453689527


In [32]:
# 5) SVM (needs scaling)
svm = Pipeline([
    ("scaler", StandardScaler()),
    ("model", SVR(C=10, epsilon=0.1, kernel="rbf"))
])
svm.fit(x_train, y_train)
pred_svm = svm.predict(x_test)

rmse_svm = np.sqrt(mean_squared_error(y_test, pred_svm))
mae_svm  = mean_absolute_error(y_test, pred_svm)
r2_svm   = r2_score(y_test, pred_svm)

print("\nSVM Regressor")
print("RMSE:", rmse_svm)
print("MAE :", mae_svm)
print("R2  :", r2_svm)


SVM Regressor
RMSE: 0.9355146899105099
MAE : 0.583270371314013
R2  : 0.6748903699844324


In [33]:
results = pd.DataFrame({
    "Model": ["LinearRegression", "DecisionTree", "RandomForest", "KNN", "SVM"],
    "RMSE": [rmse_lr, rmse_dt, rmse_rf, rmse_knn, rmse_svm],
    "MAE":  [mae_lr,  mae_dt,  mae_rf,  mae_knn,  mae_svm],
    "R2":   [r2_lr,   r2_dt,   r2_rf,   r2_knn,   r2_svm]
}).sort_values("R2", ascending=False)

print(results)

best_name = results.iloc[0]["Model"]
print("\nBest model is -", best_name)

model_map = {
    "LinearRegression": lr,
    "DecisionTree": dt,
    "RandomForest": rf,
    "KNN": knn,
    "SVM": svm
}

best_model = model_map[best_name]

              Model      RMSE       MAE        R2
4               SVM  0.935515  0.583270  0.674890
3               KNN  0.945776  0.621925  0.667719
2      RandomForest  1.022093  0.643665  0.611930
1      DecisionTree  1.050674  0.649542  0.589924
0  LinearRegression  1.327219  0.933774  0.345644

Best model is - SVM


In [34]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(
    best_model,
    x_train,
    y_train,
    cv=5,
    scoring="r2"
)

print("Cross Validation (R2) Scores:", cv_scores)
print("Mean CV R2:", cv_scores.mean())
print("Std CV R2 :", cv_scores.std())

Cross Validation (R2) Scores: [0.70197129 0.70623214 0.63456525 0.70552671 0.69059253]
Mean CV R2: 0.6877775851459397
Std CV R2 : 0.027190218120248414


In [35]:
from sklearn.model_selection import GridSearchCV

tuned_model = best_model  # default

if best_name == "RandomForest":
    param_grid = {
        "n_estimators": [200, 400, 600],
        "max_depth": [None, 5, 10],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }

    grid = GridSearchCV(
        estimator=RandomForestRegressor(random_state=100),
        param_grid=param_grid,
        cv=5,
        scoring="r2",
        n_jobs=-1
    )
    grid.fit(x_train, y_train)
    tuned_model = grid.best_estimator_

    print("✅ Tuned RandomForest")
    print("Best Params:", grid.best_params_)
    print("Best CV R2 :", grid.best_score_)

elif best_name == "DecisionTree":
    param_grid = {
        "max_depth": [None, 3, 5, 10, 15],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }

    grid = GridSearchCV(
        estimator=DecisionTreeRegressor(random_state=100),
        param_grid=param_grid,
        cv=5,
        scoring="r2",
        n_jobs=-1
    )
    grid.fit(x_train, y_train)
    tuned_model = grid.best_estimator_

    print("✅ Tuned DecisionTree")
    print("Best Params:", grid.best_params_)
    print("Best CV R2 :", grid.best_score_)

elif best_name == "KNN":
    param_grid = {
        "model__n_neighbors": [3, 5, 7, 9, 11, 15],
        "model__weights": ["uniform", "distance"]
    }

    grid = GridSearchCV(
        estimator=knn,
        param_grid=param_grid,
        cv=5,
        scoring="r2",
        n_jobs=-1
    )
    grid.fit(x_train, y_train)
    tuned_model = grid.best_estimator_

    print("✅ Tuned KNN")
    print("Best Params:", grid.best_params_)
    print("Best CV R2 :", grid.best_score_)

elif best_name == "SVM":
    param_grid = {
        "model__C": [1, 10, 50],
        "model__epsilon": [0.05, 0.1, 0.2],
        "model__gamma": ["scale", "auto"]
    }

    grid = GridSearchCV(
        estimator=svm,
        param_grid=param_grid,
        cv=5,
        scoring="r2",
        n_jobs=-1
    )
    grid.fit(x_train, y_train)
    tuned_model = grid.best_estimator_

    print("✅ Tuned SVM")
    print("Best Params:", grid.best_params_)
    print("Best CV R2 :", grid.best_score_)

best_model_final = tuned_model

✅ Tuned SVM
Best Params: {'model__C': 10, 'model__epsilon': 0.2, 'model__gamma': 'scale'}
Best CV R2 : 0.690521331078056


In [36]:
final_pred = best_model_final.predict(x_test)

final_rmse = np.sqrt(mean_squared_error(y_test, final_pred))
final_mae  = mean_absolute_error(y_test, final_pred)
final_r2   = r2_score(y_test, final_pred)

print("\n✅ FINAL MODEL:", best_name)
print("FINAL RMSE:", final_rmse)
print("FINAL MAE :", final_mae)
print("FINAL R2  :", final_r2)


✅ FINAL MODEL: SVM
FINAL RMSE: 0.9366635093838941
FINAL MAE : 0.6187501713045997
FINAL R2  : 0.6740914052996074


In [37]:
import joblib

bundle = {
    "model": best_model_final,
    "features": list(X.columns),  # IMPORTANT: includes VehicleType dummy columns
    "vehicle_types": vehicle_types,
    "season_months": season_months,
    "metrics": {
        "rmse": float(final_rmse),
        "mae": float(final_mae),
        "r2": float(final_r2)
    },
    "random_state": 100,
    "split": "train_test_split 80/20",
    "target": "BookingsCount"
}

joblib.dump(bundle, "../models/best_demand_model.pkl")
print("✅ Saved -> ../models/best_demand_model.pkl")

✅ Saved -> ../models/best_demand_model.pkl
