In [19]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [20]:
df = pd.read_csv("../data/transactions_regenerated_ROUNDED.csv")
df.columns = df.columns.str.strip()

print("Shape:", df.shape)
df.head()

Shape: (4721, 12)


Unnamed: 0,RentalID,Date,Year,Month,IsSeason,IsWeekend,VehicleType,RentalDays,DailyPriceLKR,TotalPriceLKR,CustomerType,Notes
0,1,2023-01-01,2023,1,1,1,Car,4,7500,30000,,
1,2,2023-01-01,2023,1,1,1,Car,1,9500,9500,,
2,3,2023-01-01,2023,1,1,1,Tuk Tuk,1,4000,4000,,
3,4,2023-01-01,2023,1,1,1,Car,4,7500,30000,,
4,5,2023-01-01,2023,1,1,1,Car,2,7500,15000,,


In [21]:
print(df.columns)
df.info()

Index(['RentalID', 'Date', 'Year', 'Month', 'IsSeason', 'IsWeekend',
       'VehicleType', 'RentalDays', 'DailyPriceLKR', 'TotalPriceLKR',
       'CustomerType', 'Notes'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4721 entries, 0 to 4720
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   RentalID       4721 non-null   int64 
 1   Date           4721 non-null   object
 2   Year           4721 non-null   int64 
 3   Month          4721 non-null   int64 
 4   IsSeason       4721 non-null   int64 
 5   IsWeekend      4721 non-null   int64 
 6   VehicleType    4721 non-null   object
 7   RentalDays     4721 non-null   int64 
 8   DailyPriceLKR  4721 non-null   int64 
 9   TotalPriceLKR  4721 non-null   int64 
 10  CustomerType   29 non-null     object
 11  Notes          11 non-null     object
dtypes: int64(8), object(4)
memory usage: 442.7+ KB


In [22]:
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
df = df.dropna(subset=["Date", "TotalPriceLKR", "VehicleType"]).copy()
df.head()

Unnamed: 0,RentalID,Date,Year,Month,IsSeason,IsWeekend,VehicleType,RentalDays,DailyPriceLKR,TotalPriceLKR,CustomerType,Notes
0,1,2023-01-01,2023,1,1,1,Car,4,7500,30000,,
1,2,2023-01-01,2023,1,1,1,Car,1,9500,9500,,
2,3,2023-01-01,2023,1,1,1,Tuk Tuk,1,4000,4000,,
3,4,2023-01-01,2023,1,1,1,Car,4,7500,30000,,
4,5,2023-01-01,2023,1,1,1,Car,2,7500,15000,,


In [23]:
#Encode VehicleType
from sklearn.preprocessing import LabelEncoder

le_vehicle = LabelEncoder()
df["VehicleEncoded"] = le_vehicle.fit_transform(df["VehicleType"])

print("Vehicle types:", list(le_vehicle.classes_))
df[["VehicleType", "VehicleEncoded"]].head()

Vehicle types: ['Bike', 'Car', 'Tuk Tuk']


Unnamed: 0,VehicleType,VehicleEncoded
0,Car,1
1,Car,1
2,Tuk Tuk,2
3,Car,1
4,Car,1


In [24]:
x = df[["Year", "Month", "IsSeason", "IsWeekend", "VehicleEncoded"]]
y = df["TotalPriceLKR"]

In [25]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=100,
    shuffle=True
)

print("Train:", x_train.shape)
print("Test :", x_test.shape)

Train: (3776, 5)
Test : (945, 5)


In [26]:
results = []

def add_result(name, y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae  = mean_absolute_error(y_true, y_pred)
    r2   = r2_score(y_true, y_pred)
    results.append([name, rmse, mae, r2])
    print(f"{name} -> RMSE: {rmse:.2f} | MAE: {mae:.2f} | R2: {r2:.4f}")

In [27]:
#Linear Regression
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(x_train, y_train)

pred_lr = lr.predict(x_test)
add_result("LinearRegression", y_test, pred_lr)

LinearRegression -> RMSE: 5592.43 | MAE: 3767.84 | R2: 0.4747


In [28]:
#Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state=100)
dt.fit(x_train, y_train)

pred_dt = dt.predict(x_test)
add_result("DecisionTree", y_test, pred_dt)

DecisionTree -> RMSE: 4817.61 | MAE: 3226.44 | R2: 0.6102


In [29]:
#Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=300, random_state=100)
rf.fit(x_train, y_train)

pred_rf = rf.predict(x_test)
add_result("RandomForest", y_test, pred_rf)

RandomForest -> RMSE: 4822.37 | MAE: 3224.38 | R2: 0.6094


In [30]:
#KNN Regressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

knn = Pipeline([
    ("scaler", StandardScaler()),
    ("model", KNeighborsRegressor(n_neighbors=5))
])

knn.fit(x_train, y_train)

pred_knn = knn.predict(x_test)
add_result("KNN", y_test, pred_knn)

KNN -> RMSE: 5157.44 | MAE: 3385.08 | R2: 0.5533


In [31]:
#Support Vector Regressor
from sklearn.svm import SVR

svm = Pipeline([
    ("scaler", StandardScaler()),
    ("model", SVR())
])

svm.fit(x_train, y_train)

pred_svm = svm.predict(x_test)
add_result("SVR", y_test, pred_svm)

SVR -> RMSE: 7901.84 | MAE: 4987.81 | R2: -0.0487


In [32]:

results_df = pd.DataFrame(results, columns=["Model", "RMSE", "MAE", "R2"]).sort_values("R2", ascending=False)
print(results_df)

best_name = results_df.iloc[0]["Model"]
print("\nBest model is -", best_name)

model_map = {
    "LinearRegression": lr,
    "DecisionTree": dt,
    "RandomForest": rf,
    "KNN": knn,
    "SVR": svm
}
best_model = model_map[best_name]

              Model         RMSE          MAE        R2
1      DecisionTree  4817.608036  3226.440557  0.610192
2      RandomForest  4822.370013  3224.382606  0.609422
3               KNN  5157.435686  3385.079365  0.553260
0  LinearRegression  5592.425538  3767.840107  0.474724
4               SVR  7901.844457  4987.809418 -0.048683

Best model is - DecisionTree


In [33]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(best_model, x_train, y_train, cv=5, scoring="r2", n_jobs=-1)

print("CV R2 Scores:", cv_scores)
print("Mean CV R2:", cv_scores.mean())
print("Std  CV R2:", cv_scores.std())

CV R2 Scores: [0.60098698 0.57000264 0.6608636  0.55134401 0.59830868]
Mean CV R2: 0.596301182152458
Std  CV R2: 0.037167332939509315


In [34]:
# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

tuned_model = best_model

if best_name == "RandomForest":
    grid = GridSearchCV(
        RandomForestRegressor(random_state=100),
        {
            "n_estimators": [200, 400, 600],
            "max_depth": [None, 6, 10],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4]
        },
        cv=5,
        scoring="r2",
        n_jobs=-1
    )
    grid.fit(x_train, y_train)
    tuned_model = grid.best_estimator_
    print("RF Best Params:", grid.best_params_)
    print("RF Best CV R2 :", grid.best_score_)

elif best_name == "DecisionTree":
    grid = GridSearchCV(
        DecisionTreeRegressor(random_state=100),
        {
            "max_depth": [None, 4, 6, 10],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4]
        },
        cv=5,
        scoring="r2",
        n_jobs=-1
    )
    grid.fit(x_train, y_train)
    tuned_model = grid.best_estimator_
    print("DT Best Params:", grid.best_params_)
    print("DT Best CV R2 :", grid.best_score_)

elif best_name == "KNN":
    grid = GridSearchCV(
        knn,
        {
            "model__n_neighbors": [3, 5, 7, 9, 11],
            "model__weights": ["uniform", "distance"]
        },
        cv=5,
        scoring="r2",
        n_jobs=-1
    )
    grid.fit(x_train, y_train)
    tuned_model = grid.best_estimator_
    print("KNN Best Params:", grid.best_params_)
    print("KNN Best CV R2 :", grid.best_score_)

elif best_name == "SVR":
    grid = GridSearchCV(
        svm,
        {
            "model__C": [0.1, 1, 10],
            "model__gamma": ["scale", "auto"],
            "model__epsilon": [0.1, 0.2, 0.5]
        },
        cv=5,
        scoring="r2",
        n_jobs=-1
    )
    grid.fit(x_train, y_train)
    tuned_model = grid.best_estimator_
    print("SVR Best Params:", grid.best_params_)
    print("SVR Best CV R2 :", grid.best_score_)

else:
    print("LinearRegression: no hyperparameter tuning required.")

DT Best Params: {'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 2}
DT Best CV R2 : 0.6239377355896536


In [35]:
final_pred = tuned_model.predict(x_test)

final_rmse = np.sqrt(mean_squared_error(y_test, final_pred))
final_mae  = mean_absolute_error(y_test, final_pred)
final_r2   = r2_score(y_test, final_pred)

print("FINAL RMSE:", final_rmse)
print("FINAL MAE :", final_mae)
print("FINAL R2  :", final_r2)

FINAL RMSE: 4743.973286156938
FINAL MAE : 3191.2378824109965
FINAL R2  : 0.6220174599814513


In [36]:
import joblib

bundle = {
    "model": tuned_model,
    "features": ["Year", "Month", "IsSeason", "IsWeekend", "VehicleEncoded"],
    "label_encoder": le_vehicle,
    "vehicle_classes": list(le_vehicle.classes_),
    "metrics": {"rmse": float(final_rmse), "mae": float(final_mae), "r2": float(final_r2)},
    "random_state": 100,
    "target": "TotalPriceLKR"
}

joblib.dump(bundle, "../models/best_vehicle_revenue_model.pkl")
print("Saved -> ../models/best_vehicle_revenue_model.pkl")

Saved -> ../models/best_vehicle_revenue_model.pkl
