In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [7]:
# Load the raw rental transactions (path is relative to the notebook's
# working directory) and strip stray whitespace from the header labels
# so later column lookups by name are exact.
df = pd.read_csv("../data/rentals_transactions_realistic.csv").rename(columns=str.strip)

print(f"Shape: {df.shape}")
df.head()

Shape: (4721, 12)


Unnamed: 0,RentalID,Date,Year,Month,IsSeason,IsWeekend,VehicleType,RentalDays,DailyPriceLKR,TotalPriceLKR,CustomerType,Notes
0,1,2023-01-01,2023,1,1,1,Bike,4,1650.0,6600.0,,
1,2,2023-01-01,2023,1,1,1,Car,1,6050.0,6050.0,,
2,3,2023-01-01,2023,1,1,1,Car,1,6050.0,6050.0,,
3,4,2023-01-01,2023,1,1,1,Tuk Tuk,4,3300.0,13200.0,,
4,5,2023-01-01,2023,1,1,1,Bike,2,1650.0,3300.0,,


In [8]:
# Schema check: cleaned column labels first, then dtypes and non-null counts.
column_index = df.columns
print(column_index)
df.info()

Index(['RentalID', 'Date', 'Year', 'Month', 'IsSeason', 'IsWeekend',
       'VehicleType', 'RentalDays', 'DailyPriceLKR', 'TotalPriceLKR',
       'CustomerType', 'Notes'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4721 entries, 0 to 4720
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   RentalID       4721 non-null   int64  
 1   Date           4721 non-null   object 
 2   Year           4721 non-null   int64  
 3   Month          4721 non-null   int64  
 4   IsSeason       4721 non-null   int64  
 5   IsWeekend      4721 non-null   int64  
 6   VehicleType    4721 non-null   object 
 7   RentalDays     4721 non-null   int64  
 8   DailyPriceLKR  4721 non-null   float64
 9   TotalPriceLKR  4721 non-null   float64
 10  CustomerType   29 non-null     object 
 11  Notes          11 non-null     object 
dtypes: float64(2), int64(6), object(4)
memory usage: 442.7+ KB


In [9]:
# Parse Date (unparseable values become NaT via errors="coerce"), then drop
# every row that is missing the date, the target, or the vehicle type.
required_cols = ["Date", "TotalPriceLKR", "VehicleType"]
df = (
    df.assign(Date=pd.to_datetime(df["Date"], errors="coerce"))
    .dropna(subset=required_cols)
    .copy()
)
df.head()

Unnamed: 0,RentalID,Date,Year,Month,IsSeason,IsWeekend,VehicleType,RentalDays,DailyPriceLKR,TotalPriceLKR,CustomerType,Notes
0,1,2023-01-01,2023,1,1,1,Bike,4,1650.0,6600.0,,
1,2,2023-01-01,2023,1,1,1,Car,1,6050.0,6050.0,,
2,3,2023-01-01,2023,1,1,1,Car,1,6050.0,6050.0,,
3,4,2023-01-01,2023,1,1,1,Tuk Tuk,4,3300.0,13200.0,,
4,5,2023-01-01,2023,1,1,1,Bike,2,1650.0,3300.0,,


In [None]:
# Encode VehicleType as integer codes.
# NOTE(review): the codes are later used as a plain numeric feature, so the
# linear / distance-based models see an arbitrary ordering of the vehicle
# classes. One-hot encoding may be more appropriate -- confirm before changing,
# because the fitted encoder is also stored in the saved model bundle.
from sklearn.preprocessing import LabelEncoder

le_vehicle = LabelEncoder()
vehicle_codes = le_vehicle.fit_transform(df["VehicleType"])
df["VehicleEncoded"] = vehicle_codes

print("Vehicle types:", le_vehicle.classes_.tolist())
df.loc[:, ["VehicleType", "VehicleEncoded"]].head()

Vehicle types: ['Bike', 'Car', 'Tuk Tuk']


Unnamed: 0,VehicleType,VehicleEncoded
0,Bike,0
1,Car,1
2,Car,1
3,Tuk Tuk,2
4,Bike,0


In [11]:
# Model inputs (calendar flags + encoded vehicle type) and regression target.
# NOTE(review): RentalDays and DailyPriceLKR are not used as features;
# presumably deliberate, since together they would determine the total -- confirm.
feature_cols = ["Year", "Month", "IsSeason", "IsWeekend", "VehicleEncoded"]
x = df[feature_cols]
y = df["TotalPriceLKR"]

In [12]:
# Hold out 20% of the rows as a test set; shuffled with a fixed seed so the
# split is reproducible across runs.
split_parts = train_test_split(x, y, test_size=0.2, random_state=100, shuffle=True)
x_train, x_test, y_train, y_test = split_parts

print(f"Train: {x_train.shape}")
print(f"Test : {x_test.shape}")

Train: (3776, 5)
Test : (945, 5)


In [13]:
# Accumulates one [name, rmse, mae, r2] row per evaluated model.
results = []

def add_result(name, y_true, y_pred):
    """Score predictions, record them under ``name``, and print a summary line.

    Parameters
    ----------
    name : str
        Model label; also the key used later to look the model up.
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Side effects
    ------------
    Stores ``[name, rmse, mae, r2]`` in the module-level ``results`` list.
    If a row with the same ``name`` already exists it is replaced, so
    re-running a model cell does not leave duplicate or stale rows in the
    comparison table.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Mutate in place (slice assignment) so any existing reference to
    # ``results`` keeps seeing the up-to-date rows.
    results[:] = [row for row in results if row[0] != name]
    results.append([name, rmse, mae, r2])

    print(f"{name} -> RMSE: {rmse:.2f} | MAE: {mae:.2f} | R2: {r2:.4f}")

In [None]:
# Linear Regression: least-squares baseline on the selected features.
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(x_train, y_train)  # fit() returns the estimator itself

pred_lr = lr.predict(x_test)
add_result("LinearRegression", y_test, pred_lr)

LinearRegression -> RMSE: 4908.81 | MAE: 3414.25 | R2: 0.2726


In [None]:
# Decision Tree Regressor: default (unconstrained) tree with a fixed seed.
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state=100).fit(x_train, y_train)

pred_dt = dt.predict(x_test)
add_result("DecisionTree", y_test, pred_dt)

DecisionTree -> RMSE: 3675.59 | MAE: 2623.83 | R2: 0.5922


In [None]:
# Random Forest Regressor: 300 trees, fixed seed for reproducibility.
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=300, random_state=100).fit(x_train, y_train)

pred_rf = rf.predict(x_test)
add_result("RandomForest", y_test, pred_rf)

RandomForest -> RMSE: 3678.28 | MAE: 2625.66 | R2: 0.5916


In [None]:
# KNN Regressor: features are standardised first because KNN is distance-based.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

# Step names ("scaler", "model") are referenced by the tuning grid
# as "model__<param>", so they must stay as they are.
knn_steps = [
    ("scaler", StandardScaler()),
    ("model", KNeighborsRegressor(n_neighbors=5)),
]
knn = Pipeline(knn_steps).fit(x_train, y_train)

pred_knn = knn.predict(x_test)
add_result("KNN", y_test, pred_knn)

KNN -> RMSE: 3879.92 | MAE: 2687.14 | R2: 0.5456


In [None]:
# Support Vector Regressor: standardised features + SVR with default settings.
# Pipeline and StandardScaler are imported here as well, so this cell does not
# raise NameError when it is run without the KNN cell having been executed.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Step names are referenced by the tuning grid as "model__<param>".
# NOTE(review): only the features are scaled; the target stays in raw LKR.
# The recorded run scored a negative R2 for this model -- presumably the
# default SVR settings are too small for that target scale; consider scaling
# y or widening the C / epsilon search before drawing conclusions.
svm = Pipeline([
    ("scaler", StandardScaler()),
    ("model", SVR())
])

svm.fit(x_train, y_train)

pred_svm = svm.predict(x_test)
add_result("SVR", y_test, pred_svm)

SVR -> RMSE: 5955.23 | MAE: 4009.41 | R2: -0.0705


In [None]:

# Rank all evaluated models by R2 (best first) and pick the top one.
# NOTE(review): the ranking uses scores computed on the test split, and the
# same split is used again for the final metrics, so the final numbers are
# likely optimistic -- consider selecting on a validation split or CV instead.
metric_columns = ["Model", "RMSE", "MAE", "R2"]
results_df = pd.DataFrame(results, columns=metric_columns)
results_df = results_df.sort_values("R2", ascending=False)
print(results_df)

best_name = results_df["Model"].iloc[0]
print("\nBest model is -", best_name)

# Label -> fitted estimator, using the same labels passed to add_result.
model_map = dict(
    LinearRegression=lr,
    DecisionTree=dt,
    RandomForest=rf,
    KNN=knn,
    SVR=svm,
)
best_model = model_map[best_name]

              Model         RMSE          MAE        R2
1      DecisionTree  3675.585006  2623.828183  0.592186
2      RandomForest  3678.280728  2625.656112  0.591588
3               KNN  3879.915248  2687.142857  0.545584
0  LinearRegression  4908.811768  3414.254492  0.272619
4               SVR  5955.233180  4009.410531 -0.070550

Best model is - DecisionTree


In [20]:
# 5-fold cross-validated R2 of the selected model, on the training split only.
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(best_model, x_train, y_train, cv=5, scoring="r2", n_jobs=-1)

cv_report = (
    ("CV R2 Scores:", cv_scores),
    ("Mean CV R2:", cv_scores.mean()),
    ("Std  CV R2:", cv_scores.std()),
)
for label, value in cv_report:
    print(label, value)

CV R2 Scores: [0.5718765  0.56607288 0.58659159 0.52002674 0.55861436]
Mean CV R2: 0.5606364141441489
Std  CV R2: 0.022283628520551553


In [None]:
# Hyperparameter Tuning
# Grid-search (5-fold CV, R2) the model family that won the comparison.
# Any name without an entry below (e.g. LinearRegression) is left untuned.
from sklearn.model_selection import GridSearchCV

# best_name -> (print prefix, estimator factory, parameter grid).
# Factories are lazy so that only the selected estimator is built or referenced.
search_specs = {
    "RandomForest": (
        "RF",
        lambda: RandomForestRegressor(random_state=100),
        {
            "n_estimators": [200, 400, 600],
            "max_depth": [None, 6, 10],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
        },
    ),
    "DecisionTree": (
        "DT",
        lambda: DecisionTreeRegressor(random_state=100),
        {
            "max_depth": [None, 4, 6, 10],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
        },
    ),
    "KNN": (
        "KNN",
        lambda: knn,
        {
            "model__n_neighbors": [3, 5, 7, 9, 11],
            "model__weights": ["uniform", "distance"],
        },
    ),
    "SVR": (
        "SVR",
        lambda: svm,
        {
            "model__C": [0.1, 1, 10],
            "model__gamma": ["scale", "auto"],
            "model__epsilon": [0.1, 0.2, 0.5],
        },
    ),
}

# Default: keep the already-fitted best model if no search applies.
tuned_model = best_model
spec = search_specs.get(best_name)

if spec is None:
    print("LinearRegression: no hyperparameter tuning required.")
else:
    tag, make_estimator, param_grid = spec
    grid = GridSearchCV(
        make_estimator(),
        param_grid,
        cv=5,
        scoring="r2",
        n_jobs=-1,
    )
    grid.fit(x_train, y_train)
    tuned_model = grid.best_estimator_
    print(f"{tag} Best Params:", grid.best_params_)
    print(f"{tag} Best CV R2 :", grid.best_score_)

DT Best Params: {'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 2}
DT Best CV R2 : 0.5874145151598593


In [22]:
# Score the tuned model on the held-out test split.
final_pred = tuned_model.predict(x_test)

final_mse = mean_squared_error(y_test, final_pred)
final_rmse = np.sqrt(final_mse)
final_mae = mean_absolute_error(y_test, final_pred)
final_r2 = r2_score(y_test, final_pred)

final_report = (
    ("FINAL RMSE:", final_rmse),
    ("FINAL MAE :", final_mae),
    ("FINAL R2  :", final_r2),
)
for label, value in final_report:
    print(label, value)

FINAL RMSE: 3549.543368539971
FINAL MAE : 2621.4053304139206
FINAL R2  : 0.6196755438343526


In [None]:
# Persist the tuned model together with everything needed to reuse it:
# feature order, the fitted vehicle encoder, and the test metrics.
import joblib
from pathlib import Path

MODEL_PATH = Path("../models/best_vehicle_revenue_model.pkl")

bundle = {
    "model": tuned_model,
    # Taken from the training frame so the stored feature order cannot
    # drift from what the model was actually fitted on.
    "features": list(x.columns),
    "label_encoder": le_vehicle,
    "vehicle_classes": list(le_vehicle.classes_),
    "metrics": {"rmse": float(final_rmse), "mae": float(final_mae), "r2": float(final_r2)},
    "random_state": 100,
    "target": "TotalPriceLKR"
}

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(bundle, MODEL_PATH)
print("Saved ->", MODEL_PATH.as_posix())

Saved -> ../models/best_vehicle_revenue_model.pkl


: 