# Revenue Training Notebook
This notebook will contain the ML model training code for revenue prediction.

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [19]:
df = pd.read_csv("../data/rentals_daily_realistic.csv")

print("Shape:", df.shape)
df.head()

Shape: (1068, 6)


Unnamed: 0,Date,RevenueLKR,IsSeason,IsWeekend,Year,Month
0,1/1/2023,38500,1,1,2023,1
1,1/2/2023,52800,1,0,2023,1
2,1/3/2023,77550,1,0,2023,1
3,1/4/2023,108350,1,0,2023,1
4,1/5/2023,53350,1,0,2023,1


In [20]:
# Missing values summary
df.isna().sum()

# Optional: percent missing
(df.isna().mean() * 100).round(2)

Date          0.0
RevenueLKR    0.0
IsSeason      0.0
IsWeekend     0.0
Year          0.0
Month         0.0
dtype: float64

In [22]:
#Change date
df["Date"] = pd.to_datetime(df["Date"])

# Create time features
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
df["IsWeekend"] = (df["Date"].dt.weekday >= 5).astype(int)

# Season: Oct–Mar
season_months = [10,11,12,1,2,3]
df["IsSeason"] = df["Month"].isin(season_months).astype(int)

df.head()

Unnamed: 0,Date,RevenueLKR,IsSeason,IsWeekend,Year,Month
0,2023-01-01,38500,1,1,2023,1
1,2023-01-02,52800,1,0,2023,1
2,2023-01-03,77550,1,0,2023,1
3,2023-01-04,108350,1,0,2023,1
4,2023-01-05,53350,1,0,2023,1


In [23]:
x = df[["Year", "Month", "IsSeason", "IsWeekend"]]
y = df["RevenueLKR"]

In [24]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=100,
    shuffle=True
)

print("Train:", x_train.shape)
print("Test :", x_test.shape)

Train: (854, 4)
Test : (214, 4)


In [25]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(x_train, y_train)

y_pred_lr = lr.predict(x_test)

rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
mae_lr  = mean_absolute_error(y_test, y_pred_lr)
r2_lr   = r2_score(y_test, y_pred_lr)

print("Linear Regression")
print("RMSE:", rmse_lr)
print("MAE :", mae_lr)
print("R2  :", r2_lr)

Linear Regression
RMSE: 17073.693456088204
MAE : 12240.509903826218
R2  : 0.5344745665043173


In [26]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state=100)
dt.fit(x_train, y_train)

y_pred_dt = dt.predict(x_test)

rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt))
mae_dt  = mean_absolute_error(y_test, y_pred_dt)
r2_dt   = r2_score(y_test, y_pred_dt)

print("Decision Tree")
print("RMSE:", rmse_dt)
print("MAE :", mae_dt)
print("R2  :", r2_dt)

Decision Tree
RMSE: 18076.58302824887
MAE : 12681.102904577965
R2  : 0.47817950282445687


In [27]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=300,
    random_state=100
)

rf.fit(x_train, y_train)

y_pred_rf = rf.predict(x_test)

rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
mae_rf  = mean_absolute_error(y_test, y_pred_rf)
r2_rf   = r2_score(y_test, y_pred_rf)

print("Random Forest")
print("RMSE:", rmse_rf)
print("MAE :", mae_rf)
print("R2  :", r2_rf)

Random Forest
RMSE: 18034.521375018074
MAE : 12645.557867974865
R2  : 0.4806050826017664


In [28]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

knn = Pipeline([
    ("scaler", StandardScaler()),
    ("model", KNeighborsRegressor(n_neighbors=5))
])

knn.fit(x_train, y_train)

y_pred_knn = knn.predict(x_test)

rmse_knn = np.sqrt(mean_squared_error(y_test, y_pred_knn))
mae_knn  = mean_absolute_error(y_test, y_pred_knn)
r2_knn   = r2_score(y_test, y_pred_knn)

print("KNN")
print("RMSE:", rmse_knn)
print("MAE :", mae_knn)
print("R2  :", r2_knn)

KNN
RMSE: 19644.181131735895
MAE : 13712.14953271028
R2  : 0.38375087794631313


In [29]:
from sklearn.svm import SVR

svm = Pipeline([
    ("scaler", StandardScaler()),
    ("model", SVR())
])

svm.fit(x_train, y_train)

y_pred_svm = svm.predict(x_test)

rmse_svm = np.sqrt(mean_squared_error(y_test, y_pred_svm))
mae_svm  = mean_absolute_error(y_test, y_pred_svm)
r2_svm   = r2_score(y_test, y_pred_svm)

print("SVM")
print("RMSE:", rmse_svm)
print("MAE :", mae_svm)
print("R2  :", r2_svm)

SVM
RMSE: 25966.35627776393
MAE : 19495.99819004119
R2  : -0.07673908247525918


In [34]:
results = pd.DataFrame({
    "Model": ["LinearRegression", "DecisionTree", "RandomForest", "KNN", "SVM"],
    "RMSE": [rmse_lr, rmse_dt, rmse_rf, rmse_knn, rmse_svm],
    "MAE":  [mae_lr,  mae_dt,  mae_rf,  mae_knn,  mae_svm],
    "R2":   [r2_lr,   r2_dt,   r2_rf,   r2_knn,   r2_svm]
}).sort_values("R2", ascending=False)

print(results)
best_name = results.iloc[0]["Model"]
print("\nBest model is -", best_name)

model_map = {
    "LinearRegression": lr,
    "DecisionTree": dt,
    "RandomForest": rf,
    "KNN": knn,
    "SVM": svm
}

best_model = model_map[best_name]

              Model          RMSE           MAE        R2
0  LinearRegression  17073.693456  12240.509904  0.534475
2      RandomForest  18034.521375  12645.557868  0.480605
1      DecisionTree  18076.583028  12681.102905  0.478180
3               KNN  19644.181132  13712.149533  0.383751
4               SVM  25966.356278  19495.998190 -0.076739

Best model is - LinearRegression


In [35]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(
    best_model,
    x_train,
    y_train,
    cv=5,
    scoring="r2"
)

print("Cross Validation (R2) Scores:", cv_scores)
print("Mean CV R2:", cv_scores.mean())
print("Std CV R2 :", cv_scores.std())

Cross Validation (R2) Scores: [0.48182851 0.43150189 0.49613202 0.39835999 0.45689054]
Mean CV R2: 0.4529425892377027
Std CV R2 : 0.03508353671678032


In [36]:
from sklearn.model_selection import GridSearchCV

tuned_model = best_model  # default (if no tuning grid)

if best_name == "RandomForest":
    param_grid = {
        "n_estimators": [200, 400, 600],
        "max_depth": [None, 5, 10],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }

    grid = GridSearchCV(
        estimator=rf.__class__(random_state=100),
        param_grid=param_grid,
        cv=5,
        scoring="r2",
        n_jobs=-1
    )
    grid.fit(x_train, y_train)
    tuned_model = grid.best_estimator_

    print("✅ Tuned RandomForest")
    print("Best Params:", grid.best_params_)
    print("Best CV R2 :", grid.best_score_)

elif best_name == "DecisionTree":
    param_grid = {
        "max_depth": [None, 3, 5, 10],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }

    grid = GridSearchCV(
        estimator=dt.__class__(random_state=100),
        param_grid=param_grid,
        cv=5,
        scoring="r2",
        n_jobs=-1
    )
    grid.fit(x_train, y_train)
    tuned_model = grid.best_estimator_

    print("✅ Tuned DecisionTree")
    print("Best Params:", grid.best_params_)
    print("Best CV R2 :", grid.best_score_)

elif best_name == "KNN":
    # Pipeline tuning: must use step name "model__"
    param_grid = {
        "model__n_neighbors": [3, 5, 7, 9, 11],
        "model__weights": ["uniform", "distance"]
    }

    grid = GridSearchCV(
        estimator=knn,
        param_grid=param_grid,
        cv=5,
        scoring="r2",
        n_jobs=-1
    )
    grid.fit(x_train, y_train)
    tuned_model = grid.best_estimator_

    print("✅ Tuned KNN")
    print("Best Params:", grid.best_params_)
    print("Best CV R2 :", grid.best_score_)

elif best_name == "SVM":
    # Pipeline tuning: must use step name "model__"
    param_grid = {
        "model__C": [0.1, 1, 10],
        "model__gamma": ["scale", "auto"],
        "model__epsilon": [0.1, 0.2, 0.5]
    }

    grid = GridSearchCV(
        estimator=svm,
        param_grid=param_grid,
        cv=5,
        scoring="r2",
        n_jobs=-1
    )
    grid.fit(x_train, y_train)
    tuned_model = grid.best_estimator_

    print("✅ Tuned SVM")
    print("Best Params:", grid.best_params_)
    print("Best CV R2 :", grid.best_score_)

else:
    print("No tuning grid for LinearRegression (kept as is).")

best_model_final = tuned_model

No tuning grid for LinearRegression (kept as is).


In [37]:
final_pred = best_model_final.predict(x_test)

final_rmse = np.sqrt(mean_squared_error(y_test, final_pred))
final_mae  = mean_absolute_error(y_test, final_pred)
final_r2   = r2_score(y_test, final_pred)

print("\n✅ FINAL MODEL:", best_name)
print("FINAL RMSE:", final_rmse)
print("FINAL MAE :", final_mae)
print("FINAL R2  :", final_r2)


✅ FINAL MODEL: LinearRegression
FINAL RMSE: 17073.693456088204
FINAL MAE : 12240.509903826218
FINAL R2  : 0.5344745665043173


In [38]:
import joblib

bundle = {
    "model": best_model_final,
    "features": ["Year", "Month", "IsSeason", "IsWeekend"],
    "metrics": {
        "rmse": float(final_rmse),
        "mae": float(final_mae),
        "r2": float(final_r2)
    },
    "random_state": 100,
    "split": "train_test_split 80/20"
}

joblib.dump(bundle, "../models/best_revenue_model.pkl")
print("✅ Saved -> ../models/best_revenue_model.pkl")

✅ Saved -> ../models/best_revenue_model.pkl
