# Revenue Training Notebook
This notebook will contain the ML model training code for revenue prediction.

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [50]:
df_txn = pd.read_csv("../data/transactions_regenerated_ROUNDED.csv")

print("Raw Shape:", df_txn.shape)

df_txn["Date"] = pd.to_datetime(df_txn["Date"]).dt.normalize()

df = (
    df_txn.groupby("Date", as_index=False)["RevenueLKR"]
    .sum()
    .sort_values("Date")
    .reset_index(drop=True)
)

print("Daily Shape:", df.shape)
df.head()

Raw Shape: (4721, 12)


KeyError: 'Column not found: RevenueLKR'

In [None]:
# Missing values summary
df.isna().sum()

# Optional: percent missing
(df.isna().mean() * 100).round(2)

Date          0.0
RevenueLKR    0.0
IsWeekend     0.0
Year          0.0
Month         0.0
IsSeason      0.0
dtype: float64

In [None]:
#Change date
df["Date"] = pd.to_datetime(df["Date"])

# Create time features
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
df["IsWeekend"] = (df["Date"].dt.weekday >= 5).astype(int)

# Season: Oct–Mar
season_months = [10,11,12,1,2,3]
df["IsSeason"] = df["Month"].isin(season_months).astype(int)

df.head()

Unnamed: 0,Date,RevenueLKR,IsWeekend,Year,Month,IsSeason
0,2023-01-01,100500,1,2023,1,1
1,2023-01-02,60500,0,2023,1,1
2,2023-01-03,43500,0,2023,1,1
3,2023-01-04,79000,0,2023,1,1
4,2023-01-05,36000,0,2023,1,1


In [None]:
x = df[["Year", "Month", "IsSeason", "IsWeekend"]]
y = df["RevenueLKR"]

In [None]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=100,
    shuffle=True
)

print("Train:", x_train.shape)
print("Test :", x_test.shape)

Train: (854, 4)
Test : (214, 4)


In [None]:
#Linear Regression
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(x_train, y_train)

y_pred_lr = lr.predict(x_test)

rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
mae_lr  = mean_absolute_error(y_test, y_pred_lr)
r2_lr   = r2_score(y_test, y_pred_lr)

print("Linear Regression")
print("RMSE:", rmse_lr)
print("MAE :", mae_lr)
print("R2  :", r2_lr)

Linear Regression
RMSE: 23309.472776198214
MAE : 16379.464290418187
R2  : 0.5641285529536744


In [None]:
#Decision Tree
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state=100)
dt.fit(x_train, y_train)

y_pred_dt = dt.predict(x_test)

rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt))
mae_dt  = mean_absolute_error(y_test, y_pred_dt)
r2_dt   = r2_score(y_test, y_pred_dt)

print("Decision Tree")
print("RMSE:", rmse_dt)
print("MAE :", mae_dt)
print("R2  :", r2_dt)

Decision Tree
RMSE: 21305.793000218517
MAE : 14342.230836218732
R2  : 0.635842779413989


In [None]:
#Random Forest
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=300,
    random_state=100
)

rf.fit(x_train, y_train)

y_pred_rf = rf.predict(x_test)

rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
mae_rf  = mean_absolute_error(y_test, y_pred_rf)
r2_rf   = r2_score(y_test, y_pred_rf)

print("Random Forest")
print("RMSE:", rmse_rf)
print("MAE :", mae_rf)
print("R2  :", r2_rf)

Random Forest
RMSE: 21265.280324169264
MAE : 14323.08855888586
R2  : 0.6372263427678198


In [None]:
#KNN Regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

knn = Pipeline([
    ("scaler", StandardScaler()),
    ("model", KNeighborsRegressor(n_neighbors=5))
])

knn.fit(x_train, y_train)

y_pred_knn = knn.predict(x_test)

rmse_knn = np.sqrt(mean_squared_error(y_test, y_pred_knn))
mae_knn  = mean_absolute_error(y_test, y_pred_knn)
r2_knn   = r2_score(y_test, y_pred_knn)

print("KNN")
print("RMSE:", rmse_knn)
print("MAE :", mae_knn)
print("R2  :", r2_knn)

KNN
RMSE: 22448.595485070695
MAE : 14900.934579439252
R2  : 0.5957296713354975


In [None]:
#SVM Regression
from sklearn.svm import SVR

svm = Pipeline([
    ("scaler", StandardScaler()),
    ("model", SVR())
])

svm.fit(x_train, y_train)

y_pred_svm = svm.predict(x_test)

rmse_svm = np.sqrt(mean_squared_error(y_test, y_pred_svm))
mae_svm  = mean_absolute_error(y_test, y_pred_svm)
r2_svm   = r2_score(y_test, y_pred_svm)

print("SVM")
print("RMSE:", rmse_svm)
print("MAE :", mae_svm)
print("R2  :", r2_svm)

SVM
RMSE: 36404.78039745168
MAE : 24129.655236935567
R2  : -0.06318869585618403


In [None]:
results = pd.DataFrame({
    "Model": ["LinearRegression", "DecisionTree", "RandomForest", "KNN", "SVM"],
    "RMSE": [rmse_lr, rmse_dt, rmse_rf, rmse_knn, rmse_svm],
    "MAE":  [mae_lr,  mae_dt,  mae_rf,  mae_knn,  mae_svm],
    "R2":   [r2_lr,   r2_dt,   r2_rf,   r2_knn,   r2_svm]
}).sort_values("R2", ascending=False)

print(results)
best_name = results.iloc[0]["Model"]
print("\nBest model is -", best_name)

model_map = {
    "LinearRegression": lr,
    "DecisionTree": dt,
    "RandomForest": rf,
    "KNN": knn,
    "SVM": svm
}

best_model = model_map[best_name]

              Model          RMSE           MAE        R2
2      RandomForest  21265.280324  14323.088559  0.637226
1      DecisionTree  21305.793000  14342.230836  0.635843
3               KNN  22448.595485  14900.934579  0.595730
0  LinearRegression  23309.472776  16379.464290  0.564129
4               SVM  36404.780397  24129.655237 -0.063189

Best model is - RandomForest


In [None]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(
    best_model,
    x_train,
    y_train,
    cv=5,
    scoring="r2"
)

print("Cross Validation (R2) Scores:", cv_scores)
print("Mean CV R2:", cv_scores.mean())
print("Std CV R2 :", cv_scores.std())

Cross Validation (R2) Scores: [0.62096582 0.69332887 0.64825743 0.53618584 0.61787913]
Mean CV R2: 0.6233234178416207
Std CV R2 : 0.05128474145399407


In [None]:
from sklearn.model_selection import GridSearchCV

tuned_model = best_model  # default (if no tuning grid)

if best_name == "RandomForest":
    param_grid = {
        "n_estimators": [200, 400, 600],
        "max_depth": [None, 5, 10],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }

    grid = GridSearchCV(
        estimator=rf.__class__(random_state=100),
        param_grid=param_grid,
        cv=5,
        scoring="r2",
        n_jobs=-1
    )
    grid.fit(x_train, y_train)
    tuned_model = grid.best_estimator_

    print("✅ Tuned RandomForest")
    print("Best Params:", grid.best_params_)
    print("Best CV R2 :", grid.best_score_)

elif best_name == "DecisionTree":
    param_grid = {
        "max_depth": [None, 3, 5, 10],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }

    grid = GridSearchCV(
        estimator=dt.__class__(random_state=100),
        param_grid=param_grid,
        cv=5,
        scoring="r2",
        n_jobs=-1
    )
    grid.fit(x_train, y_train)
    tuned_model = grid.best_estimator_

    print("✅ Tuned DecisionTree")
    print("Best Params:", grid.best_params_)
    print("Best CV R2 :", grid.best_score_)

elif best_name == "KNN":
    # Pipeline tuning: must use step name "model__"
    param_grid = {
        "model__n_neighbors": [3, 5, 7, 9, 11],
        "model__weights": ["uniform", "distance"]
    }

    grid = GridSearchCV(
        estimator=knn,
        param_grid=param_grid,
        cv=5,
        scoring="r2",
        n_jobs=-1
    )
    grid.fit(x_train, y_train)
    tuned_model = grid.best_estimator_

    print("✅ Tuned KNN")
    print("Best Params:", grid.best_params_)
    print("Best CV R2 :", grid.best_score_)

elif best_name == "SVM":
    # Pipeline tuning: must use step name "model__"
    param_grid = {
        "model__C": [0.1, 1, 10],
        "model__gamma": ["scale", "auto"],
        "model__epsilon": [0.1, 0.2, 0.5]
    }

    grid = GridSearchCV(
        estimator=svm,
        param_grid=param_grid,
        cv=5,
        scoring="r2",
        n_jobs=-1
    )
    grid.fit(x_train, y_train)
    tuned_model = grid.best_estimator_

    print("✅ Tuned SVM")
    print("Best Params:", grid.best_params_)
    print("Best CV R2 :", grid.best_score_)

else:
    print("No tuning grid for LinearRegression (kept as is).")

best_model_final = tuned_model

✅ Tuned RandomForest
Best Params: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Best CV R2 : 0.6468442611478649


In [None]:
final_pred = best_model_final.predict(x_test)

final_rmse = np.sqrt(mean_squared_error(y_test, final_pred))
final_mae  = mean_absolute_error(y_test, final_pred)
final_r2   = r2_score(y_test, final_pred)

print("\n✅ FINAL MODEL:", best_name)
print("FINAL RMSE:", final_rmse)
print("FINAL MAE :", final_mae)
print("FINAL R2  :", final_r2)


✅ FINAL MODEL: RandomForest
FINAL RMSE: 20404.112108971833
FINAL MAE : 13452.047040525282
FINAL R2  : 0.6660134933446524


In [None]:
import joblib

bundle = {
    "model": best_model_final,
    "features": ["Year", "Month", "IsSeason", "IsWeekend"],
    "metrics": {
        "rmse": float(final_rmse),
        "mae": float(final_mae),
        "r2": float(final_r2)
    },
    "random_state": 100,
    "split": "train_test_split 80/20"
}

joblib.dump(bundle, "../models/best_revenue_model.pkl")
print("✅ Saved -> ../models/best_revenue_model.pkl")

✅ Saved -> ../models/best_revenue_model.pkl
