In [0]:
%sql
-- Preview five rows of ecommerce.gold.products.
-- There is no ORDER BY, so which five rows come back is not guaranteed.
SELECT *
FROM ecommerce.gold.products
LIMIT 5;

product_id,brand,views,purchases,revenue,conversion_rate
8500290,,357,12,4071.73,3.361344537815126
3300488,redmond,1718,38,6847.049999999998,2.211874272409779
12704683,nokian,733,29,3121.4700000000003,3.956343792633015
5100799,garmin,2450,5,22594.31,0.2040816326530612
1004573,samsung,3216,50,39517.170000000006,1.554726368159204


In [0]:
# Open the products table through the Spark session, then print its
# column types and a five-row sample.
df_spark = spark.table("ecommerce.gold.products")
df_spark.printSchema()
df_spark.show(n=5)

root
 |-- product_id: integer (nullable = true)
 |-- brand: string (nullable = true)
 |-- views: long (nullable = true)
 |-- purchases: long (nullable = true)
 |-- revenue: double (nullable = true)
 |-- conversion_rate: double (nullable = true)

+----------+-------+-----+---------+------------------+-------------------+
|product_id|  brand|views|purchases|           revenue|    conversion_rate|
+----------+-------+-----+---------+------------------+-------------------+
|   8500290|   NULL|  357|       12|           4071.73|  3.361344537815126|
|   3300488|redmond| 1718|       38| 6847.049999999998|  2.211874272409779|
|  12704683| nokian|  733|       29|3121.4700000000003|  3.956343792633015|
|   5100799| garmin| 2450|        5|22594.309999999998|0.20408163265306123|
|   1004573|samsung| 3216|       50|39517.170000000006|  1.554726368159204|
+----------+-------+-----+---------+------------------+-------------------+
only showing top 5 rows


In [0]:
# Keep only the numeric modelling columns and collect them into pandas.
# toPandas() brings every selected row to the driver.
numeric_cols = ["views", "purchases", "revenue", "conversion_rate"]
pdf = df_spark.select(*numeric_cols).toPandas()

In [0]:
# Replace missing values with 0 before modelling, then preview the first rows.
pdf = pdf.fillna(value=0)
pdf.head(n=5)

Unnamed: 0,views,purchases,revenue,conversion_rate
0,357,12,4071.73,3.361345
1,1718,38,6847.05,2.211874
2,733,29,3121.47,3.956344
3,2450,5,22594.31,0.204082
4,3216,50,39517.17,1.554726


In [0]:
# Predictors and regression target for the baseline models.
feature_cols = ["views", "purchases", "conversion_rate"]
X = pdf[feature_cols]
y = pdf["revenue"]

In [0]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline model: linear regression fitted on the training rows.
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
pred = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=pred)
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=pred))
mae = mean_absolute_error(y_true=y_test, y_pred=pred)

(r2, rmse, mae)

(0.6405423569176119, np.float64(43655.5908412909), 4633.991748139074)

In [0]:
import mlflow
import mlflow.sklearn

# Select the MLflow experiment that the runs below are logged to.
mlflow.set_experiment(
    experiment_name="/Workspace/Users/keerthi.amulya.1999@gmail.com/Day-12"
)

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/517399182210612', creation_time=1768928607309, experiment_id='517399182210612', last_update_time=1768928607309, lifecycle_stage='active', name='/Users/keerthi.amulya.1999@gmail.com/Day-12', tags={'mlflow.experiment.sourceName': '/Users/keerthi.amulya.1999@gmail.com/Day-12',
 'mlflow.experimentType': 'NOTEBOOK',
 'mlflow.ownerEmail': 'keerthi.amulya.1999@gmail.com',
 'mlflow.ownerId': '73807756678194'}>

In [0]:
# Record the linear-regression baseline fitted above as an MLflow run.
# The param/metric keys ("model_name", "r2", "rmse", "mae") match the ones
# train_and_log writes, so every run in this experiment lands in the same
# columns and can be compared side by side.
with mlflow.start_run(run_name="linreg_revenue_v1"):
    # Params
    mlflow.log_param("model_name", "LinearRegression")
    mlflow.log_param("target", "revenue")
    mlflow.log_param("features", "views,purchases,conversion_rate")
    mlflow.log_param("test_size", 0.2)
    mlflow.log_param("random_state", 42)

    # Metrics (cast to plain float: rmse is a numpy scalar)
    mlflow.log_metric("r2", float(r2))
    mlflow.log_metric("rmse", float(rmse))
    mlflow.log_metric("mae", float(mae))

    # Model
    mlflow.sklearn.log_model(model, artifact_path="model")

print(f"R2={r2:.4f}, RMSE={rmse:.2f}, MAE={mae:.2f}")



R2=0.6405, RMSE=43655.59, MAE=4633.99


In [0]:
def train_and_log(model, run_name, extra_params=None):
    """Fit ``model`` on the train split, score it on the test split, and log the run to MLflow.

    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``,
    so the train/test split cell must have been executed first.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        run_name: Name given to the MLflow run and used as the label of the
            printed summary line.
        extra_params: Optional mapping of additional params to log on the run.

    Returns:
        Tuple ``(r2, rmse, mae)`` computed on the test split.
    """
    if extra_params is None:
        extra_params = {}

    # Take the feature list from the data actually used for fitting so the
    # logged param cannot drift from the real inputs; fall back to the original
    # literal when X_train carries no column labels.
    columns = getattr(X_train, "columns", None)
    if columns is not None:
        feature_names = ",".join(str(col) for col in columns)
    else:
        feature_names = "views,purchases,conversion_rate"

    with mlflow.start_run(run_name=run_name):
        # Params. test_size / random_state mirror the values passed to
        # train_test_split earlier in the notebook; keep them in sync by hand.
        mlflow.log_params(
            {
                "model_name": model.__class__.__name__,
                "features": feature_names,
                "target": "revenue",
                "test_size": 0.2,
                "random_state": 42,
            }
        )

        for key, value in extra_params.items():
            mlflow.log_param(key, value)

        # Train
        model.fit(X_train, y_train)

        # Predict + Metrics
        pred = model.predict(X_test)
        r2 = r2_score(y_test, pred)
        rmse = np.sqrt(mean_squared_error(y_test, pred))
        mae = mean_absolute_error(y_test, pred)

        # Cast to plain float: rmse is a numpy scalar.
        mlflow.log_metric("r2", float(r2))
        mlflow.log_metric("rmse", float(rmse))
        mlflow.log_metric("mae", float(mae))

        # Log model
        mlflow.sklearn.log_model(model, "model")

    print(f"{run_name:<30} | R2={r2:.4f}  RMSE={rmse:.2f}  MAE={mae:.2f}")
    return r2, rmse, mae

In [0]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 200 trees; the tree count is also logged as an extra run param.
rf_model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
train_and_log(
    model=rf_model,
    run_name="random_forest_200",
    extra_params={"n_estimators": 200},
)



random_forest_200              | R2=0.8177  RMSE=31091.45  MAE=943.04


(0.8176734252150363, np.float64(31091.451384508437), 943.0427303195402)

In [0]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with only the seed set explicitly.
gb_model = GradientBoostingRegressor(random_state=42)
train_and_log(model=gb_model, run_name="03_gradient_boosting")



03_gradient_boosting           | R2=0.9026  RMSE=22726.16  MAE=907.54


(0.9025863079723264, np.float64(22726.157441088533), 907.5398742542875)

### With a little feature engineering

In [0]:
# Average revenue per purchase; rows with no purchases are set to 0.0.
# NOTE(review): avg_price is computed from revenue, which is the prediction
# target in the cells below — using it as a feature leaks the target.
has_purchases = pdf["purchases"] > 0
pdf["avg_price"] = np.where(has_purchases, pdf["revenue"] / pdf["purchases"], 0.0)

In [0]:
# Feature matrix extended with the engineered avg_price column; the target is unchanged.
# NOTE(review): avg_price = revenue / purchases, so purchases * avg_price
# reproduces the target for every row with purchases > 0 (target leakage).
# Confirm this is intended before trusting scores computed on these features.
feature_cols = ["views", "purchases", "conversion_rate", "avg_price"]
X = pdf[feature_cols]
y = pdf["revenue"]

(X.head(), y.head())

(   views  purchases  conversion_rate    avg_price
 0    357         12         3.361345   339.310833
 1   1718         38         2.211874   180.185526
 2    733         29         3.956344   107.636897
 3   2450          5         0.204082  4518.862000
 4   3216         50         1.554726   790.343400,
 0     4071.73
 1     6847.05
 2     3121.47
 3    22594.31
 4    39517.17
 Name: revenue, dtype: float64)

In [0]:
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

# Model
gbr = GradientBoostingRegressor(random_state=42)

# CV setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate both metrics in a single cross-validation pass. Calling
# cross_val_score once per metric would refit the model on the same five folds
# twice; cross_validate fits each fold once and scores it with every scorer.
cv_results = cross_validate(
    gbr,
    X,
    y,
    cv=kf,
    scoring={"r2": "r2", "neg_mse": "neg_mean_squared_error"},
)

# R2 per fold
r2_scores = cv_results["test_r2"]

# RMSE per fold (sklearn reports loss metrics negated, hence the sign flip)
rmse_scores = np.sqrt(-cv_results["test_neg_mse"])

r2_scores, rmse_scores

(array([0.96532224, 0.9067878 , 0.80872162, 0.88886974, 0.9366649 ]),
 array([13559.43326316, 19660.7666302 , 12967.41622225, 14846.32407507,
        16024.25995361]))

In [0]:
# Summarise the per-fold scores as mean ± standard deviation (numpy default, ddof=0).
r2_mean = float(np.mean(r2_scores))
r2_std = float(np.std(r2_scores))
rmse_mean = float(np.mean(rmse_scores))
rmse_std = float(np.std(rmse_scores))

print(f"CV R2:   {r2_mean:.4f} ± {r2_std:.4f}")
print(f"CV RMSE: {rmse_mean:.2f} ± {rmse_std:.2f}")


CV R2:   0.9013 ± 0.0531
CV RMSE: 15411.64 ± 2372.97
