In [0]:
import mlflow
mlflow.set_experiment("/Workspace/Users/keerthi.amulya.1999@gmail.com/Day-12")

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/517399182210612', creation_time=1768928607309, experiment_id='517399182210612', last_update_time=1769010275293, lifecycle_stage='active', name='/Users/keerthi.amulya.1999@gmail.com/Day-12', tags={'mlflow.experiment.sourceName': '/Users/keerthi.amulya.1999@gmail.com/Day-12',
 'mlflow.experimentType': 'NOTEBOOK',
 'mlflow.ownerEmail': 'keerthi.amulya.1999@gmail.com',
 'mlflow.ownerId': '73807756678194'}>

In [0]:
df_spark = spark.table("ecommerce.gold.products")
df_spark.printSchema()
df_spark.show(5)

root
 |-- product_id: integer (nullable = true)
 |-- brand: string (nullable = true)
 |-- views: long (nullable = true)
 |-- purchases: long (nullable = true)
 |-- revenue: double (nullable = true)
 |-- conversion_rate: double (nullable = true)

+----------+-------+-----+---------+------------------+-------------------+
|product_id|  brand|views|purchases|           revenue|    conversion_rate|
+----------+-------+-----+---------+------------------+-------------------+
|   8500290|   NULL|  357|       12|           4071.73|  3.361344537815126|
|   3300488|redmond| 1718|       38| 6847.049999999998|  2.211874272409779|
|  12704683| nokian|  733|       29|3121.4700000000003|  3.956343792633015|
|   5100799| garmin| 2450|        5|22594.309999999998|0.20408163265306123|
|   1004573|samsung| 3216|       50|39517.170000000006|  1.554726368159204|
+----------+-------+-----+---------+------------------+-------------------+
only showing top 5 rows


In [0]:
import numpy as np
from sklearn.model_selection import train_test_split

pdf = df_spark.select("views", "purchases", "revenue", "conversion_rate").toPandas()
pdf = pdf.fillna(0)
X = pdf[["views", "purchases", "conversion_rate"]]
y = pdf["revenue"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [0]:
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

models = {
    "linear": LinearRegression(),
    "decision_tree": DecisionTreeRegressor(max_depth=6, random_state=42),
    "random_forest": RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
}

results = []

for name, model in models.items():
    with mlflow.start_run(run_name=f"{name}_model"):
        mlflow.log_param("model_type", name)
        mlflow.log_param("features", "views,purchases,conversion_rate")
        mlflow.log_param("target", "revenue")

        # log hyperparams if available
        if hasattr(model, "get_params"):
            for k, v in model.get_params().items():
                if k in ["max_depth", "n_estimators", "min_samples_split", "min_samples_leaf"]:
                    mlflow.log_param(k, v)

        model.fit(X_train, y_train)
        pred = model.predict(X_test)

        r2 = r2_score(y_test, pred)
        rmse = np.sqrt(mean_squared_error(y_test, pred))

        mlflow.log_metric("r2", float(r2))
        mlflow.log_metric("rmse", float(rmse))

        mlflow.sklearn.log_model(model, "model")

        results.append((name, r2, rmse))
        print(f"{name}: R2={r2:.4f}, RMSE={rmse:.2f}")



linear: R2=0.6405, RMSE=43655.59




decision_tree: R2=0.9139, RMSE=21362.42




random_forest: R2=0.8177, RMSE=31091.45


In [0]:
rf_grid = [
    {"n_estimators": 200, "max_depth": 6},
    {"n_estimators": 300, "max_depth": 10},
    {"n_estimators": 500, "max_depth": None},
]

from sklearn.ensemble import RandomForestRegressor

for cfg in rf_grid:
    model = RandomForestRegressor(
        n_estimators=cfg["n_estimators"],
        max_depth=cfg["max_depth"],
        random_state=42,
        n_jobs=-1
    )

    with mlflow.start_run(run_name=f"rf_tune_{cfg['n_estimators']}_{cfg['max_depth']}"):
        mlflow.log_param("model_type", "random_forest")
        mlflow.log_param("features", "views,purchases,conversion_rate")
        mlflow.log_param("target", "revenue")
        mlflow.log_param("n_estimators", cfg["n_estimators"])
        mlflow.log_param("max_depth", str(cfg["max_depth"]))

        model.fit(X_train, y_train)
        pred = model.predict(X_test)

        r2 = r2_score(y_test, pred)
        rmse = np.sqrt(mean_squared_error(y_test, pred))

        mlflow.log_metric("r2", float(r2))
        mlflow.log_metric("rmse", float(rmse))
        mlflow.sklearn.log_model(model, "model")

        print(f"RF cfg={cfg} -> R2={r2:.4f}, RMSE={rmse:.2f}")



RF cfg={'n_estimators': 200, 'max_depth': 6} -> R2=0.8281, RMSE=30193.45




RF cfg={'n_estimators': 300, 'max_depth': 10} -> R2=0.8205, RMSE=30852.32




RF cfg={'n_estimators': 500, 'max_depth': None} -> R2=0.8242, RMSE=30532.22


In [0]:
import pandas as pd

best_rf = models["random_forest"]  
fi = pd.DataFrame({
    "feature": X.columns,
    "importance": best_rf.feature_importances_
}).sort_values("importance", ascending=False)

fi

Unnamed: 0,feature,importance
0,views,0.656576
1,purchases,0.22502
2,conversion_rate,0.118404


In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLR
from pyspark.ml.evaluation import RegressionEvaluator

spark_df = spark.table("ecommerce.gold.products") \
    .select("views","purchases","conversion_rate","revenue") \
    .na.fill(0)

train_s, test_s = spark_df.randomSplit([0.8, 0.2], seed=42)
assembler = VectorAssembler(
    inputCols=["views","purchases","conversion_rate"],
    outputCol="features"
)
lr = SparkLR(featuresCol="features", labelCol="revenue")
pipeline = Pipeline(stages=[assembler, lr])
spark_model = pipeline.fit(train_s)
pred_s = spark_model.transform(test_s)
pred_s.select("revenue","prediction").show(5)


+-------+-------------------+
|revenue|         prediction|
+-------+-------------------+
|  12.15|-3541.2404146436525|
|  17.37|-3541.2404146436525|
|  27.03|-3541.2404146436525|
| 116.06|-3541.2404146436525|
| 157.53|-3541.2404146436525|
+-------+-------------------+
only showing top 5 rows


In [0]:
with mlflow.start_run(run_name="spark_lr_pipeline"):
    mlflow.log_param("model_type", "spark_lr_pipeline")
    mlflow.log_param("features", "views,purchases,conversion_rate")
    mlflow.log_param("target", "revenue")
    mlflow.log_metric("r2", float(r2_s))
    mlflow.log_metric("rmse", float(rmse_s))


[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-7606715851453805>, line 5[0m
[1;32m      3[0m mlflow[38;5;241m.[39mlog_param([38;5;124m"[39m[38;5;124mfeatures[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mviews,purchases,conversion_rate[39m[38;5;124m"[39m)
[1;32m      4[0m mlflow[38;5;241m.[39mlog_param([38;5;124m"[39m[38;5;124mtarget[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mrevenue[39m[38;5;124m"[39m)
[0;32m----> 5[0m mlflow[38;5;241m.[39mlog_metric([38;5;124m"[39m[38;5;124mr2[39m[38;5;124m"[39m, [38;5;28mfloat[39m(r2_s))
[1;32m      6[0m mlflow[38;5;241m.[39mlog_metric([38;5;124m"[39m[38;5;124mrmse[39m[38;5;124m"[39m, [38;5;28mfloat[39m(rmse_s))

[0;31mNameError[0m: name 'r2_s' is not defined