In [0]:
# importing packages
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

In [0]:
# Prepare data: load the gold products table into pandas, drop rows without a
# target value, one-hot encode the brand, and hold out 20% of rows for testing.

# Fixed seed so the train/test split (and therefore every score computed from
# it) is identical on each Restart & Run All.
SPLIT_SEED = 42

# `spark` is not defined in this notebook; it is expected to be the session
# object injected by the notebook runtime.
df = spark.table("ecommercetest.gold.products").toPandas()

# Rows with no revenue cannot serve as regression targets.
df_clean = df.dropna(subset=["revenue"])

# Features: `views` plus one indicator column per brand value.
# dummy_na=True adds an extra indicator column for missing brands instead of
# encoding them as all-zero rows.
X = pd.get_dummies(df_clean[["views", "brand"]], columns=["brand"], dummy_na=True)

y = df_clean["revenue"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

In [0]:
# Train three scikit-learn regressors on the same split and record each one as
# its own MLflow run (model type, R² on the held-out set, fitted model).

# Both tree-based estimators are randomized (feature/sample selection), so
# they get a fixed seed; without it the logged R² changes on every run.
MODEL_SEED = 42

models = {
    "linear": LinearRegression(),
    "decision_tree": DecisionTreeRegressor(max_depth=5, random_state=MODEL_SEED),
    "random_forest": RandomForestRegressor(
        n_estimators=5, max_depth=5, random_state=MODEL_SEED
    ),
}

for name, model in models.items():
    # One run per candidate so the models can be compared side by side.
    with mlflow.start_run(run_name=f"{name}_model"):
        mlflow.log_param("model_type", name)

        model.fit(X_train, y_train)
        # For scikit-learn regressors, .score() returns the coefficient of
        # determination (R²) on the data passed in.
        score = model.score(X_test, y_test)

        mlflow.log_metric("r2_score", score)
        mlflow.sklearn.log_model(model, "model")

        print(f"{name}: R² = {score:.4f}")



linear: R² = 0.0875




decision_tree: R² = 0.0858




random_forest: R² = 0.1089


In [0]:
# Spark ML Pipeline: index the brand column, assemble a feature vector and fit
# a Spark linear regression that predicts revenue from views and brand.
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLR
from pyspark.sql import functions as F

# Fixed seed so randomSplit assigns rows to train/test the same way each run.
# NOTE(review): Spark only guarantees a repeatable split when the input
# DataFrame is evaluated identically each time; cache or sort `spark_df` if
# the split still varies.
SPARK_SPLIT_SEED = 42

# Number of most frequent brands kept for training (see filter below).
TOP_BRAND_COUNT = 2

# Maps each brand string to a numeric index. handleInvalid="keep" sends labels
# not seen during fit to an extra index instead of raising an error.
brand_indexer = StringIndexer(
    inputCol="brand",
    outputCol="brand_idx",
    handleInvalid="keep"
)

# Feature vector = [views, brand_idx]. The brand index is fed to the model as a
# plain number; with only two brands kept it acts as a binary flag. If
# TOP_BRAND_COUNT is raised, one-hot encode brand_idx first so the regression
# does not treat the arbitrary index order as a numeric scale.
assembler = VectorAssembler(
    inputCols=["views", "brand_idx"],
    outputCol="features"
)

# Regression target is the revenue column.
lr = SparkLR(
    featuresCol="features",
    labelCol="revenue"
)

pipeline = Pipeline(stages=[brand_indexer, assembler, lr])

spark_df = spark.table("ecommercetest.gold.products")
spark_df = spark_df.filter(spark_df["revenue"].isNotNull())

# Keep only top 2 brands to avoid model size overflow.
# The secondary sort on brand makes the selection deterministic when two
# brands have the same row count.
brand_counts = (
    spark_df.groupBy("brand")
    .count()
    .orderBy(F.desc("count"), F.asc("brand"))
    .limit(TOP_BRAND_COUNT)
)
top_brands = [row["brand"] for row in brand_counts.collect()]
spark_df = spark_df.filter(spark_df["brand"].isin(top_brands))

train, test = spark_df.randomSplit([0.8, 0.2], seed=SPARK_SPLIT_SEED)
model = pipeline.fit(train)