<a href="https://colab.research.google.com/github/LAworkspace/retail-recommender-MLOPS/blob/main/2_model_training_rf_MLflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import polars as pl

# Location of the enriched events parquet file (absolute Colab path).
EVENTS_PATH = "/content/enriched_events_with_conversion (1).parquet"

# Read the whole file into a polars DataFrame.
events = pl.read_parquet(EVENTS_PATH)

# Sanity check: first five rows, then the column -> dtype mapping.
print(events.head(5))
print(events.schema)


shape: (5, 14)
┌─────────────┬───────────┬───────┬────────┬───┬────────────┬────────────┬────────────┬────────────┐
│ timestamp   ┆ visitorid ┆ event ┆ itemid ┆ … ┆ views_last ┆ cart_conve ┆ purchase_c ┆ label_inte │
│ ---         ┆ ---       ┆ ---   ┆ ---    ┆   ┆ _24h       ┆ rsion_rate ┆ onversion_ ┆ rested     │
│ datetime[μs ┆ i64       ┆ str   ┆ i64    ┆   ┆ ---        ┆ ---        ┆ rate       ┆ ---        │
│ ]           ┆           ┆       ┆        ┆   ┆ u32        ┆ f64        ┆ ---        ┆ i8         │
│             ┆           ┆       ┆        ┆   ┆            ┆            ┆ f64        ┆            │
╞═════════════╪═══════════╪═══════╪════════╪═══╪════════════╪════════════╪════════════╪════════════╡
│ 2015-09-11  ┆ 0         ┆ view  ┆ 285930 ┆ … ┆ 0          ┆ 0.0        ┆ 0.0        ┆ 0          │
│ 20:49:49.43 ┆           ┆       ┆        ┆   ┆            ┆            ┆            ┆            │
│ 9           ┆           ┆       ┆        ┆   ┆            ┆            ┆  

In [2]:
# One aggregate row per (visitor, session, item) triple.
group_keys = ["visitorid", "sessionid", "itemid"]

# Building blocks for the aggregations below.
is_view = pl.col("event") == "view"
is_cart = pl.col("event") == "cart"
timestamp_span = pl.col("timestamp").max() - pl.col("timestamp").min()

agg = (
    events.group_by(group_keys)
    .agg([
        # Number of rows in the group whose event is "view" / "cart".
        is_view.cast(pl.Int8).sum().alias("view_count"),
        is_cart.cast(pl.Int8).sum().alias("added_to_cart"),
        # Time between the earliest and latest event in the group, in ms.
        timestamp_span.dt.cast_time_unit("ms").alias("dwell_time_ms"),
    ])
    # Convert dwell time to seconds (stored as Int32), then drop the ms column.
    .with_columns((pl.col("dwell_time_ms") / 1000).cast(pl.Int32).alias("dwell_time_secs"))
    .drop("dwell_time_ms")
)

# Attach the group-level features to every original event row.
model_data = events.join(agg, on=group_keys, how="left")


In [3]:
# Add hour_of_day and day_of_week, both derived from the event timestamp.
# polars `dt.weekday()` is ISO-style: Monday = 1 ... Sunday = 7.
model_data = model_data.with_columns([
    pl.col("timestamp").dt.hour().alias("hour_of_day"),
    pl.col("timestamp").dt.weekday().alias("day_of_week"),
])

# Keep only the chronologically last row per item-session.
# `unique(keep="last")` keeps whichever row comes last in the frame, so the
# frame is sorted by timestamp first (stable sort, so ties keep their existing
# relative order). `maintain_order=True` makes the resulting row order
# deterministic; without it the seeded train/test split downstream is not
# reproducible from run to run.
model_data = (
    model_data
    .sort("timestamp", maintain_order=True)
    .unique(
        subset=["visitorid", "sessionid", "itemid"],
        keep="last",
        maintain_order=True,
    )
)


In [4]:
# Column holding the binary label to predict.
target_col = "label_interested"

# Model inputs: two calendar features plus the per-(visitor, session, item)
# aggregates.
# NOTE(review): both models trained further down report 1.00 precision/recall/
# accuracy on the held-out set, which usually points to label leakage.
# How `label_interested` was derived is not visible in this notebook — confirm
# it is not a direct function of `added_to_cart` / `view_count`.
feature_cols = [
    "hour_of_day",
    "day_of_week",
    "view_count",
    "added_to_cart",
    "dwell_time_secs",
]

# scikit-learn and LightGBM are fed pandas objects, so convert once here.
df_model = model_data.select([*feature_cols, target_col]).to_pandas()
X, y = df_model[feature_cols], df_model[target_col]


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation. The split is seeded and stratified
# on the label so both partitions keep the same class ratio.
split_options = {"test_size": 0.25, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [6]:
pip install mlflow lightgbm scikit-learn


Collecting mlflow
  Downloading mlflow-3.1.4-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.4 (from mlflow)
  Downloading mlflow_skinny-3.1.4-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.4->mlflow)
  Downloading databricks_sdk-0.59.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.4->mlflow)
  Downloading opentelemetry_api-1.35.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-sdk<3,>=1.9.0 (from mlflow-skinny==3.1.4->mlflow)
  Downloading opentele

In [7]:
import mlflow
import mlflow.sklearn
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import time

# All runs logged below are grouped under this MLflow experiment.
EXPERIMENT_NAME = "Product Interest Prediction"
mlflow.set_experiment(EXPERIMENT_NAME)

def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data and print a summary.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels aligned with ``X_test``.

    Returns:
        The accuracy computed by ``accuracy_score`` for the predictions.

    Side effects:
        Prints the accuracy (4 decimal places) followed by the
        ``classification_report`` text.
    """
    predictions = model.predict(X_test)
    acc = accuracy_score(y_test, predictions)

    print(f"\nAccuracy: {acc:.4f}")
    report = classification_report(y_test, predictions)
    print(report)

    return acc

def run_and_log_model(model_name, model, params):
    """Train ``model``, evaluate it, and record everything in one MLflow run.

    Opens an MLflow run named ``model_name``, fits the estimator on the
    notebook-level ``X_train`` / ``y_train``, scores it on ``X_test`` /
    ``y_test`` through ``evaluate_model``, and logs the hyper-parameters, the
    accuracy, the fit duration and the fitted model.

    Args:
        model_name: Name of the MLflow run. Its lower-cased form is passed as
            the second positional argument to ``mlflow.sklearn.log_model``.
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        params: Dict of hyper-parameters to log. It is only logged here; it is
            not applied to ``model``.

    Returns:
        None.

    Note:
        The train/test partitions are read from module-level globals, so the
        split cell must have been executed before calling this function.
    """
    with mlflow.start_run(run_name=model_name):
        # Log the configuration first so a run whose fit fails still records
        # which hyper-parameters were attempted.
        mlflow.log_params(params)

        # perf_counter() is monotonic, whereas time.time() follows the wall
        # clock and can jump if the system time is adjusted mid-fit.
        fit_started = time.perf_counter()
        model.fit(X_train, y_train)
        fit_seconds = time.perf_counter() - fit_started

        # Evaluate on the held-out split (also prints the report).
        acc = evaluate_model(model, X_test, y_test)

        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("training_time_secs", fit_seconds)

        # Persist the fitted estimator as a run artifact.
        mlflow.sklearn.log_model(model, model_name.lower())

        print(f"✅ {model_name} logged successfully.\n")

# Hyper-parameters for each candidate. The same dicts are used to build the
# estimators and are logged to MLflow.
lgb_params = dict(learning_rate=0.05, num_leaves=31, n_estimators=300)
rf_params = dict(n_estimators=100, max_depth=10, random_state=42)

# Unfitted estimators; fitting happens inside run_and_log_model.
lgb_model = lgb.LGBMClassifier(**lgb_params)
rf_model = RandomForestClassifier(**rf_params)

# Train, evaluate and log each candidate in its own MLflow run
# (LightGBM first, then Random Forest).
for run_name, estimator, hyperparams in (
    ("LightGBM", lgb_model, lgb_params),
    ("RandomForest", rf_model, rf_params),
):
    run_and_log_model(run_name, estimator, hyperparams)


2025/07/23 10:05:03 INFO mlflow.tracking.fluent: Experiment with name 'Product Interest Prediction' does not exist. Creating a new experiment.


[LightGBM] [Info] Number of positive: 200677, number of negative: 1560212
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.193527 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 305
[LightGBM] [Info] Number of data points in the train set: 1760889, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.113963 -> initscore=-2.050880
[LightGBM] [Info] Start training from score -2.050880

Accuracy: 1.0000




              precision    recall  f1-score   support

           0       1.00      1.00      1.00    520072
           1       1.00      1.00      1.00     66892

    accuracy                           1.00    586964
   macro avg       1.00      1.00      1.00    586964
weighted avg       1.00      1.00      1.00    586964





✅ LightGBM logged successfully.


Accuracy: 1.0000




              precision    recall  f1-score   support

           0       1.00      1.00      1.00    520072
           1       1.00      1.00      1.00     66892

    accuracy                           1.00    586964
   macro avg       1.00      1.00      1.00    586964
weighted avg       1.00      1.00      1.00    586964





✅ RandomForest logged successfully.



In [8]:
!pip install -q mlflow pyngrok


In [9]:
import os
from getpass import getpass

from pyngrok import ngrok

# Never store the ngrok authtoken in the notebook: it ends up in version
# control and in every shared copy. Read it from the NGROK_AUTHTOKEN
# environment variable, or prompt for it without echoing.
# NOTE: any token that was ever committed in this notebook must be treated as
# leaked and revoked/rotated in the ngrok dashboard.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
if not ngrok_authtoken:
    raise ValueError("An ngrok authtoken is required to open the tunnel.")

# Register the token with the ngrok agent managed by pyngrok.
ngrok.set_auth_token(ngrok_authtoken)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [10]:
import mlflow
import os
from pyngrok import ngrok

# Local address at which the MLflow UI is expected to listen.
mlflow_port = 5000
mlflow_uri = f"http://127.0.0.1:{mlflow_port}"

# Export both settings to this process's environment in a single step.
# NOTE(review): the tracking URI set here is an HTTP endpoint, not a local
# file store, and it is set after the training runs were already logged.
# NOTE(review): MLFLOW_ARTIFACT_URI does not appear to be one of MLflow's
# documented environment variables — confirm it has any effect.
os.environ.update(
    MLFLOW_TRACKING_URI=mlflow_uri,
    MLFLOW_ARTIFACT_URI="/content/mlruns",
)

# Open an ngrok tunnel to the MLflow port and print its public address.
# NOTE(review): the UI process is only launched in a later cell, so this URL
# has nothing behind it until that cell has run.
public_url = ngrok.connect(mlflow_port)
print(f"📡 MLflow UI is available at: {public_url.public_url}")


📡 MLflow UI is available at: https://da6aa61b3562.ngrok-free.app


In [11]:
# Start the MLflow UI as a background shell process. The port comes from
# `mlflow_port` (defined with the tracking URI) so the server and the ngrok
# tunnel cannot drift apart.
get_ipython().system_raw(f"mlflow ui --port {mlflow_port} &")


In [12]:
from pyngrok import ngrok

# Expose the MLflow UI through ngrok, using the shared `mlflow_port` constant
# instead of repeating the literal port number.
# NOTE(review): a tunnel to the same port is already opened in an earlier cell
# (before the UI was started), so this creates a second public URL — consider
# keeping only one of the two.
public_url = ngrok.connect(mlflow_port)
print(f"📡 MLflow UI is available at: {public_url.public_url}")


📡 MLflow UI is available at: https://e2e60ec3aeed.ngrok-free.app
